In [1]:
import os
import nltk
import stringimport numpy as np
import copy
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import torch.utils.tensorboard
from torch.utils.tensorboard import SummaryWriter
from torch.autograd import Variable
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# Here we download the data and split it into words, still need to get rid of punctuation

nltk.download('punkt')

documents = {}
labels = {}

# open main data
for file in os.listdir("/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"):
    if file.endswith(".abstr"):
        content = open(("%s/%s" % ('/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training', file)), "r").read()
        documents[file.split('.')[0]] = content.split('. ')
        
# open labels        
for file in os.listdir("/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"):
    if file.endswith(".uncontr"):
        content = open(("%s/%s" % ('/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training', file)), "r").read()
        labels[file.split('.')[0]] = content.split("; ")       

[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# tokenize document
tokenized_documents = {}
for num, ctt in documents.items():
    tokenized_documents[num] = []
    for sentence in ctt:
        words = nltk.word_tokenize(sentence.lower())
        tokenized_documents[num].append(words)

# tokenize labels
tokenized_labels = {}
for num, ctt in labels.items():
    tokenized_labels[num] = []
    for sentence in ctt:
        words = nltk.word_tokenize(sentence.lower())
        tokenized_labels[num].append(words)        

In [4]:
# Here we preprocess labels: we label each word in each sentence with "no" if it is not a key-phrase word, 
# with "first" if it is a first word in key-phrase and "inside" if it is not a first word in key-phrase.  

# function finds index of element in list of lists
def index(lab, target):
    for i,phr in enumerate(lab):
        for j, w in enumerate(phr):
            if w == target:
                return (j)
    return (None, None)

# create dictionary of labels associated to words
class_labels = copy.deepcopy(tokenized_documents)
for document in tokenized_documents:
    text = tokenized_documents[document]
    lab = tokenized_labels[document]
    lab_flattened = [val for sublist in lab for val in sublist]
    for i, sentence in enumerate(text): 
        for j, word in enumerate(sentence): 
            is_keyphrase = word in lab_flattened
            if is_keyphrase:
                if index(tokenized_labels[document], word) == 0:
                    class_labels[document][i][j] = 1
                else:
                    class_labels[document][i][j] = 2
            else:
                class_labels[document][i][j] = 3

# GLOVE embeddings

In [5]:
# here we download pretrained glove embeddings
import numpy as np
embeddings = dict()
embed_size = 100
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()

# Data padding

In [6]:
# Here we pad our data: take maximum length of sentence (l) and if length of current sentence (n) is less => 
# we add word "PAD" l-n times to make all sentences having the same length

# our data
X = [sent for doc in copy.deepcopy(tokenized_documents).values() for sent in doc]
y = [sent for doc in copy.deepcopy(class_labels).values() for sent in doc]

# padding the data
def Padding(data, ext):
    len_max = len(max(X, key=lambda coll: len(coll)))
    for i, sentence in enumerate(data):
        len_sent = len(sentence)
        len_pad = len_max - len_sent
        sentence.extend([ext for i in range(len_pad)])
    return(data)

X_padded = Padding(X, 'PAD')
y_padded = Padding(y, 0)
X_lengths = len(X_padded[0])

In [7]:
# our vocab: all the words in all abstracts
target_vocab = set([item for sublist in X_padded for item in sublist])

# create matrix of glove vectors + random vectors for the words which are in vocab but not in glove
# this matrix is used in the embedding layer

matrix_len = len(target_vocab)
weights_glove = np.zeros((matrix_len, 100))

for i, word in enumerate(target_vocab):
    try: 
        weights_glove[i] = embeddings[word]
    except KeyError:
        weights_glove[i] = np.random.normal(scale=0.6, size=(100, ))

# find which vector corresponds to "PAD":
pad_idx = list(target_vocab).index('PAD')

In [8]:
# here we replace words with their indices
for i, sent in enumerate(X_padded):
    for j, word in enumerate(sent):
        X_padded[i][j] = list(target_vocab).index(word)

# BiLSTM Net

In [39]:
class LSTM(nn.Module):
    def __init__(self, nb_layers = 1, nb_lstm_units=150, nb_lin_units=150, embedding_dim=100, batch_size=1,
                 pad_idx = pad_idx):
        super(LSTM, self).__init__()
        
        self.vocab = target_vocab
        self.tags = {'PAD': 0, 'first': 1, 'inside': 2, 'no': 3}
        
        self.nb_lstm_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.nb_lin_units = nb_lin_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        # don't count the padding tag for the classifier output
        self.nb_tags = len(self.tags) - 1


        nb_vocab_words = len(self.vocab)

        # whenever the embedding sees the padding index it'll make the whole vector zeros
        padding_idx = pad_idx
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim,
            padding_idx=padding_idx
        )
        
        self.word_embedding.load_state_dict({'weight': torch.Tensor(weights_glove)})
        self.word_embedding.weight.requires_grad = False

        # design LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_lstm_layers,
            batch_first=True,
        )

        # output layer which projects back to tag space
        self.lin1 = nn.Linear(self.nb_lstm_units, self.nb_lin_units)
        self.lin2 = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(self.nb_lstm_layers, self.batch_size, self.nb_lstm_units).to(device)
        hidden_b = torch.randn(self.nb_lstm_layers, self.batch_size, self.nb_lstm_units).to(device)
        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X):
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        self.hidden = self.init_hidden()
        
        X_len = torch.LongTensor([43]).to(device) # need to compute  = list of sentence lengths before padding, for ex [8,4,6]

        # 1. embed the input
        X = self.word_embedding(X)
        batch_size, seq_len, _ = X.size()

        # 2. Run through RNN
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        X = torch.nn.utils.rnn.pack_padded_sequence(X, X_len, batch_first=True)

        # now run through LSTM
        X, self.hidden = self.lstm(X, self.hidden)

        # undo the packing operation
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        # 3. Project to tag space
        # this one is a bit tricky as well. First we need to reshape the data so it goes into the linear layer
        X = X.contiguous()
        X = X.view(-1, X.shape[2])

        # run through actual linear layer
        X = F.relu(self.lin1(X))
        X = F.relu(self.lin2(X))

        # I like to reshape for mental sanity so we're back to (batch_size, seq_len, nb_tags)
        X = X.view(batch_size, self.nb_tags, X_len)

        Y_hat = X
        return Y_hat

In [40]:
net = LSTM().to(device)
# here we define our loss function and optimizer
criterion= torch.nn.CrossEntropyLoss(ignore_index = pad_idx)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [41]:
# training function should be here 
# Something like 

def train(epoch):
    train_loss = 0.0
    total = 0
    correct = 0
    for i, sent in enumerate(X_padded):
        # get sequence of inputs and labels from the data
        inputs = torch.LongTensor(sent).view([1, 125]).to(device)
        labels = torch.LongTensor(y_padded[i]).view([1, 125]).to(device)     
        # zero the parameter gradients
        optimizer.zero_grad()
        # compute predictions
        outputs = (net(inputs))
        print(outputs)
        print(labels)
        # compute loss
        loss = criterion(outputs, labels)
        # compute gradients
        loss.backward()
        # update parameters
        optimizer.step()
        # statistics to display
        train_loss += loss.item()
        predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    acc = 100.*correct/total
    print('Train accuracy:', acc)
    tb.add_scalar('Loss', train_loss, epoch)
    tb.add_scalar('Accuracy', acc, epoch)
    print(net.conv1.bias.grad.norm())
    return(train_loss)


def main(num_epochs):
    for epoch in range(num_epochs):
        print('\nEpoch: %d' % epoch)
        train_loss = train(epoch)
        print(train_loss)
        test_accuracy = test()
    tb.close()
        

In [42]:
main(20)


Epoch: 0


RuntimeError: shape '[1, 3, 125]' is invalid for input of size 129