In [13]:
import os
import nltk
import re
import numpy as np
import copy
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import torch.utils.tensorboard
from torch.utils.tensorboard import SummaryWriter
from torch.autograd import Variable
device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [14]:
nltk.download('punkt')

documents = {}
labels = {}
p = re.compile(r"(\b[-']\b)|[\W_]")

#directories
dir_valeria = "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"
dir_anna = "/Users/annasotnikova/Downloads/Hulth2003/Training"


# finction opening the data, splitting it using "."
def open_data(directory):
    for file in os.listdir(directory):
        if file.endswith(".abstr"):
            content = open(("%s/%s" % (directory, file)), "r").read()
            tmp=[]
            for s in content.split('.'):
                tmp.append( p.sub(lambda m: (m.group(1) if m.group(1) else " "), s) )
            documents[file.split('.')[0]] = tmp

    # open labels        
    # open training labels        
    for file in os.listdir(directory):
        if file.endswith(".uncontr"):
            content = open(("%s/%s" % (directory, file)), "r").read()
            labels[file.split('.')[0]] = content.split("; ")
            
    return documents, labels

# tokenization function
def tokenize(doc):
    tokenized_doc = {}
    for num, ctt in doc.items():
        tokenized_doc[num] = []
        for sentence in ctt:
            words = nltk.word_tokenize(sentence.lower())
            tokenized_doc[num].append(words)
    return tokenized_doc
    

[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
documents, labels = open_data(dir_valeria)
tokenized_documents = tokenize(documents)
tokenized_labels = tokenize(labels)

In [16]:
# Here we preprocess labels: we label each word in each sentence with 2 if it is not a key-phrase word, 
# with 0 if it is a first word in key-phrase and 1 if it is not a first word in key-phrase.  

# function finds index of element in list of lists
def index(lab, target):
    for i,phr in enumerate(lab):
        for j, w in enumerate(phr):
            if w == target:
                return (j)
    return (None, None)


def tag_labels(tokenized_documents, tokenized_labels):
    # create dictionary of labels associated to words
    class_labels = copy.deepcopy(tokenized_documents)
    for document in tokenized_documents:
        # take one document
        text = tokenized_documents[document]
        lab = tokenized_labels[document]
        lab_flattened = [val for sublist in lab for val in sublist]
        for i, sentence in enumerate(text): 
            for j, word in enumerate(sentence): 
                is_keyphrase = word in lab_flattened
                if is_keyphrase:
                    if index(tokenized_labels[document], word) == 0:
                        class_labels[document][i][j] = 1
                    else:
                        class_labels[document][i][j] = 2
                else:
                    class_labels[document][i][j] = 0
    return class_labels

In [17]:
class_labels = tag_labels(tokenized_documents, tokenized_labels)

In [18]:
lab_flattened =  [val for sublist in class_labels.values() for val in sublist] 
lab_flattened  = [val for sublist in lab_flattened for val in sublist] 
print('zero', lab_flattened.count(0))
print('one', lab_flattened.count(1))
print('two', lab_flattened.count(2))

zero 97169
one 11942
two 20053


# GLOVE embeddings

In [19]:
# here we download pretrained glove embeddings
import numpy as np
embeddings = dict()
embed_size = 100
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()

glove_size = 100

In [20]:
# our data
X = [sent for doc in copy.deepcopy(tokenized_documents).values() for sent in doc if sent!= []]
y = [sent for doc in copy.deepcopy(class_labels).values() for sent in doc if sent!=[]]

# our vocab: all the words in all abstracts
target_vocab = list(set([item for sublist in X for item in sublist]))
# dictionary with all words and their indices
vocab_ind_dict = dict(zip(target_vocab, range(0, len(target_vocab)))) 


# create matrix of glove vectors + random vectors for the words which are in vocab but not in glove

matrix_len = len(target_vocab)
weights_glove = np.zeros((matrix_len, glove_size))

for i, word in enumerate(target_vocab):
    try: 
        weights_glove[i] = embeddings[word]
    except KeyError:
        weights_glove[i] = np.random.normal(scale=0.6, size=(glove_size, ))

# replace words in our data with their indices
for i, sent in enumerate(X):
    for j, word in enumerate(sent):
        X[i][j] = vocab_ind_dict[word]

# BiLSTM Net

In [None]:
class LSTM(nn.Module):
    def __init__(self, nb_layers = 1, nb_lstm_units=150, nb_lin_units=150, embedding_dim=100, batch_size=1):
        super(LSTM, self).__init__()
        
        self.vocab = target_vocab
        self.tags = {'first': 1, 'inside': 2, 'no': 0}
        self.nb_lstm_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.nb_lin_units = nb_lin_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.nb_tags = len(self.tags)
        nb_vocab_words = len(self.vocab)

        
        # Embedding layer
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim)
        self.word_embedding.load_state_dict({'weight': torch.Tensor(weights_glove)})
        self.word_embedding.weight.requires_grad = True

        # LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_lstm_layers,
            batch_first=True, bidirectional=True)
        
        # dropout layer
        self.drop = torch.nn.Dropout(p=0.25)

        # linear layers
        self.lin1 = nn.Linear(2*self.nb_lstm_units, self.nb_lin_units)
        self.lin2 = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(2, self.batch_size, self.nb_lstm_units).to(device)
        hidden_b = torch.randn(2, self.batch_size, self.nb_lstm_units).to(device)
        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X):
        
        self.hidden = self.init_hidden()
        X = self.word_embedding(X)
        batch_size, seq_len, _ = X.size()

        X, self.hidden = self.lstm(X, self.hidden)
        X = self.drop(X)
        X = X.contiguous()
        X = X.view(-1, X.shape[2])

        X = F.relu(self.lin1(X))
        X = F.relu(self.lin2(X))

        X = X.view(batch_size, self.nb_tags, -1)

        return X

In [77]:
net = LSTM().to(device)
weight = torch.tensor([1/50, 1, 1/2]).to(device)
criterion= torch.nn.CrossEntropyLoss(weight=weight)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
tb = SummaryWriter()

In [78]:
# training function should be here 
# Something like 

def train(epoch):
    train_loss = 0.0
    total = 0
    correct = 0
    for i, sent in enumerate(X):
        inputs = torch.LongTensor(sent).view([1, len(sent)]).to(device)
        labels = torch.LongTensor(y[i]).view([1, len(y[i])]).to(device)     
        # zero the parameter gradients
        optimizer.zero_grad()
        # compute predictions
        outputs = (net(inputs))
        # compute loss
        loss = criterion(outputs, labels)
        # compute gradients
        loss.backward()
        # update parameters
        optimizer.step()
        # statistics to display
        train_loss += loss.item()
        _,predicted = outputs.max(1)
        total += labels[0].size(0)
        correct += predicted.eq(labels).sum().item()
        if i ==0:
            print(predicted)
            print(outputs)
            print(labels)

    acc = 100.*correct/total
    print('Train accuracy:', acc)
    print('Loss:', train_loss)
    tb.add_scalar('Loss', train_loss, epoch)
    tb.add_scalar('Accuracy', acc, epoch)
    return(train_loss)


def main(num_epochs):
    for epoch in range(num_epochs):
        print('\nEpoch: %d' % epoch)
        train_loss = train(epoch)
    tb.close()
        

In [79]:
main(20)


Epoch: 0
tensor([[0, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 0, 2, 0, 1, 1, 0, 1, 2, 0, 1, 2,
         0, 1, 2, 0, 0, 2, 0, 1, 2, 0, 1, 1, 0, 1]], device='cuda:0')
tensor([[[0.0636, 0.0403, 0.0000, 0.0606, 0.0106, 0.0000, 0.0412, 0.0185,
          0.0000, 0.0479, 0.0000, 0.0000, 0.0478, 0.0238, 0.0000, 0.0703,
          0.0105, 0.0000, 0.0470, 0.0000, 0.0000, 0.0415, 0.0017, 0.0000,
          0.0639, 0.0033, 0.0000, 0.0266, 0.0114, 0.0000, 0.0358, 0.0000,
          0.0000, 0.0321, 0.0140, 0.0000, 0.0350, 0.0153],
         [0.0000, 0.0000, 0.0051, 0.0000, 0.0304, 0.0294, 0.0000, 0.0470,
          0.0308, 0.0000, 0.0344, 0.0182, 0.0000, 0.0169, 0.0047, 0.0000,
          0.0472, 0.0430, 0.0000, 0.0416, 0.0190, 0.0000, 0.0492, 0.0034,
          0.0000, 0.0303, 0.0145, 0.0000, 0.0098, 0.0290, 0.0000, 0.0078,
          0.0000, 0.0000, 0.0214, 0.0156, 0.0000, 0.0371],
         [0.0204, 0.0000, 0.0363, 0.0000, 0.0000, 0.0563, 0.0000, 0.0000,
          0.0348, 0.0271, 0.0000, 0.0361, 0.0000, 0.000

Train accuracy: 27.671797095165836
Loss: 6073.353940103203

Epoch: 6
tensor([[1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2,
         1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 2]], device='cuda:0')
tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0266, 0.1899, 0.0000, 0.1672, 0.3815, 0.0000, 0.0000,
          0.1486, 0.0000, 0.0469, 0.0684, 0.0000, 0.0777, 0.2433, 0.0000,
          0.4922, 0.6348, 0.0000, 0.6890, 0.5647, 0.0000],
         [0.6737, 0.6052, 0.0000, 0.3091, 0.3682, 0.0000, 0.4158, 0.5069,
          0.0000, 0.4653, 0.4116, 0.0000, 0.3353, 0.4445, 0.0000, 0.5578,
          0.3563, 0.0000, 0.5856, 0.5028, 0.0000, 0.4397, 0.3667, 0.0000,
          0.8243, 0.4575, 0.0000, 0.6735, 0.7884, 0.0000, 0.8623, 0.6721,
          0.0000, 0.8348, 0.7168, 0.0000, 0.5055, 0.8208],
         [0.0000, 0.6990, 0.6798, 0.0000, 0.3753, 0.7778, 0.0000, 0.29

Train accuracy: 19.039360812610326
Loss: 5694.4297565100715

Epoch: 12
tensor([[1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2]], device='cuda:0')
tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0763, 0.0847, 0.1395,
          0.4546, 0.3366, 0.3934, 0.5233, 0.7346, 0.6971, 0.7654, 1.2398,
          1.2314, 1.2017, 1.2992, 1.2759, 1.2557, 1.1598, 1.1177, 1.1025,
          1.3087, 1.2168, 1.1485, 1.5698, 1.5264, 1.4072, 1.4940, 1.5133,
          1.5212, 1.7483, 1.6312, 1.5868, 1.6475, 1.7311],
         [1.8325, 1.6906, 1.5712, 1.5791, 1.7158, 1.6674, 1.5741, 2.0183,
          1.9329, 1.8975, 1.9660, 1.8789, 1.7805, 1.9081, 1.7371, 1.7503,
          1.8590, 1.7592, 1.6602, 1.9510, 1.8812, 1.8034, 2.0374, 2.0531,
          1.9890, 1.9203, 1.8437, 1.8757, 2.0685, 2.1647, 2.4248, 2.1123,
          2.0710, 2.1173, 2.1207, 2.0876, 2.1196, 1.8792],
         [1.7238, 1.7755, 2.0369, 1.9419, 1.9130, 1.8264, 1.7289, 1.

KeyboardInterrupt: 