In [1]:
import os
import nltk
import re
import numpy as np
import copy
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
from nltk.tokenize import  word_tokenize
device = 'cuda' if torch.cuda.is_available() else 'cpu'



# OPEN AND TOKENIZE DATA

In [57]:
nltk.download('punkt')

documents = {}
labels = {}

#directories
dir_Tu= "/Users/kmirai/Downloads/NLPProject-master/Hulth2003/Training"

dir_valeria_train = "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"

dir_valeria_val = "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Validation"

dir_anna = "/Users/annasotnikova/Downloads/Hulth2003/Training"


# finction opening the data
def open_data(directory):
    documents = {}
    for file in os.listdir(directory):
        if file.endswith(".abstr"):
            content = open(("%s/%s" % (directory, file)), "r").read()
            content = re.sub('\s+',' ',content).lower()
            documents[file.split('.')[0]] = content
               
    for file in os.listdir(directory):
        if file.endswith(".uncontr"):
            content = open(("%s/%s" % (directory, file)), "r").read()
            content = re.sub('\s+',' ',content).lower()
            labels[file.split('.')[0]] = content.split("; ")
            
    return documents, labels

# tokenization function for documents
def tokenize_doc(doc):
    tokenized_doc = {}
    for num, ctt in doc.items():
        words = nltk.word_tokenize(ctt)
        tokenized_doc[num] = words
    return tokenized_doc

# tokenization function for labels
def tokenize_labels(doc):
    tokenized_doc = {}
    for num, ctt in doc.items():
        tokenized_doc[num] = []
        for phrase in ctt:
            words = nltk.word_tokenize(phrase)
            tokenized_doc[num].append(words)
    return tokenized_doc
    

[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
documents_train, labels_train = open_data(dir_valeria_train)
documents_val, labels_val = open_data(dir_valeria_val)

tokenized_documents_train = tokenize_doc(documents_train)
tokenized_labels_train = tokenize_labels(labels_train)

tokenized_documents_val = tokenize_doc(documents_val)
tokenized_labels_val = tokenize_labels(labels_val)


# CREATE LABEL TAGS

In [59]:
def tag_labels(tokenized_documents, tokenized_labels):
    # create dictionary of labels associated to words
    class_labels = copy.deepcopy(tokenized_documents)
    for key in tokenized_documents.keys():
        doc = tokenized_documents[key]
        kps = tokenized_labels[key]
        class_labels[key] = [0]*len(class_labels[key])
        # cycle over keyphrases
        for kp in kps:
            # find indices of keyphrases in text
            idx = [(i, i+len(kp)) for i in range(len(doc)-len(kp)+1) if doc[i:i+len(kp)] == kp]
            # replace labels with 1,2 
            for j in range(len(idx)):
                class_labels[key][idx[j][0]] = 1
                kp_len = idx[j][1]-idx[j][0]
                class_labels[key][idx[j][0]+1:idx[j][1]] = [2]*(kp_len-1)
    return class_labels

In [62]:
y_train = tag_labels(tokenized_documents_train, tokenized_labels_train)

In [63]:
lab_flattened =  [val for sublist in y_train.values() for val in sublist]
print('zero', lab_flattened.count(0))
print('one', lab_flattened.count(1))
print('two', lab_flattened.count(2))

zero 119852
one 9652
two 11510


# GLOVE embeddings

In [64]:
# here we download pretrained glove embeddings
import numpy as np
glove = dict()
embed_size = 100
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove[word] = coefs
f.close()

glove_size = 100

In [65]:
# Create vocabulary from all data 
X_train = [doc for doc in copy.deepcopy(tokenized_documents_train).values()]
y_train = [doc for doc in copy.deepcopy(y_train).values()]
X_val = [doc for doc in copy.deepcopy(tokenized_documents_val).values()]
X_full = X_train + X_val

# our vocab: all the words in all abstracts
target_vocab = list(set([token for doc in X_full for token in doc]))
# dictionary with all words and their indices
vocab_ind_dict = dict(zip(target_vocab, range(0, len(target_vocab)))) 


# create embedding matrix, initialize words which are not in glove with zeros 
matrix_len = len(target_vocab)
embed_matrix = np.zeros((matrix_len, glove_size))
for i, token in enumerate(target_vocab):
    try: 
        embed_matrix[i] = glove[token]
    except KeyError:
        embed_matrix[i] = np.zeros(shape=(glove_size))



# replace words in our data with their indices
for i, doc in enumerate(X_train):
    for j, token in enumerate(doc):
        X_train[i][j] = vocab_ind_dict[token]




# BiLSTM Net

In [66]:
class LSTM(nn.Module):
    def __init__(self, nb_layers = 1, nb_lstm_units=150, nb_lin_units=150, embedding_dim=100, batch_size=1):
        super(LSTM, self).__init__()
        
        self.vocab = target_vocab
        self.nb_lstm_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.nb_lin_units = nb_lin_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.nb_tags = 3
        nb_vocab_words = len(self.vocab)

        
        # Embedding layer
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim)
        self.word_embedding.load_state_dict({'weight': torch.Tensor(embed_matrix)})
        self.word_embedding.weight.requires_grad = False
        # LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_lstm_layers,
            batch_first=True, bidirectional=True)
        
        # dropout layer
        self.drop = torch.nn.Dropout(p=0.25)

        # linear layers
        self.lin1 = nn.Linear(2*self.nb_lstm_units, self.nb_lin_units)
        self.lin2 = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(2, self.batch_size, self.nb_lstm_units).to(device)
        hidden_b = torch.randn(2, self.batch_size, self.nb_lstm_units).to(device)
        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X):
        
        self.hidden = self.init_hidden()
        X = self.word_embedding(X)
        batch_size, seq_len, _ = X.size()

        X, self.hidden = self.lstm(X, self.hidden)
        #X = self.drop(X)
        X = X.contiguous()
        X = X.view(-1, X.shape[2])

        X = F.relu(self.lin1(X))
        X = F.softmax(self.lin2(X))

        X = X.view(batch_size, self.nb_tags, -1)

        return X

In [67]:
net = LSTM().to(device)
weight = torch.tensor([1/10, 1, 1]).to(device)
criterion= torch.nn.CrossEntropyLoss(weight)
optimizer = torch.optim.RMSprop(net.parameters(), lr=0.001)


In [68]:

def train(epoch):
    train_loss = 0.0
    total = 0
    correct = 0
    for i, sent in enumerate(X_train):
        inputs = torch.LongTensor(sent).view([1, len(sent)]).to(device)
        labels = torch.LongTensor(y_train[i]).view([1, len(y_train[i])]).to(device) 
        # zero the parameter gradients
        optimizer.zero_grad()
        # compute predictions
        outputs = (net(inputs))
        # compute loss
        loss = criterion(outputs, labels)
        # compute gradients
        loss.backward()
        # update parameters
        optimizer.step()
        # statistics to display
        train_loss += loss.item()
        _,predicted = outputs.max(1)
        total += labels[0].size(0)
        correct += predicted.eq(labels).sum().item()
        if i == 1:
            print(predicted)
            print(labels)
        

    acc = 100.*correct/total
    print('Train Accuracy:', acc)
    print('Train Loss:', train_loss)
    return(train_loss)
        
        
        

def main(num_epochs):
    for epoch in range(num_epochs):
        print('\nEpoch: %d' % epoch)
        train_loss = train(epoch)
    
    


In [69]:
main(300)


Epoch: 0
tensor([[0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 0, 1, 2, 0, 1, 0, 2, 1,
         0, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 0, 2, 1, 0, 2, 1,
         0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1,
         0, 2, 1, 0, 2, 1, 0, 2, 1, 1, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1,
         0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 1, 0, 2, 1, 0, 2, 0, 0, 2, 0, 2, 1,
         0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1,
         0]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,



Train Accuracy: 33.63708567943608
Train Loss: 1098.6409599781036

Epoch: 1
tensor([[2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 2, 0, 0, 1, 0, 2, 1, 0, 0, 1, 0,
         2, 1, 0, 2, 1, 0, 2, 1, 1, 2, 1, 1, 2, 2, 0, 2, 1, 0, 2, 1, 0, 2, 1, 1,
         2, 1, 0, 2, 2, 1, 2, 1, 0, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 0, 2, 1, 0,
         2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 2, 2, 0,
         2, 1, 0, 0, 1, 0, 2, 1, 0, 2, 1, 0, 2, 2, 0, 0, 2, 0, 2, 1, 0, 2, 1, 0,
         2, 1, 0, 2, 1, 1, 2, 2, 1, 2, 2, 0, 2, 2, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0,
         2]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

Train Accuracy: 37.48563972371537
Train Loss: 1022.2896387577057

Epoch: 9
tensor([[0, 1, 1, 2, 1, 2, 2, 0, 0, 2, 1, 0, 2, 1, 1, 2, 1, 2, 0, 1, 1, 2, 1, 1,
         2, 2, 1, 2, 1, 0, 2, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 1, 1, 0, 2, 0, 1, 1,
         0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 1, 0, 2, 1, 2, 0, 1, 1,
         0, 2, 2, 1, 0, 1, 2, 0, 2, 1, 0, 1, 2, 0, 1, 0, 0, 1, 2, 1, 0, 2, 1, 0,
         2, 1, 1, 2, 1, 0, 2, 1, 1, 2, 1, 1, 2, 0, 1, 2, 1, 1, 2, 0, 1, 2, 2, 2,
         0, 1, 1, 2, 0, 1, 2, 0, 1, 2, 1, 1, 2, 1, 0, 2, 1, 0, 1, 2, 1, 1, 1, 0,
         2]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

Train Accuracy: 41.338448664671596
Train Loss: 925.6544141769409

Epoch: 17
tensor([[0, 2, 2, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 2, 2, 0, 1, 0, 1, 0, 2, 1, 0,
         0, 1, 0, 0, 1, 0, 0, 1, 2, 0, 1, 1, 2, 1, 0, 0, 2, 1, 2, 2, 0, 1, 2, 1,
         0, 0, 2, 0, 1, 1, 0, 1, 1, 0, 1, 2, 0, 1, 0, 2, 1, 0, 2, 1, 1, 0, 0, 1,
         2, 0, 1, 1, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 1, 2, 0, 2, 2, 1, 0, 2, 1, 0,
         2, 1, 2, 2, 1, 0, 2, 1, 2, 1, 0, 1, 2, 0, 2, 0, 1, 1, 2, 0, 1, 2, 0, 2,
         0, 1, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 0, 1, 0, 2, 1, 2, 0, 2, 1, 0, 1, 1,
         2]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  

Train Accuracy: 42.595061483256984
Train Loss: 896.9001587629318

Epoch: 25
tensor([[0, 2, 2, 2, 1, 0, 0, 1, 2, 2, 1, 0, 2, 1, 0, 0, 2, 1, 0, 2, 1, 0, 1, 2,
         0, 2, 0, 0, 2, 0, 0, 1, 2, 0, 1, 2, 2, 1, 0, 1, 0, 1, 2, 0, 2, 1, 1, 1,
         2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 0, 1, 1, 2, 1, 0, 0, 1, 1, 0, 1, 1,
         2, 0, 1, 0, 0, 0, 1, 0, 2, 2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 1, 2, 2, 1, 0,
         0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 1, 1, 0, 2,
         0, 1, 1, 2, 1, 0, 2, 1, 1, 2, 2, 2, 1, 2, 1, 0, 1, 2, 0, 2, 1, 0, 0, 1,
         2]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  

Train Accuracy: 43.26733515821124
Train Loss: 880.2718498706818

Epoch: 33
tensor([[0, 2, 2, 2, 2, 0, 0, 2, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 1, 2,
         0, 1, 1, 0, 2, 1, 0, 1, 2, 0, 1, 2, 2, 2, 0, 1, 0, 1, 2, 2, 1, 1, 0, 1,
         2, 1, 2, 0, 1, 1, 0, 1, 2, 0, 1, 2, 0, 1, 0, 2, 1, 0, 2, 1, 2, 0, 1, 1,
         2, 0, 1, 0, 0, 0, 2, 0, 2, 2, 1, 1, 2, 0, 1, 2, 2, 0, 0, 2, 1, 0, 1, 1,
         2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 1, 0, 2,
         2, 1, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 1,
         2]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

Train Accuracy: 43.665877146949946
Train Loss: 872.0020631551743

Epoch: 41
tensor([[0, 2, 2, 2, 1, 0, 0, 1, 1, 2, 1, 0, 2, 1, 0, 2, 2, 0, 1, 2, 0, 2, 1, 2,
         0, 1, 2, 0, 2, 0, 0, 2, 0, 0, 1, 1, 2, 2, 0, 1, 0, 1, 2, 0, 2, 1, 2, 1,
         1, 0, 2, 0, 1, 1, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 0, 0, 2, 1, 0, 1, 1,
         2, 2, 1, 0, 0, 0, 1, 0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0, 1, 2,
         0, 1, 2, 0, 1, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 2, 2, 0, 1, 0, 2,
         2, 1, 1, 2, 1, 0, 2, 1, 0, 0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 1,
         2]], device='cuda:0')
tensor([[1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
         2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  

KeyboardInterrupt: 