In [1]:
import os
import nltk
import re
import numpy as np
import copy
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
from nltk.tokenize import  word_tokenize
device = 'cuda' if torch.cuda.is_available() else 'cpu'
from data_preprocessing import open_data, tokenize, tag_document, data_to_seq, glove_emb_matrix
from validation import precision, recall, f1, retrive_phrase


#  DATA PREPROCESSING

In [2]:
nltk.download('punkt')

documents = {}
labels = {}

#directories
dir_Tu= "/Users/kmirai/Downloads/NLPProject-master/Hulth2003/Training"

dir_valeria_train = "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"
dir_valeria_val = "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Validation"
dir_valeria_test = "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Test"

dir_anna = "/Users/annasotnikova/Downloads/Hulth2003/Training"


[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#open data
documents_train, labels_train = open_data(dir_valeria_train)
documents_val, labels_val = open_data(dir_valeria_val)
documents_test, labels_test = open_data(dir_valeria_test)

# tokenize data
tokenized_documents_train, tokenized_labels_train = tokenize(documents_train, labels_train)
tokenized_documents_val, tokenized_labels_val = tokenize(documents_val, labels_val)
tokenized_documents_test, tokenized_labels_test = tokenize(documents_test, labels_test)

# create sequence of labels (tags) for the documents
tags_train = tag_document(tokenized_documents_train, tokenized_labels_train)
tags_val = tag_document(tokenized_documents_val, tokenized_labels_val)
tags_test = tag_document(tokenized_documents_test, tokenized_labels_test)

# GLOVE embeddings

In [4]:
# here we download pretrained glove embeddings
import numpy as np
glove = dict()
embed_size = 100
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove[word] = coefs
f.close()

glove_size = 100

In [5]:
# Create vocabulary from all data 
X_train_eng = [doc for doc in copy.deepcopy(tokenized_documents_train).values()]
X_val_eng = [doc for doc in copy.deepcopy(tokenized_documents_val).values()]
X_test_eng = [doc for doc in copy.deepcopy(tokenized_documents_test).values()]
X_full = X_train_eng + X_val_eng + X_test_eng

# Our vocab: all the words in all abstracts
target_vocab = list(set([token for doc in X_full for token in doc]))
# Dictionary with all words and their indices
vocab_ind_dict = dict(zip(target_vocab, range(0, len(target_vocab)))) 
# Embedding matrix
embed_matrix = glove_emb_matrix(vocab_ind_dict, glove, glove_size)

In [6]:
# Prepare data for network
X_train = data_to_seq(X_train_eng, vocab_ind_dict)
X_val = data_to_seq(X_val_eng, vocab_ind_dict)
X_test = data_to_seq(X_test_eng, vocab_ind_dict)
kp_train = [doc for doc in copy.deepcopy(tokenized_labels_train).values()]
tags_train = [doc for doc in copy.deepcopy(tags_train).values()]
kp_val = [doc for doc in copy.deepcopy(tokenized_labels_val).values()]
tags_val = [doc for doc in copy.deepcopy(tags_val).values()]
kp_test = [doc for doc in copy.deepcopy(tokenized_labels_test).values()]
tags_test = [doc for doc in copy.deepcopy(tags_test).values()]

# BiLSTM Net

In [7]:
class LSTM(nn.Module):
    def __init__(self, nb_layers = 1, nb_lstm_units=150, nb_lin_units=150, embedding_dim=100, batch_size=1):
        super(LSTM, self).__init__()
        
        self.vocab = target_vocab
        self.nb_lstm_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.nb_lin_units = nb_lin_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.nb_tags = 3
        nb_vocab_words = len(self.vocab)

        
        # Embedding layer
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim)
        self.word_embedding.load_state_dict({'weight': torch.Tensor(embed_matrix)})
        self.word_embedding.weight.requires_grad = False
        # LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_lstm_layers,
            batch_first=True, bidirectional=True)
        
        # dropout layer
        self.drop = torch.nn.Dropout(p=0.25)

        # linear layers
        self.lin1 = nn.Linear(2*self.nb_lstm_units, self.nb_lin_units)
        self.lin2 = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(2, self.batch_size, self.nb_lstm_units).to(device)
        hidden_b = torch.randn(2, self.batch_size, self.nb_lstm_units).to(device)
        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X):
        
        self.hidden = self.init_hidden()
        X = self.word_embedding(X)
        batch_size, seq_len, _ = X.size()

        X, self.hidden = self.lstm(X, self.hidden)
        X = self.drop(X)
        X = X.contiguous()
        X = X.view(-1, X.shape[2])
        X = F.relu(self.lin1(X))
        X = self.drop(X)
        X = self.lin2(X)

        X = X.view(batch_size, self.nb_tags, -1)

        return X

In [8]:
net = LSTM().to(device)
weight = torch.tensor([1/10, 1, 1]).to(device)
criterion= torch.nn.CrossEntropyLoss(weight)
optimizer = torch.optim.RMSprop(net.parameters(), lr=0.001)

In [9]:
def train(X, y, epoch):
    train_loss = 0.0
    total = 0
    correct = 0
    for i, doc in enumerate(X):
        inputs = torch.LongTensor(doc).view([1, len(doc)]).to(device)
        labels = torch.LongTensor(y[i]).view([1, len(y[i])]).to(device) 
        # zero the parameter gradients
        optimizer.zero_grad()
        # compute predictions
        outputs = (net(inputs))
        # compute loss
        loss = criterion(outputs, labels)
        # compute gradients
        loss.backward()
        # update parameters
        optimizer.step()
        # statistics to display
        train_loss += loss.item()
        _,predicted = outputs.max(1)
        total += labels[0].size(0)
        correct += predicted.eq(labels).sum().item()
        #if i==0: 
        #    print(predicted)

    acc = 100.*correct/total
    print('Train Accuracy:', acc)
    print('Train Loss:', train_loss)
    return(train_loss)


def document_prediction(document_seq, model):
    inputs = torch.LongTensor(document_seq).view([1, len(document_seq)]).to(device) 
    outputs = (net(inputs))
    _,predicted = outputs.max(1)
    return predicted

def validate(documents_eng, kp_eng, documents_seq, tags, model, epoch):
    prec = 0
    rec = 0
    f_score = 0
    acc = 0
    for idx, document_eng in enumerate(documents_eng):
        # our document
        document_seq = documents_seq[idx]
        kp_true = kp_eng[idx]
        tags_true = tags[idx]
        # predicted tags
        tags_predicted = document_prediction(document_seq, model)
        tags_predicted = tags_predicted.cpu().numpy()[0]
        # predicted kp
        kp_predicted = retrive_phrase(tags_predicted, document_eng)
        # compute precision, recall, f_score, accuracy
        prec += precision(kp_true, kp_predicted)
        rec += recall(kp_true, kp_predicted)
        f_score += f1(kp_true, kp_predicted)
        acc += sum(np.equal(tags_true, tags_predicted))/len(tags_true)
        if idx == 1 and epoch%10 == 0:
            print("kp_true",kp_true)
            print("tags_predicted", tags_predicted)
            print("kp_predicted", kp_predicted)
    return prec/len(documents_eng), rec/len(documents_eng), f_score/len(documents_eng), acc/len(documents_eng)
        
        
def main(num_epochs, net):
    for epoch in range(num_epochs):
        print('\nEpoch: %d' % epoch)
        net = net.train() 
        train_loss = train(X_train, tags_train, epoch)
        #get predictions, and labels, map
        net = net.eval() 
        pr, r, f, acc = validate(X_val_eng, kp_val, X_val, tags_val, net, epoch)
        print('Validation Accuracy:', acc) 
        print('Validation Precision:', pr)
        print('Validation Recall:', r)
        print('Validation F-score:', f) 
        
    
    
    


In [10]:
main(30, net)


Epoch: 0
Train Accuracy: 34.819946955621425
Train Loss: 1093.2722862362862
kp_true [['single-phase', 'half-bridge', 'rectifier', 'topology'], ['neutral', 'point', 'switch', 'clamped', 'scheme'], ['pwm', 'control', 'schemes'], ['power', 'quality', 'compensation'], ['sinusoidal', 'line', 'current'], ['current', 'distortion'], ['power', 'switches', 'control', 'signals'], ['dc', 'link', 'voltage', 'balance', 'compensator'], ['line', 'current', 'controller'], ['dc', 'link', 'voltage', 'regulator'], ['hysteresis', 'current', 'control', 'scheme'], ['line', 'current', 'command', 'tracking'], ['harmonic', 'currents', 'elimination'], ['circuit', 'configuration']]
tags_predicted [2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0
 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2
 1 0 2 1 0 1 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1
 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 0 2 1 2]
kp_predicted [['single-phase', 'half-br

Train Accuracy: 61.103152878437605
Train Loss: 772.8805380536942
Validation Accuracy: 0.5211996564622914
Validation Precision: 0.020655575682192614
Validation Recall: 0.07888103683389802
Validation F-score: 0.03106611437602478

Epoch: 20
Train Accuracy: 62.38245847929993
Train Loss: 753.2066877511679
kp_true [['single-phase', 'half-bridge', 'rectifier', 'topology'], ['neutral', 'point', 'switch', 'clamped', 'scheme'], ['pwm', 'control', 'schemes'], ['power', 'quality', 'compensation'], ['sinusoidal', 'line', 'current'], ['current', 'distortion'], ['power', 'switches', 'control', 'signals'], ['dc', 'link', 'voltage', 'balance', 'compensator'], ['line', 'current', 'controller'], ['dc', 'link', 'voltage', 'regulator'], ['hysteresis', 'current', 'control', 'scheme'], ['line', 'current', 'command', 'tracking'], ['harmonic', 'currents', 'elimination'], ['circuit', 'configuration']]
tags_predicted [1 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 2 2 0 2 0 2 