In [217]:
# Author: Robert Guthrie
# Modifications: Volodymyr Lut; lut@ucu.edu.ua

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from conllu import parse
import numpy as np
from tqdm import tqdm

import tensorflow as tf

import pandas as pd

from statistics import harmonic_mean

torch.manual_seed(1)

<torch._C.Generator at 0x1194c28b0>

In [17]:
TEST_FILE_PATH = 'test.txt'
TRAIN_FILE_PATH = 'train.txt'
DEV_FILE_PATH = 'dev.txt'

In [138]:
def parse_file_format(path):
    with open(path, 'r') as f:
        data_raw= f.read()
    paragraphs = data_raw.split('\n\n')
    # drop empty lines
    paragraphs = [x for x in paragraphs if len(x) > 0]
    res = []
    for paragraph in paragraphs:
        res.append([x.split() for x in paragraph.split('\n')])
    return res

In [139]:
test = parse_file_format(TEST_FILE_PATH)
test.pop(0)
test[0]

[['SOCCER', 'NN', 'B-NP', 'O'],
 ['-', ':', 'O', 'O'],
 ['JAPAN', 'NNP', 'B-NP', 'B-LOC'],
 ['GET', 'VB', 'B-VP', 'O'],
 ['LUCKY', 'NNP', 'B-NP', 'O'],
 ['WIN', 'NNP', 'I-NP', 'O'],
 [',', ',', 'O', 'O'],
 ['CHINA', 'NNP', 'B-NP', 'B-PER'],
 ['IN', 'IN', 'B-PP', 'O'],
 ['SURPRISE', 'DT', 'B-NP', 'O'],
 ['DEFEAT', 'NN', 'I-NP', 'O'],
 ['.', '.', 'O', 'O']]

In [140]:
train = parse_file_format(TRAIN_FILE_PATH)
train.pop(0)

[['-DOCSTART-', '-X-', '-X-', 'O']]

In [141]:
dev = parse_file_format(DEV_FILE_PATH)
dev.pop(0)

[['-DOCSTART-', '-X-', '-X-', 'O']]

In [142]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model... ")
    f = open(gloveFile,'r')
    model = {}
    all_vectors = []
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        all_vectors.append(embedding)
        model[word] = embedding
    print("Calculating average vector...")
    model['UNKNOWN'] = np.mean(all_vectors, axis=0)
    print("Done.",len(model)," words loaded!")
    
    return model

In [143]:
# Wikipedia 2014 + Gigaword 5 word 2 vec pre-trained glove file downloaded from https://nlp.stanford.edu/projects/glove/
model = loadGloveModel('glove.6B.50d.txt')

Loading Glove Model... 
Calculating average vector...
Done. 400001  words loaded!


In [144]:
model['UNKNOWN']

array([-0.12920061, -0.28866239, -0.01224894, -0.05676689, -0.20211109,
       -0.08389026,  0.33359737,  0.16045146,  0.03867495,  0.17833092,
        0.0469662 , -0.00285779,  0.29099851,  0.04613723, -0.20923842,
       -0.066131  , -0.06822448,  0.07665885,  0.31339918,  0.17848512,
       -0.12257719, -0.09916928, -0.07495973,  0.06413206,  0.14441256,
        0.608946  ,  0.17463101,  0.05335403, -0.01273826,  0.03474108,
       -0.81239567, -0.04688727,  0.20193533,  0.20311115, -0.03935654,
        0.06967518, -0.01553655, -0.03405275, -0.06528025,  0.12250092,
        0.13992005, -0.17446305, -0.08011841,  0.08495219, -0.01041645,
       -0.13704901,  0.20127088,  0.10069294,  0.00653007,  0.0168515 ])

In [145]:
data = train + test + dev

In [146]:
def calc_strat_1(x):
    try:
        val = model[x]
    except:
        val = model['UNKNOWN']
    return val    
strat_1 = [x + [calc_strat_1(x[0])] for paragraph in data for x in paragraph]
strat_1[0]

['EU',
 'NNP',
 'B-NP',
 'B-ORG',
 array([-0.12920061, -0.28866239, -0.01224894, -0.05676689, -0.20211109,
        -0.08389026,  0.33359737,  0.16045146,  0.03867495,  0.17833092,
         0.0469662 , -0.00285779,  0.29099851,  0.04613723, -0.20923842,
        -0.066131  , -0.06822448,  0.07665885,  0.31339918,  0.17848512,
        -0.12257719, -0.09916928, -0.07495973,  0.06413206,  0.14441256,
         0.608946  ,  0.17463101,  0.05335403, -0.01273826,  0.03474108,
        -0.81239567, -0.04688727,  0.20193533,  0.20311115, -0.03935654,
         0.06967518, -0.01553655, -0.03405275, -0.06528025,  0.12250092,
         0.13992005, -0.17446305, -0.08011841,  0.08495219, -0.01041645,
        -0.13704901,  0.20127088,  0.10069294,  0.00653007,  0.0168515 ])]

In [147]:
strat_2 = [x + [calc_strat_1(x[0].lower())] for paragraph in data for x in paragraph]

In [148]:
def calc_strat_3(x):
    val = model['UNKNOWN']
    try:
        val = model[x]
    except:
        try:
            val = model[x.lower()]
        except:
            val = model['UNKNOWN']
    return val    

strat_3 = [x + [calc_strat_3(x[0])] for paragraph in data for x in paragraph]

In [149]:
def calculate_strat_general(data, strat = 1):
    if(strat != 1 and strat != 2 and strat != 3):
        raise Exception('Invalid strat param')
    else:
        if strat == 1:
            return [x + [calc_strat_1(x[0])] for paragraph in data for x in paragraph]
        if strat == 2:
            return [x + [calc_strat_1(x[0].lower())] for paragraph in data for x in paragraph]
        if strat == 3:
            return [x + [calc_strat_3(x[0])] for paragraph in data for x in paragraph]

In [160]:
def prepare_sequence(seq, to_ix, tags = False):
    i = 3 if tags else 0
    idxs = [to_ix[w[i]] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = train

word_to_ix = {}
tag_to_ix = {}

temp = [(x[0], x[3]) for paragraph in training_data for x in paragraph]

for entry in tqdm(temp):
    word = entry[0]
    tag = entry[1]
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)
    if tag not in tag_to_ix:
        tag_to_ix[tag] = len(tag_to_ix)

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

100%|██████████| 204566/204566 [00:00<00:00, 807695.78it/s]


In [161]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [222]:
def weighted_harmonic_mean(prec, rec, b):
    return (1 + b**2) * (prec * rec) / (((b**2 * prec) + rec) if prec > 0 or rec > 0 else 1)

In [227]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

batch_size = 128
num_epochs = 5

batches = [training_data[x:x+batch_size] for x in range(0, len(tag_to_ix), batch_size)]

batches[0]
for epoch in range(num_epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    print("Epoch " + str(epoch))
    for batch in batches:
        for sentence in batch:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of word indices.

            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(sentence, tag_to_ix, True)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)
            
            predicted = [int(np.argmax(tag_scores[i].clone().detach())) for i in range(tag_scores.shape[0])]
            actual = [tag_to_ix[x[3]] for x in sentence]
            
            tags_num = range(len(tag_to_ix))
            
            true_positives = len([1 for label in tags_num for (x, y) in zip(predicted, actual) if x == label and y == label])
            false_positives = len([1 for label in tags_num for (x, y) in zip(predicted, actual) if x == label and y != label])
            false_negatives = len([1 for label in tags_num for (x, y) in zip(predicted, actual) if x != label and y == label])
            
            precision = (true_positives) / ((true_positives + false_positives) if true_positives > 0 or false_positives > 0 else 1)
            recall = (true_positives) / ((true_positives + false_negatives) if true_positives > 0 or false_negatives > 0 else 1)
            
            F1 = weighted_harmonic_mean(precision, recall, 1)
            F05 = weighted_harmonic_mean(precision, recall, 0.5)
            
            print("TP " + str(true_positives))
            print("FP " + str(false_positives))
            print("FN " + str(false_negatives))
            print("Precision " + str(precision))
            print("Recall " + str(recall))
            print("F1 " + str(F1))
            print("F0.5 " + str(F05))
            print("-----------------")
            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            # Uncomment if needed
            # print('Current batch loss: {}'.format(loss.item()))
            loss.backward()
            optimizer.step()

# # See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0], word_to_ix)
    tag_scores = model(inputs)
    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-2.2200, -2.2185, -2.3507, -1.9799, -2.3198, -2.3113, -2.2181, -2.0081,
         -2.2188],
        [-2.2103, -2.2594, -2.2221, -2.0354, -2.2930, -2.3396, -2.2231, -2.0391,
         -2.1968],
        [-2.3193, -2.1660, -2.3025, -2.0799, -2.3362, -2.2725, -2.3331, -1.9539,
         -2.0906],
        [-2.2414, -2.2289, -2.2562, -2.0633, -2.2877, -2.3157, -2.2847, -2.0148,
         -2.1294],
        [-2.3317, -2.1780, -2.3493, -2.0421, -2.3396, -2.1733, -2.3780, -2.0024,
         -2.0675],
        [-2.1848, -2.1752, -2.2542, -2.0731, -2.2462, -2.3082, -2.3503, -2.0367,
         -2.1881],
        [-2.2230, -2.1086, -2.2081, -2.1311, -2.1268, -2.1756, -2.4277, -2.0836,
         -2.3412],
        [-2.0954, -2.2086, -2.2659, -2.1650, -2.1842, -2.2595, -2.2787, -2.0571,
         -2.2884],
        [-2.2238, -2.1477, -2.3191, -2.0329, -2.3344, -2.2712, -2.2174, -2.0684,
         -2.2038]])
Epoch 0
TP 0
FP 9
FN 9
Precision 0.0
Recall 0.0
F1 0.0
F0.5 0.0
-----------------
TP 0
FP 2
FN 2
Pr

TP 9
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 8
FP 1
FN 1
Precision 0.8888888888888888
Recall 0.8888888888888888
F1 0.8888888888888888
F0.5 0.8888888888888888
-----------------
TP 9
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 7
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 5
FP 2
FN 2
Precision 0.7142857142857143
Recall 0.7142857142857143
F1 0.7142857142857143
F0.5 0.7142857142857142
-----------------
TP 5
FP 2
FN 2
Precision 0.7142857142857143
Recall 0.7142857142857143
F1 0.7142857142857143
F0.5 0.7142857142857142
-----------------
TP 1
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 8
FP 1
FN 1
Precision 0.8888888888888888
Recall 0.8888888888888888
F1 0.8888888888888888
F0.5 0.8888888888888888
-----------------
TP 1
FP 1
FN 1
Precision 0.5
Recall 0.5
F1 0.5
F0.5 0.5
-----------------
TP 10
FP 3
FN 3
Precision 0.7692307692307693
Recall 0.7692307692307693
F1 0.7692307692307693
F

TP 27
FP 3
FN 3
Precision 0.9
Recall 0.9
F1 0.9
F0.5 0.9000000000000001
-----------------
TP 31
FP 2
FN 2
Precision 0.9393939393939394
Recall 0.9393939393939394
F1 0.9393939393939394
F0.5 0.9393939393939394
-----------------
TP 12
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 1
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 6
FP 2
FN 2
Precision 0.75
Recall 0.75
F1 0.75
F0.5 0.75
-----------------
TP 1
FP 1
FN 1
Precision 0.5
Recall 0.5
F1 0.5
F0.5 0.5
-----------------
TP 26
FP 9
FN 9
Precision 0.7428571428571429
Recall 0.7428571428571429
F1 0.7428571428571429
F0.5 0.7428571428571429
-----------------
TP 39
FP 8
FN 8
Precision 0.8297872340425532
Recall 0.8297872340425532
F1 0.8297872340425532
F0.5 0.8297872340425532
-----------------
TP 29
FP 5
FN 5
Precision 0.8529411764705882
Recall 0.8529411764705882
F1 0.8529411764705882
F0.5 0.8529411764705882
-----------------
TP 15
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
------------

TP 10
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 32
FP 10
FN 10
Precision 0.7619047619047619
Recall 0.7619047619047619
F1 0.7619047619047619
F0.5 0.7619047619047619
-----------------
TP 21
FP 9
FN 9
Precision 0.7
Recall 0.7
F1 0.7
F0.5 0.7
-----------------
TP 19
FP 3
FN 3
Precision 0.8636363636363636
Recall 0.8636363636363636
F1 0.8636363636363636
F0.5 0.8636363636363636
-----------------
TP 12
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 12
FP 1
FN 1
Precision 0.9230769230769231
Recall 0.9230769230769231
F1 0.9230769230769231
F0.5 0.9230769230769231
-----------------
TP 3
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 11
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 19
FP 1
FN 1
Precision 0.95
Recall 0.95
F1 0.9500000000000001
F0.5 0.9500000000000001
-----------------
TP 33
FP 2
FN 2
Precision 0.9428571428571428
Recall 0.9428571428571428
F1 0.9428571428571428
F0.5 0.942857142857

TP 19
FP 3
FN 3
Precision 0.8636363636363636
Recall 0.8636363636363636
F1 0.8636363636363636
F0.5 0.8636363636363636
-----------------
TP 4
FP 4
FN 4
Precision 0.5
Recall 0.5
F1 0.5
F0.5 0.5
-----------------
TP 1
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 6
FP 2
FN 2
Precision 0.75
Recall 0.75
F1 0.75
F0.5 0.75
-----------------
TP 1
FP 1
FN 1
Precision 0.5
Recall 0.5
F1 0.5
F0.5 0.5
-----------------
TP 9
FP 2
FN 2
Precision 0.8181818181818182
Recall 0.8181818181818182
F1 0.8181818181818182
F0.5 0.8181818181818183
-----------------
TP 1
FP 2
FN 2
Precision 0.3333333333333333
Recall 0.3333333333333333
F1 0.3333333333333333
F0.5 0.33333333333333337
-----------------
TP 7
FP 1
FN 1
Precision 0.875
Recall 0.875
F1 0.875
F0.5 0.875
-----------------
TP 8
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 8
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 7
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
----------

TP 13
FP 2
FN 2
Precision 0.8666666666666667
Recall 0.8666666666666667
F1 0.8666666666666667
F0.5 0.8666666666666667
-----------------
TP 15
FP 1
FN 1
Precision 0.9375
Recall 0.9375
F1 0.9375
F0.5 0.9375
-----------------
TP 1
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 9
FP 1
FN 1
Precision 0.9
Recall 0.9
F1 0.9
F0.5 0.9000000000000001
-----------------
TP 2
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 32
FP 3
FN 3
Precision 0.9142857142857143
Recall 0.9142857142857143
F1 0.9142857142857143
F0.5 0.9142857142857143
-----------------
TP 28
FP 7
FN 7
Precision 0.8
Recall 0.8
F1 0.8000000000000002
F0.5 0.8000000000000002
-----------------
TP 30
FP 2
FN 2
Precision 0.9375
Recall 0.9375
F1 0.9375
F0.5 0.9375
-----------------
TP 27
FP 3
FN 3
Precision 0.9
Recall 0.9
F1 0.9
F0.5 0.9000000000000001
-----------------
TP 31
FP 2
FN 2
Precision 0.9393939393939394
Recall 0.9393939393939394
F1 0.9393939393939394
F0.5 0.9393939393939394
-------

FP 3
FN 3
Precision 0.88
Recall 0.88
F1 0.88
F0.5 0.8799999999999999
-----------------
TP 12
FP 1
FN 1
Precision 0.9230769230769231
Recall 0.9230769230769231
F1 0.9230769230769231
F0.5 0.9230769230769231
-----------------
TP 21
FP 4
FN 4
Precision 0.84
Recall 0.84
F1 0.8399999999999999
F0.5 0.8399999999999999
-----------------
TP 10
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 32
FP 10
FN 10
Precision 0.7619047619047619
Recall 0.7619047619047619
F1 0.7619047619047619
F0.5 0.7619047619047619
-----------------
TP 21
FP 9
FN 9
Precision 0.7
Recall 0.7
F1 0.7
F0.5 0.7
-----------------
TP 19
FP 3
FN 3
Precision 0.8636363636363636
Recall 0.8636363636363636
F1 0.8636363636363636
F0.5 0.8636363636363636
-----------------
TP 12
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 12
FP 1
FN 1
Precision 0.9230769230769231
Recall 0.9230769230769231
F1 0.9230769230769231
F0.5 0.9230769230769231
-----------------
TP 3
FP 0
FN 0
Precision 1.0
Recall 1.0

TP 9
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 8
FP 1
FN 1
Precision 0.8888888888888888
Recall 0.8888888888888888
F1 0.8888888888888888
F0.5 0.8888888888888888
-----------------
TP 9
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 7
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 5
FP 2
FN 2
Precision 0.7142857142857143
Recall 0.7142857142857143
F1 0.7142857142857143
F0.5 0.7142857142857142
-----------------
TP 5
FP 2
FN 2
Precision 0.7142857142857143
Recall 0.7142857142857143
F1 0.7142857142857143
F0.5 0.7142857142857142
-----------------
TP 1
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 8
FP 1
FN 1
Precision 0.8888888888888888
Recall 0.8888888888888888
F1 0.8888888888888888
F0.5 0.8888888888888888
-----------------
TP 2
FP 0
FN 0
Precision 1.0
Recall 1.0
F1 1.0
F0.5 1.0
-----------------
TP 10
FP 3
FN 3
Precision 0.7692307692307693
Recall 0.7692307692307693
F1 0.7692307692307693
F