In [44]:
# Author: Robert Guthrie
# Modifications: Volodymyr Lut; lut@ucu.edu.ua

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from conllu import parse
import numpy as np
from tqdm import tqdm

import pandas as pd

torch.manual_seed(1)

<torch._C.Generator at 0x1194c28b0>

In [17]:
TEST_FILE_PATH = 'test.txt'
TRAIN_FILE_PATH = 'train.txt'
DEV_FILE_PATH = 'dev.txt'

In [138]:
def parse_file_format(path):
    with open(path, 'r') as f:
        data_raw= f.read()
    paragraphs = data_raw.split('\n\n')
    # drop empty lines
    paragraphs = [x for x in paragraphs if len(x) > 0]
    res = []
    for paragraph in paragraphs:
        res.append([x.split() for x in paragraph.split('\n')])
    return res

In [139]:
test = parse_file_format(TEST_FILE_PATH)
test.pop(0)
test[0]

[['SOCCER', 'NN', 'B-NP', 'O'],
 ['-', ':', 'O', 'O'],
 ['JAPAN', 'NNP', 'B-NP', 'B-LOC'],
 ['GET', 'VB', 'B-VP', 'O'],
 ['LUCKY', 'NNP', 'B-NP', 'O'],
 ['WIN', 'NNP', 'I-NP', 'O'],
 [',', ',', 'O', 'O'],
 ['CHINA', 'NNP', 'B-NP', 'B-PER'],
 ['IN', 'IN', 'B-PP', 'O'],
 ['SURPRISE', 'DT', 'B-NP', 'O'],
 ['DEFEAT', 'NN', 'I-NP', 'O'],
 ['.', '.', 'O', 'O']]

In [140]:
train = parse_file_format(TRAIN_FILE_PATH)
train.pop(0)

[['-DOCSTART-', '-X-', '-X-', 'O']]

In [141]:
dev = parse_file_format(DEV_FILE_PATH)
dev.pop(0)

[['-DOCSTART-', '-X-', '-X-', 'O']]

In [142]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model... ")
    f = open(gloveFile,'r')
    model = {}
    all_vectors = []
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        all_vectors.append(embedding)
        model[word] = embedding
    print("Calculating average vector...")
    model['UNKNOWN'] = np.mean(all_vectors, axis=0)
    print("Done.",len(model)," words loaded!")
    
    return model

In [143]:
# Wikipedia 2014 + Gigaword 5 word 2 vec pre-trained glove file downloaded from https://nlp.stanford.edu/projects/glove/
model = loadGloveModel('glove.6B.50d.txt')

Loading Glove Model... 
Calculating average vector...
Done. 400001  words loaded!


In [144]:
model['UNKNOWN']

array([-0.12920061, -0.28866239, -0.01224894, -0.05676689, -0.20211109,
       -0.08389026,  0.33359737,  0.16045146,  0.03867495,  0.17833092,
        0.0469662 , -0.00285779,  0.29099851,  0.04613723, -0.20923842,
       -0.066131  , -0.06822448,  0.07665885,  0.31339918,  0.17848512,
       -0.12257719, -0.09916928, -0.07495973,  0.06413206,  0.14441256,
        0.608946  ,  0.17463101,  0.05335403, -0.01273826,  0.03474108,
       -0.81239567, -0.04688727,  0.20193533,  0.20311115, -0.03935654,
        0.06967518, -0.01553655, -0.03405275, -0.06528025,  0.12250092,
        0.13992005, -0.17446305, -0.08011841,  0.08495219, -0.01041645,
       -0.13704901,  0.20127088,  0.10069294,  0.00653007,  0.0168515 ])

In [145]:
data = train + test + dev

In [146]:
def calc_strat_1(x):
    try:
        val = model[x]
    except:
        val = model['UNKNOWN']
    return val    
strat_1 = [x + [calc_strat_1(x[0])] for paragraph in data for x in paragraph]
strat_1[0]

['EU',
 'NNP',
 'B-NP',
 'B-ORG',
 array([-0.12920061, -0.28866239, -0.01224894, -0.05676689, -0.20211109,
        -0.08389026,  0.33359737,  0.16045146,  0.03867495,  0.17833092,
         0.0469662 , -0.00285779,  0.29099851,  0.04613723, -0.20923842,
        -0.066131  , -0.06822448,  0.07665885,  0.31339918,  0.17848512,
        -0.12257719, -0.09916928, -0.07495973,  0.06413206,  0.14441256,
         0.608946  ,  0.17463101,  0.05335403, -0.01273826,  0.03474108,
        -0.81239567, -0.04688727,  0.20193533,  0.20311115, -0.03935654,
         0.06967518, -0.01553655, -0.03405275, -0.06528025,  0.12250092,
         0.13992005, -0.17446305, -0.08011841,  0.08495219, -0.01041645,
        -0.13704901,  0.20127088,  0.10069294,  0.00653007,  0.0168515 ])]

In [147]:
strat_2 = [x + [calc_strat_1(x[0].lower())] for paragraph in data for x in paragraph]

In [148]:
def calc_strat_3(x):
    val = model['UNKNOWN']
    try:
        val = model[x]
    except:
        try:
            val = model[x.lower()]
        except:
            val = model['UNKNOWN']
    return val    

strat_3 = [x + [calc_strat_3(x[0])] for paragraph in data for x in paragraph]

In [149]:
def calculate_strat_general(data, strat = 1):
    if(strat != 1 and strat != 2 and strat != 3):
        raise Exception('Invalid strat param')
    else:
        if strat == 1:
            return [x + [calc_strat_1(x[0])] for paragraph in data for x in paragraph]
        if strat == 2:
            return [x + [calc_strat_1(x[0].lower())] for paragraph in data for x in paragraph]
        if strat == 3:
            return [x + [calc_strat_3(x[0])] for paragraph in data for x in paragraph]

In [160]:
def prepare_sequence(seq, to_ix, tags = False):
    i = 3 if tags else 0
    idxs = [to_ix[w[i]] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = train

word_to_ix = {}
tag_to_ix = {}

temp = [(x[0], x[3]) for paragraph in training_data for x in paragraph]

for entry in tqdm(temp):
    word = entry[0]
    tag = entry[1]
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)
    if tag not in tag_to_ix:
        tag_to_ix[tag] = len(tag_to_ix)

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

100%|██████████| 204566/204566 [00:00<00:00, 807695.78it/s]


In [161]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [162]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(sentence, tag_to_ix, True)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-2.0092, -2.0966, -2.0410, -2.1845, -2.2540, -2.3749, -2.2938, -2.2933,
         -2.2947],
        [-2.1073, -2.1909, -2.0165, -2.0810, -2.2301, -2.2634, -2.3351, -2.2654,
         -2.3373],
        [-2.1294, -2.0735, -2.1012, -2.0974, -2.2408, -2.2190, -2.3953, -2.2767,
         -2.2886],
        [-2.0858, -1.9972, -2.1066, -2.1894, -2.2293, -2.3118, -2.2929, -2.2812,
         -2.3369],
        [-2.0982, -2.0723, -2.1488, -2.1968, -2.0697, -2.3121, -2.3961, -2.2573,
         -2.2762],
        [-2.1762, -1.9784, -2.0795, -2.2048, -2.2208, -2.2622, -2.4025, -2.2582,
         -2.2511],
        [-2.0871, -2.1038, -2.1405, -2.2227, -2.1997, -2.3158, -2.3003, -2.2088,
         -2.2213],
        [-2.0393, -2.1191, -2.1812, -2.2145, -2.2709, -2.3560, -2.2422, -2.2480,
         -2.1390],
        [-2.0162, -2.2410, -2.1673, -2.1586, -2.3548, -2.3332, -2.2228, -2.1687,
         -2.1543]])


IndexError: string index out of range

In [155]:
training_data

[['EU',
  'NNP',
  'B-NP',
  'B-ORG',
  array([-0.12920061, -0.28866239, -0.01224894, -0.05676689, -0.20211109,
         -0.08389026,  0.33359737,  0.16045146,  0.03867495,  0.17833092,
          0.0469662 , -0.00285779,  0.29099851,  0.04613723, -0.20923842,
         -0.066131  , -0.06822448,  0.07665885,  0.31339918,  0.17848512,
         -0.12257719, -0.09916928, -0.07495973,  0.06413206,  0.14441256,
          0.608946  ,  0.17463101,  0.05335403, -0.01273826,  0.03474108,
         -0.81239567, -0.04688727,  0.20193533,  0.20311115, -0.03935654,
          0.06967518, -0.01553655, -0.03405275, -0.06528025,  0.12250092,
          0.13992005, -0.17446305, -0.08011841,  0.08495219, -0.01041645,
         -0.13704901,  0.20127088,  0.10069294,  0.00653007,  0.0168515 ])],
 ['rejects',
  'VBZ',
  'B-VP',
  'O',
  array([-0.15761 , -0.13796 , -0.42215 ,  0.25714 ,  0.2735  ,  0.80252 ,
          0.75804 ,  0.25174 , -0.011099,  0.5511  , -0.15435 , -0.082309,
          0.084994, -0.23917 ,