In [147]:
from gensim import downloader
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
tokenize_model_1 = downloader.load('glove-wiki-gigaword-100')
tokenize_model_2 = downloader.load('word2vec-google-news-300')

In [253]:
# Pre-processing and tokenization
def load_data(path_to_data):
    sentences = []
    sentence = []
    pos = []

    # read data
    with open(path_to_data, 'r', encoding='utf-8') as f:
        data = f.readlines()

    # split data
    for line in data:
        if line != '\n':
            line_data = line.split('\t')
            # take only indexes 0,1,3,6
            word  = [int(line_data[0]), line_data[1], line_data[3], int(line_data[6])]
            pos.append(word[2])
            sentence.append(word)
        else:
            sentences.append([[0,"~","ROOT",0]] + sentence)
            sentence = []
    return sentences, list(set(pos + ["ROOT"]))


def pos_to_oneHot(pos_list):
    tensor_dim = len(pos_list)
    pos_to_vec = {}
    for pos in pos_list:
        one_hot_tensor = torch.zeros(tensor_dim)
        one_hot_tensor[pos_list.index(pos)] = 1
        pos_to_vec[pos] = one_hot_tensor

    return pos_to_vec


def tokenize(sentences, glove_model,w2v_model, glove_length, w2v_length, pos_list):
    pos_to_vec = pos_to_oneHot(pos_list)
    set_data = []
    tokenized_sen = []
    counter_zero = 0
    counter_all = 0
    flag_glove = True
    flag_w2v = True
    for sentence in sentences:
        for word in sentence:
            counter_all += 1
            if word[1] not in glove_model.key_to_index:
                flag_glove = False
                word_vec_1 = torch.zeros(glove_length)
            else:
                word_vec_1 = torch.Tensor(glove_model[word[1]].tolist())
            if word[1] not in w2v_model.key_to_index:
                flag_w2v = False
                word_vec_2 = torch.zeros(w2v_length)
            else:
                word_vec_2 = torch.Tensor(w2v_model[word[1]].tolist())
            if not flag_glove and not flag_w2v:
                counter_zero += 1

            word_vec = torch.cat((word_vec_1, word_vec_2))
            pos_vec = pos_to_vec[word[2]]
            final_vec = torch.cat((word_vec, pos_vec))
            tokenized_sen.append(final_vec)
            flag_glove = True
            flag_w2v = True

        set_data.append(torch.stack(tokenized_sen))
        tokenized_sen = []

    print(f"managed to tokenize {(1-(counter_zero/counter_all))*100} of the data")
    return set_data

def build_headers_target(sentence):
    target = [0]*len(sentence)
    for i in range(len(sentence)):
        target[i] = sentence[i][3]

    return torch.Tensor(target)

In [254]:
class scoring_nn(nn.Module):
    def __init__(self, input_size):
        super(scoring_nn, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size=125, num_layers=4, bidirectional=True, dropout=0.1)
        self.scoring = nn.Sequential(
            nn.Linear(500, 125),
            nn.ReLU(),
            nn.Linear(125, 1)
        )

    def init_first_hidden(self):
        return torch.zeros(8, 1, 125), torch.zeros(8, 1, 125)

    def possible_headers(self, sentence, current_index):
        all_pairs = []
        for i in range(len(sentence)):
            concat_vec = torch.cat((sentence[i], sentence[current_index]), dim=1)
            concat_vec = concat_vec.view(-1)
            all_pairs.append(concat_vec)
        return torch.stack(all_pairs)

    def forward(self, x):
        input = x.unsqueeze(1)
        h0, c0 = self.init_first_hidden()
        out, _ = self.lstm(input, (h0, c0))
        scoring_mat = torch.zeros(len(x), len(x))
        for i in range(len(out)):
            possible_headers = self.possible_headers(out, i)
            scores = []
            for concat_vec in possible_headers:
                score = self.scoring(concat_vec).tolist()[0]
                scores.append(score)
            scores_tensor = torch.Tensor(scores)
            scoring_mat[i] = F.log_softmax(scores_tensor, dim=0)

        scoring_mat.fill_diagonal_(0)
        return scoring_mat

In [262]:
def train_model(model, train_data, original_sentences, epochs):
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(epochs):
        for i, sentence in enumerate(train_data):
            labels = build_headers_target(original_sentences[i])
            optimizer.zero_grad()
            scores_mat = model(sentence)
            loss = criterion(scores_mat, labels.long())
            print(f"loss: {loss}")
            loss.backward()
            optimizer.step()
        print(f"epoch {epoch} loss: {loss.item()}")
    return model

In [189]:
def main():
    train_sentences, pos_train = load_data('train.labeled')
    test_sentences, pos_test = load_data('test.labeled')
    tokenized_train = tokenize(train_sentences, tokenize_model_1, tokenize_model_2, 100,300, pos_train)
    tokenized_test = tokenize(test_sentences, tokenize_model_1, tokenize_model_2, 100,300, pos_test)
    model = scoring_nn(tokenized_train[0][0].shape[0])
    model = train_model(model, tokenized_train,train_sentences, 10)

In [263]:
main()

managed to tokenize 98.79614127401737 of the data
managed to tokenize 99.12734452122409 of the data
loss: -0.04985513910651207


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn