In [129]:
import json
import math
import os
from pathlib import Path
import random
import time
from tqdm.notebook import tqdm, trange
from typing import Dict, List, Set, Tuple
import string
import re
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm.notebook import tqdm, trange
from sklearn.metrics import confusion_matrix

In [16]:
src_token = '<SRC>'
ref_token = '<REF>'
cnd_token = '<CND>'


In [7]:
sources    = []
references = []
candidates = []
scores     = []
labels     = []

In [14]:
with open('train.txt', 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
    num_lines = len(lines)
    
    lines = (l for l in lines)
    
    for _ in range(num_lines // 6):
        sources.append(next(lines))
        references.append(next(lines))
        candidates.append(next(lines))
        scores.append(float(next(lines)))
        labels.append(next(lines))
        next(lines) # ignore newline char

In [106]:
num_H = sum([1 if x == 'H' else 0 for x in labels])
num_M = sum([1 if x == 'M' else 0 for x in labels])
print(num_H, num_M)
print('Baseline: ', num_H / (num_H + num_M) )

1872 1632
Baseline:  0.5342465753424658


In [20]:
english_tokens = [(' %s ' % ref_token).join(ref_cand) for ref_cand in zip(references, candidates)]

In [21]:
print(english_tokens[0])

bahraini princess marries us soldier , astonishing 5 year bond comes to end <REF> bahraini princess marries a u.s. soldier ; astounding marriage dissolves in 5 years


In [24]:
import gensim
save_path = os.path.join(os.getcwd(), "../text8.model")
embeddings = gensim.models.Word2Vec.load(save_path)

In [96]:
def vectorize(sentences, labels):
    
    embedding_length = 256
    pad_val = 100
    max_sentence_len = max(len(s) for s in sentences)
    
    vectorized = []
    
    
    for sentence, label in zip(sentences, labels):
        
        vector_array = torch.zeros(max_sentence_len, embedding_length)
        label_val = torch.LongTensor([0 if label == 'H' else 1])
        
        
        for i,word in enumerate(sentence):
            
            # Unknown case
            if word not in embeddings: continue
            
            vector_array[i,:] = torch.from_numpy(embeddings[word])
        
        vectorized.append( (vector_array, label_val) )
    
    return vectorized
            

In [97]:
vector_tokens = vectorize(english_tokens, labels)
random.shuffle(vector_tokens)



In [98]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.X = torch.cat([X.unsqueeze(0) for X, _ in data])
        self.y = torch.cat([y for _, y in data])
        self.len = len(data)
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

def get_data_loaders(train, val, batch_size=16):
    
    dataset = CustomDataset(train + val)

    train_indices = [i for i in range(len(train))]
    val_indices   = [i for i in range(len(train), len(train) + len(val))]

    train_sampler = SubsetRandomSampler(train_indices)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    
    val_sampler = SubsetRandomSampler(val_indices)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

    return train_loader, val_loader

In [99]:
split = int(0.80 * len(vector_tokens))
train_loader, val_loader = get_data_loaders(vector_tokens[:split], 
                                            vector_tokens[split:],
                                            batch_size=32)

In [100]:
class LSTM(nn.Module):
    def __init__(self, hidden=64, bi=False, drop=0.0, layers=1):
        super(LSTM, self).__init__()
        
        self.input_dim = 256
        self.output_dim = 2
        
        self.hidden = hidden
        
        self.lstm = nn.LSTM(input_size=self.input_dim,
                           hidden_size=self.hidden,
                           num_layers=layers,
                           bidirectional=bi,
                           dropout=drop)
        
        self.fc = nn.Linear(2 * hidden, self.output_dim)
        
        self.activation = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()
        self.dropout = nn.Dropout(drop)
    
    def forward(self, inputs):
        
        batch_size = inputs.size()[0]
        embed_size = 256
        pad_val    = 100
        
        mask = lambda x: torch.nonzero(x != pad_val)
        
        lengths = torch.tensor( [mask(inputs[i][:,0]).size()[0] for i in range(batch_size)] )
        
        inputs = inputs.transpose(1, 0)
        input_size = inputs.size()
        
        h0 = torch.randn(1, input_size[1], self.hidden)
        
        packed = nn.utils.rnn.pack_padded_sequence(inputs, lengths,
                                                  batch_first=False, enforce_sorted=False)
        
        output, (hn, _) = self.lstm(packed)
        
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(output)
        
        
        fc_input = self.dropout(torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1))
        fc_output = self.fc(fc_input)
        predicted = self.softmax(fc_output)
        
        return predicted

In [135]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_epoch(model, train_loader, optimizer):

    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for (input_batch, expected_out) in tqdm(train_loader, leave=False, desc="Training Batches"):
        optimizer.zero_grad()
        
        outputs = model(input_batch.to(device)).to(device)
        total += outputs.size()[0]
        
        _, predicted = torch.max(outputs, 1)
        
        match = (expected_out.to('cpu') == predicted.to('cpu')).cpu().numpy().sum()
        correct += match
        
        loss = model.loss(outputs.to(device), expected_out.to(device))
        total_loss += loss
        
        loss.backward()
        optimizer.step()
    
    print('Acc:', correct / total)
    print(total_loss)
    
def evaluate(model, val_loader):
    model.eval()
    
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    
    for (input_batch, expected_out) in tqdm(val_loader, leave=False, desc="Validation"):

        outputs = model(input_batch.to(device)).to(device)
        _, predicted = torch.max(outputs, 1)
        
        
        gold = expected_out.to('cpu').numpy()
        pred = predicted.to('cpu').numpy()
        
        CM = confusion_matrix(gold, pred)

        true_neg += CM[0][0]
        false_neg += CM[1][0]
        true_pos += CM[1][1]
        false_pos += CM[0][1]
    
    recall    = true_pos / (true_pos + false_neg)
    precision = true_pos / (true_pos + false_pos)
    f1 = 2.0 * precision * recall / (precision + recall)
    print('F1:', f1)
    
def train_and_evaluate(num_epochs, model, train_loader):
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in trange(num_epochs, desc="Epochs"):
        train_epoch(model, train_loader, optimizer)
        evaluate(model, val_loader)

In [137]:
print(device)
model = LSTM(hidden=64, bi=True, layers=2).to(device)

train_and_evaluate(20, model, train_loader)

cuda


HBox(children=(FloatProgress(value=0.0, description='Epochs', max=20.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.5301462718515876
tensor(60.9031, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: nan




HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.5722440242597218
tensor(59.7463, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.4878957169459963


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.6278986799857296
tensor(56.7315, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.5364963503649636


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.6525151623260792
tensor(54.6720, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.5529010238907851


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.6889047449161613
tensor(51.5379, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.6308243727598567


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.7338565822333214
tensor(46.6672, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.6666666666666666


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.7531216553692472
tensor(43.4881, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.6375


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.7887977167320728
tensor(40.1431, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.7357859531772575


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.8319657509810917
tensor(33.0967, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.8169440242057489


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.8230467356403853
tensor(33.3975, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.8246153846153846


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.8747770246164823
tensor(24.3895, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.8151986183074266


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.8901177310024974
tensor(21.3854, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.8612903225806451


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.91616125579736
tensor(16.7819, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.8963317384370015


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.9336425258651445
tensor(14.4534, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.9475357710651828


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.9418480199785944
tensor(12.7506, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.9651898734177214


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.9311452015697467
tensor(15.2784, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.9528301886792452


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.9732429539778809
tensor(6.6722, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.9719626168224299


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.984659293613985
tensor(3.9219, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.9795918367346939


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.9782376025686764
tensor(5.3797, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.870307167235495


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=88.0, style=ProgressStyle(descript…

Acc: 0.9275775954334642
tensor(19.2446, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, description='Validation', max=22.0, style=ProgressStyle(description_wi…

F1: 0.9782608695652173

