In [1]:
import json
import math
import os
import random
from tqdm.notebook import tqdm, trange
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm.notebook import tqdm, trange
from sklearn.metrics import confusion_matrix

In [2]:
# Special marker strings; ref_token is the separator placed between a
# reference sentence and its candidate when the two are joined.
src_token, ref_token, cnd_token = '<SRC>', '<REF>', '<CND>'


In [3]:
# Parallel per-example containers: entry i of every list describes the same
# training example. Each name gets its own, independent empty list.
sources, references, candidates, scores, labels = ([] for _ in range(5))

In [4]:
# Parse train.txt. Each example occupies six lines: source, reference,
# candidate, score, label, and one blank separator line.
with open('train.txt', 'r', encoding='utf8') as f:
    lines = f.read().split('\n')
num_lines = len(lines)

# Empty the containers first so that re-running this cell does not append the
# whole file a second time (the lists are created in an earlier cell).
for container in (sources, references, candidates, scores, labels):
    container.clear()

LINES_PER_RECORD = 6   # five fields + one blank separator
FIELDS_PER_RECORD = 5

for start in range(0, num_lines, LINES_PER_RECORD):
    record = lines[start:start + FIELDS_PER_RECORD]

    # Fewer than five lines can only be the tail of the file (e.g. the empty
    # string produced by a trailing newline), so stop there. A final record
    # whose blank separator line is missing is still complete and is kept.
    if len(record) < FIELDS_PER_RECORD:
        break
    # Ignore runs of blank lines rather than failing on float('').
    if not any(record):
        continue

    source, reference, candidate, score, label = record
    # Convert the score before touching any list so that a malformed record
    # cannot leave the parallel lists with different lengths.
    score = float(score)

    sources.append(source)
    references.append(reference)
    candidates.append(candidate)
    scores.append(score)
    labels.append(label)

In [5]:
# Class balance: count the examples labelled 'H' and 'M' and derive
# inverse-frequency weights for the loss function.
num_H = labels.count('H')
num_M = labels.count('M')
num_labelled = num_H + num_M

frac_pos = num_M / num_labelled
frac_neg = 1.0 - frac_pos
# Index 0 weights the 'H' share (frac_neg), index 1 the 'M' share (frac_pos).
loss_weights = [1 / frac_neg, 1 / frac_pos]

print(num_H, num_M)
print('Baseline accuracy: ', num_H / num_labelled)
print(loss_weights)

312 272
Baseline accuracy:  0.5342465753424658
[1.8717948717948718, 2.1470588235294117]


In [6]:
# Build one string per example: "<reference> <REF> <candidate>".
# Note that each entry is a single whitespace-delimited string, not a token list.
ref_separator = f' {ref_token} '
english_tokens = [
    ref_separator.join((reference, candidate))
    for reference, candidate in zip(references, candidates)
]

In [7]:
# Sanity check: show the first joined reference / candidate string.
print(english_tokens[0])

bahraini princess marries us soldier , astonishing 5 year bond comes to end <REF> bahraini princess marries a u.s. soldier ; astounding marriage dissolves in 5 years


In [8]:
# Use text8's word embeddings
# NOTE(review): this import sits in the middle of the notebook; it belongs in
# the import cell at the top so dependencies are visible in one place.
import gensim
# The model file is looked up one directory above the current working directory.
save_path = os.path.join(os.getcwd(), "../text8.model")
# NOTE(review): gensim's load() is pickle-based according to its documentation,
# so only load model files from a trusted source.
embeddings = gensim.models.Word2Vec.load(save_path)

In [9]:
def vectorize(sentences, labels, bleu_scores,
              embedding_model=None, embedding_length=256, pad_val=100):
    '''
    Converts every token of every sentence into its word-embedding vector.

    Each sentence becomes a (max_sentence_len x embedding_length) float matrix
    with one row per token:
      * a known token     -> its embedding vector
      * an unknown token  -> a row of zeros (this includes marker tokens such
                             as ref_token, which are not looked up specially)
      * unused tail rows  -> filled with pad_val so that a consumer can tell
                             padding apart from real (possibly unknown) tokens

    Parameters
    ----------
    sentences : iterable of str or iterable of token sequences
        A plain string is split on whitespace. The strings produced by joining
        reference and candidate sentences are of this kind; iterating such a
        string directly would yield single characters instead of words.
    labels : iterable of str
        'H' maps to class 0, anything else to class 1.
    bleu_scores : iterable of float
        One score per sentence.
    embedding_model : optional
        Object supporting `word in model` and `model[word]`, or a gensim model
        exposing such an object as `.wv`. Defaults to the notebook-level
        `embeddings`.
    embedding_length : int
        Length of one embedding vector (must match the model).
    pad_val : float
        Fill value for padding rows.

    Returns
    -------
    list of (FloatTensor[max_len, embedding_length], LongTensor[1], FloatTensor[1])
        One triple per sentence; an empty list for empty input.
    '''
    if embedding_model is None:
        embedding_model = embeddings
    # gensim Word2Vec models keep their vectors on `.wv`; mappings and
    # KeyedVectors objects are used as they are.
    keyed_vectors = getattr(embedding_model, 'wv', embedding_model)

    tokenized = [s.split() if isinstance(s, str) else list(s) for s in sentences]
    max_sentence_len = max((len(tokens) for tokens in tokenized), default=0)

    vectorized = []

    for tokens, label, score in zip(tokenized, labels, bleu_scores):

        # Start from an all-padding matrix; real tokens overwrite the leading rows.
        vector_array = torch.full((max_sentence_len, embedding_length), float(pad_val))
        label_val    = torch.LongTensor([0 if label == 'H' else 1])
        bleu_score   = torch.FloatTensor([score])

        for i, word in enumerate(tokens):
            if word in keyed_vectors:
                # Copy into a fresh float32 array: the model may hand out
                # read-only or float64 views.
                vector = np.array(keyed_vectors[word], dtype=np.float32)
                vector_array[i, :] = torch.from_numpy(vector)
            else:
                # Unknown word: zeros, so the row still counts as a real token.
                vector_array[i, :] = 0.0

        vectorized.append((vector_array, label_val, bleu_score))

    return vectorized
            

In [10]:
# Turn every joined sentence string into an (embedding matrix, label, score) triple.
vector_tokens = vectorize(english_tokens, labels, scores)




In [11]:
class CustomDataset(Dataset):
    """
    In-memory dataset built from (X, y, bleu) tensor triples.

    The per-example tensors are combined once at construction time:
    X gains a new leading example dimension, while the one-element y and
    bleu tensors are concatenated into flat vectors.
    """

    def __init__(self, data):
        matrices, targets, bleu_values = [], [], []
        for matrix, target, bleu_value in data:
            matrices.append(matrix)
            targets.append(target)
            bleu_values.append(bleu_value)

        self.X = torch.stack(matrices)
        self.y = torch.cat(targets)
        self.bleu = torch.cat(bleu_values)
        self.len = len(data)

    def __len__(self):
        """Number of examples."""
        return self.len

    def __getitem__(self, index):
        """Return the (X, y, bleu) entries stored at `index`."""
        example = (self.X[index], self.y[index], self.bleu[index])
        return example

def get_data_loaders(train, val, batch_size=16):
    """
    Build a training and a validation DataLoader over one shared dataset.

    `train` and `val` are lists of example triples. They are concatenated into
    a single CustomDataset; each loader then samples (in random order) only
    from its own index range: the first len(train) positions for training,
    the remaining positions for validation.

    Returns (train_loader, val_loader).
    """
    num_train = len(train)
    num_total = num_train + len(val)
    dataset = CustomDataset(train + val)

    def make_loader(indices):
        # Restrict the shared dataset to the given positions.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=SubsetRandomSampler(indices))

    train_loader = make_loader(list(range(num_train)))
    val_loader = make_loader(list(range(num_train, num_total)))

    return train_loader, val_loader

In [12]:
class LSTM(nn.Module):
    """
    LSTM sentence-pair classifier with an extra scalar (BLEU) feature.

    The final hidden state of the last LSTM layer (both directions when
    `bi=True`) is concatenated with the BLEU score and mapped to two class
    log-probabilities by a linear layer.

    Parameters
    ----------
    hidden : int
        Hidden size of the LSTM.
    bi : bool
        Use a bidirectional LSTM.
    drop : float
        Dropout probability, applied between LSTM layers and before the
        linear layer.
    layers : int
        Number of stacked LSTM layers.
    class_weights : sequence of float, optional
        Per-class loss weights. Defaults to the notebook-level `loss_weights`.
    """

    def __init__(self, hidden=64, bi=False, drop=0.0, layers=1, class_weights=None):
        super(LSTM, self).__init__()

        self.input_dim  = 256
        self.output_dim = 2
        self.hidden     = hidden
        self.bi         = bi
        # Value marking padding rows in the input matrices.
        self.pad_val    = 100

        self.lstm = nn.LSTM(input_size=self.input_dim,
                            hidden_size=self.hidden,
                            num_layers=layers,
                            bidirectional=bi,
                            # Inter-layer dropout does nothing for a single
                            # layer; passing 0 there avoids PyTorch's warning.
                            dropout=drop if layers > 1 else 0.0)

        # The classifier sees one hidden vector per direction plus the BLEU
        # score, so a unidirectional model needs `hidden + 1` inputs rather
        # than `2 * hidden + 1`.
        num_directions = 2 if bi else 1
        self.fc         = nn.Linear(1 + num_directions * hidden, self.output_dim)
        self.activation = nn.ReLU()
        self.softmax    = nn.LogSoftmax(dim=1)

        if class_weights is None:
            class_weights = loss_weights
        # forward() already returns log-probabilities; CrossEntropyLoss applies
        # log-softmax once more, which leaves log-probabilities unchanged.
        self.loss       = nn.CrossEntropyLoss(torch.FloatTensor(class_weights))
        self.dropout    = nn.Dropout(drop)

    def forward(self, inputs, bleu):
        """
        Parameters
        ----------
        inputs : FloatTensor (batch, max_sentence_len, 256)
            Sentence matrices. Rows whose first component equals `pad_val`
            are treated as padding and excluded from the LSTM.
        bleu : FloatTensor with one value per batch element.

        Returns
        -------
        FloatTensor (batch, 2) of class log-probabilities.
        """
        # Number of non-padding rows per sentence. pack_padded_sequence needs
        # the lengths on the CPU.
        lengths = (inputs[:, :, 0] != self.pad_val).sum(dim=1).cpu()

        # nn.LSTM expects (seq_len, batch, features) by default.
        inputs = inputs.transpose(1, 0)

        # Pack so that padding rows are never fed through the LSTM.
        packed = nn.utils.rnn.pack_padded_sequence(inputs, lengths,
                                                   batch_first=False, enforce_sorted=False)

        _, (hn, _) = self.lstm(packed)

        # hn holds one entry per (layer, direction); the last layer's entries
        # come last.
        if self.bi:
            fc_input = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        else:
            # Take the last layer explicitly: squeezing dim 0 only works when
            # there is exactly one layer.
            fc_input = hn[-1, :, :]

        bleu = torch.reshape(bleu, (len(bleu), 1))
        fc_input = torch.cat((fc_input, bleu), 1)

        fc_output = self.fc(self.dropout(fc_input))
        predicted = self.softmax(fc_output)

        return predicted

In [15]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_epoch(model, train_loader, optimizer):
    """
    Run one optimisation pass over `train_loader`.

    For every batch the model is evaluated, `model.loss` is back-propagated
    and `optimizer` takes one step. Training accuracy and the loss summed
    over all batches are printed; nothing is returned.

    Each batch of `train_loader` must be an (inputs, labels, bleu) triple.
    """
    # Enable training-time behaviour (e.g. dropout).
    model.train()

    total_loss = 0.0
    correct    = 0
    total      = 0

    for (input_batch, expected_out, bleu) in tqdm(train_loader, leave=False, desc="Training Batches"):
        expected_out = expected_out.to(device)

        optimizer.zero_grad()

        outputs = model(input_batch.to(device), bleu.to(device))
        loss = model.loss(outputs, expected_out)

        # Gradient descent
        loss.backward()
        optimizer.step()

        # .item() yields a plain float; adding the loss tensor itself would
        # keep an autograd-tracked tensor alive for every batch of the epoch.
        total_loss += loss.item()

        # Count correct predictions on whichever device the batch lives on.
        predicted = outputs.argmax(dim=1)
        correct  += (predicted == expected_out).sum().item()
        total    += expected_out.size(0)

    # An empty loader would otherwise divide by zero.
    accuracy = correct / total if total else float('nan')
    print('Train Acc:', accuracy)
    print('Train loss:', total_loss)
    
def evaluate(model, val_loader):
    """
    Compute the F1 score of class 1 over `val_loader`.

    Confusion-matrix counts are accumulated over all batches, the resulting
    F1 score is printed and returned. When the score is undefined (no true
    positives at all) 0.0 is returned.

    Each batch of `val_loader` must be an (inputs, labels, bleu) triple.
    """
    # Evaluation-time behaviour (e.g. dropout off). This alone does not stop
    # autograd from recording, hence the no_grad() block below.
    model.eval()

    true_pos  = 0
    true_neg  = 0
    false_pos = 0
    false_neg = 0

    with torch.no_grad():
        for (input_batch, expected_out, bleu) in tqdm(val_loader, leave=False, desc="Validation"):

            outputs = model(input_batch.to(device), bleu.to(device))
            _, predicted = torch.max(outputs, 1)

            # Keep track of total confusion matrix statistics
            gold = expected_out.to('cpu').numpy()
            pred = predicted.to('cpu').numpy()
            # Fix the label set: without it a batch containing a single class
            # produces a 1x1 matrix and the indexing below fails.
            CM   = confusion_matrix(gold, pred, labels=[0, 1])
            true_neg  += int(CM[0][0])
            false_neg += int(CM[1][0])
            true_pos  += int(CM[1][1])
            false_pos += int(CM[0][1])

    # Calculate F1 score, treating every undefined ratio as 0.
    actual_pos    = true_pos + false_neg
    predicted_pos = true_pos + false_pos
    recall    = true_pos / actual_pos if actual_pos else 0.0
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    if precision + recall:
        f1 = 2.0 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    print('Validation F1 score:', f1)
    return f1

def train_and_evaluate(num_epochs, model, train_loader, val_loader=None):
    """
    Train `model` for `num_epochs` epochs with Adam (lr=0.001), evaluating
    after every epoch.

    Parameters
    ----------
    num_epochs : int
    model : the model to train
    train_loader : loader passed to train_epoch
    val_loader : loader passed to evaluate, optional
        When omitted, the notebook-level variable named `val_loader` is used,
        which is what this function relied on before the parameter existed.

    Returns
    -------
    The F1 score of the last epoch (0 when `num_epochs` is 0).
    """
    if val_loader is None:
        # Backward compatibility with callers that only pass train_loader.
        try:
            val_loader = globals()['val_loader']
        except KeyError:
            raise NameError("no val_loader was passed and no notebook-level "
                            "'val_loader' is defined")

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    f1 = 0
    for epoch in trange(num_epochs, desc="Epochs"):
        train_epoch(model, train_loader, optimizer)
        f1 = evaluate(model, val_loader)
    return f1

In [None]:
# Repeated random hold-out evaluation: NUM_RUNS independent 80/20 splits, each
# with a freshly initialised model. (The splits are drawn at random, so
# validation sets of different runs may overlap; this is not k-fold CV.)
NUM_RUNS       = 5
TRAIN_FRACTION = 0.80
NUM_EPOCHS     = 20
SEED           = 0

# Seed the split shuffling as well as torch's weight initialisation and batch
# sampling so that a fresh "Restart & Run All" reproduces the same numbers.
random.seed(SEED)
torch.manual_seed(SEED)

total_f1 = 0
for i in range(NUM_RUNS):
    # Shuffle a copy: vector_tokens itself stays in file order, so re-running
    # this cell does not depend on how often it has been run before.
    shuffled = random.sample(vector_tokens, len(vector_tokens))
    split = int(TRAIN_FRACTION * len(shuffled))

    # val_loader is deliberately a notebook-level name: train_and_evaluate is
    # only handed train_loader and looks the validation loader up globally.
    train_loader, val_loader = get_data_loaders(shuffled[:split],
                                                shuffled[split:],
                                                batch_size=32)

    model = LSTM(hidden=64, bi=True, layers=3, drop=0.05).to(device)

    total_f1 += train_and_evaluate(NUM_EPOCHS, model, train_loader)

print('Average F1 score:', total_f1 / NUM_RUNS)