In [1]:
import pandas as pd
import gensim.downloader
import torch
import torch.nn as nn
import numpy as np
from nltk.tokenize import word_tokenize

In [2]:
trec_train = pd.read_csv('./TREC_data/train.csv')
trec_test = pd.read_csv('./TREC_data/test.csv')
for i in range(6):
    print(f'trec_train {i}: {len(trec_train[trec_train["label-coarse"]==i])}')
    # print(f'trec_test {i}: {len(trec_test[trec_test["label-coarse"]==i])}')

# preprocess datasets
trec_train.loc[((trec_train['label-coarse']==2) | (trec_train['label-coarse']==5)), 'label-coarse'] = 'OTHERS'
trec_test.loc[((trec_test['label-coarse']==2) | (trec_test['label-coarse']==5)), 'label-coarse'] = 'OTHERS'

# split train into train & dev; dev contains 500 unique samples
dev = trec_train.sample(n = 500,replace = False)
train_new = trec_train.drop(dev.index)

print(train_new)

trec_train 0: 1162
trec_train 1: 1250
trec_train 2: 86
trec_train 3: 1223
trec_train 4: 896
trec_train 5: 835
     label-coarse  label-fine  \
0               0           0   
1               1           1   
2               0           0   
3               1           2   
4          OTHERS           3   
...           ...         ...   
5447            1          14   
5448            1          46   
5449            4          41   
5450            4          41   
5451            1          46   

                                                   text  
0     How did serfdom develop in and then leave Russ...  
1      What films featured the character Popeye Doyle ?  
2     How can I find a list of celebrities ' real na...  
3     What fowl grabs the spotlight after the Chines...  
4                       What is the full form of .com ?  
...                                                 ...  
5447            What 's the shape of a camel 's spine ?  
5448           What type of c

In [3]:
# use the pretrained word embeddings 
google_news = gensim.downloader.load('word2vec-google-news-300')

In [4]:
vocab_dict = google_news.key_to_index
vocab_dict['<PAD>'] = vocab_dict[list(vocab_dict.keys())[-1]]+1
w2v_vec = google_news.vectors
w2v_vec = np.append(w2v_vec, [np.zeros(300)], axis=0)
print(w2v_vec[vocab_dict['<PAD>']])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [30]:
# design neural network using the pretrained word embeddings
class QuestionClassifier(nn.Module):
    def __init__(self, w2v_vec, hidden_dim, output_dim, num_layers, bidirectional, dropout):
        super(QuestionClassifier, self).__init__()

        # embed with pretrained weights
        self.embedding = nn.Embedding.from_pretrained(embeddings = torch.FloatTensor(w2v_vec), freeze = True, padding_idx = vocab_dict['<PAD>'])
        
        # bidirectional RNN layer (LSTM) -- google_news.vector_size is the input size/embedding dimensions
        self.rnn = nn.LSTM(google_news.vector_size, hidden_dim, num_layers = num_layers, bidirectional = bidirectional, dropout = dropout, batch_first=True)
        
        # fully connected layer for classification
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        # output layer
        self.softmax = nn.Softmax()
        
        # dropout layer to prevent overfitting
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # embedding layer -- text: input sentence/question
        embedded = self.embedding(text)
        
        # RNN layer
        outputs, (hidden, cell) = self.rnn(embedded)
        
        # aggregate by taking the mean/average, max value or sum
        # aggregate = torch.mean(outputs, dim=1)
        # aggregate, indices = torch.max(outputs, dim=1) 
        aggregate =  torch.sum(outputs, dim=1)

        # fully connected layer
        output = self.fc(aggregate)
        
        # apply softmax to predict probabilities
        softmax_output = self.softmax(output)

        return softmax_output

In [31]:
# to convert sentences into tensors
def text_to_tensor(text, vocab_dict):
    input_text = []
    max_len = len((max([word_tokenize(i) for i in text], key=len)))
    for i in text:
        tokens = word_tokenize(i)
        # find indices of tokens from google_news w2v (drop unknown words)
        text_idx = [vocab_dict[token] for token in tokens if token in vocab_dict]
        # add padding
        text_idx = text_idx + [vocab_dict['<PAD>']] * (max_len - len(text_idx))
        input_text.append(text_idx)
        # print(input_text)
    input_text_tensor = torch.LongTensor(input_text)
    return input_text_tensor

In [32]:
# define train and test loops to be used in each epoch
def training_loop(model, x_train, y_train, batch_size, optimizer, loss_fn):
    running_loss = 0

    for size in range(0, len(x_train), batch_size):
        # obtain input data and corresponding labels
        text, labels = x_train[size:size+batch_size], y_train[size:size+batch_size]
        
        # convert text and labels into tensors
        input_text = text_to_tensor(text, vocab_dict) # using word embedding and aggregation???
        # input_labels = torch.as_tensor(labels.to_numpy(dtype=np.long), dtype=torch.long)
        input_labels = torch.as_tensor(labels.to_numpy(dtype=np.float64), dtype=torch.long)
        
        # forward pass
        softmax_output = model.forward(input_text)
        softmax_output = softmax_output.type(torch.FloatTensor)
        # input_labels = input_labels.type(torch.FloatTensor)

        loss = loss_fn(softmax_output, input_labels) # CrossEntropy expects float inputs and long labels??
        running_loss += loss.item()

        # zero the gradients
        optimizer.zero_grad()
        # backpropagation and optimization
        loss.backward()
        optimizer.step()
        # lr_scheduler.step()
    
    running_loss = running_loss / len(x_train)
        
    return running_loss


def testing_loop(model, x_test, y_test):
    test_accuracy = 0

    with torch.no_grad():
        for i in x_test.index:
            input_text = text_to_tensor([x_test[i]], vocab_dict)
            # input_labels = torch.as_tensor(y_test[i], dtype=torch.int)
            pred = model.forward(input_text)
            # print(pred.argmax())
            # print(y_test[i])
            test_accuracy += (pred.argmax() == y_test[i]).type(torch.float).sum().item()

    test_accuracy = float(test_accuracy) / len(x_test)
    
    return test_accuracy

In [33]:
class EarlyStopper:
    def __init__(self, patience, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_acc = -np.inf

    def early_stop(self, validation_acc): # min_validation_acc is the basis of comparison (the new max val_accuracy)
        if validation_acc > self.min_validation_acc:
            self.min_validation_acc = validation_acc
            self.counter = 0
        elif validation_acc <= (self.min_validation_acc + self.min_delta): # if val_accuracy does not improve for <patience> times, early stopper is enabled
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [34]:
# define parameters (higher hidden_dim, smaller learning_rate, higher no of epochs)
hidden_dim = 30 # no neurons in each layer
output_dim = 5 # no of classes
num_layers = 3
bidirectional = True
dropout = 0.2
learning_rate = 0.0001
weight_decay = 0.0000001

# initialise other variables
model = QuestionClassifier(w2v_vec, hidden_dim, output_dim, num_layers, bidirectional, dropout)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
early_stopper = EarlyStopper(patience=5)
loss_fn = nn.CrossEntropyLoss() # for multiclass classification
batch_size = 32
num_epochs = 80

In [35]:
# split data into x and y
x_train = train_new['text']
y_train = train_new['label-coarse']
x_dev = dev['text']
y_dev = dev['label-coarse']

train_small_1 = train_new[:10000]
x_train_small = train_small_1['text']
y_train_small = train_small_1['label-coarse']
dev_small = dev[:3000]
x_dev_small = dev_small['text']
y_dev_small = dev_small['label-coarse']

# train_small_1 = train_new[((train_new['label-coarse']==1) | (train_new['label-coarse']==3))]
# x_train_small = train_small_1['text']
# y_train_small = train_small_1['label-coarse']
# print(len(y_train_small[y_train_small==1]))
# dev_small = dev[((dev['label-coarse']==1) | (dev['label-coarse']==3))]
# x_dev_small = dev_small['text']
# y_dev_small = dev_small['label-coarse']

# transform 'others' label to 2
y_train_small = y_train_small.replace('OTHERS', 2)
y_dev_small = y_dev_small.replace('OTHERS', 2)

# transform 'others' label to 2
y_train = y_train.replace('OTHERS', 2)
y_dev = y_dev.replace('OTHERS', 2)


x_test = trec_test['text']
y_test = trec_test['label-coarse']
y_test = y_test.replace('OTHERS', 2)

# getting distribution of data
for i in range(5):
    print(f'y_train {i}: {len(y_train[y_train==i])}')
for i in range(5):
    print(f'y_dev {i}: {len(y_dev[y_dev==i])}')

y_train 0: 1044
y_train 1: 1125
y_train 2: 853
y_train 3: 1120
y_train 4: 810
y_dev 0: 118
y_dev 1: 125
y_dev 2: 68
y_dev 3: 103
y_dev 4: 86


In [36]:
# file = open("./q2_mean.txt", "a")
# file = open("./q2_max.txt", "a")
file = open("./q2_sum.txt", "a")
print("Epoch, Train loss, Validation accuracy", file=file)

import time 

print("Training starting...")
start_t = time.time()
accuracy = []
for epoch in range(num_epochs):
    train_loss = training_loop(model, x_train, y_train, batch_size, optimizer, loss_fn)
    val_accuracy = testing_loop(model, x_dev, y_dev)

    accuracy.append(val_accuracy)
    print(f"Epoch {epoch+1}: Train loss = {train_loss}, Validation accuracy = {val_accuracy}")

    print(epoch+1, train_loss, val_accuracy, file=file)

    if epoch >= 1 and early_stopper.early_stop(val_accuracy): # early stopper is done on test/validation datasets
        print(f"Early Stopping at {epoch+1} epochs...") 
        break
    

end_t = time.time()
print("Training finished!")

print("Time taken for training with {} epochs: {}seconds".format(epoch+1, end_t - start_t))
print("Time taken for training with {} epochs: {}seconds".format(epoch+1, end_t - start_t), file = file)


print("Testing starting...")
test_accuracy = testing_loop(model, x_test, y_test)
print(f"Test accuracy = {test_accuracy}")


print(f"Test accuracy = {test_accuracy}", file=file)
file.close()

Training starting...
Epoch 1: Train loss = 0.05030913032314504, Validation accuracy = 0.328
Epoch 2: Train loss = 0.04932729192138296, Validation accuracy = 0.396
Epoch 3: Train loss = 0.04640949460917028, Validation accuracy = 0.494
Epoch 4: Train loss = 0.04494996544617636, Validation accuracy = 0.528
Epoch 5: Train loss = 0.043582868282359716, Validation accuracy = 0.574
Epoch 6: Train loss = 0.04162421762365517, Validation accuracy = 0.614
Epoch 7: Train loss = 0.039211541125385364, Validation accuracy = 0.642
Epoch 8: Train loss = 0.03771410564709942, Validation accuracy = 0.672
Epoch 9: Train loss = 0.03665730023759632, Validation accuracy = 0.686
Epoch 10: Train loss = 0.036172728473997655, Validation accuracy = 0.712
Epoch 11: Train loss = 0.03565348717286244, Validation accuracy = 0.718
Epoch 12: Train loss = 0.0353535564679514, Validation accuracy = 0.736
Epoch 13: Train loss = 0.03499431070391697, Validation accuracy = 0.73
Epoch 14: Train loss = 0.03478640821041699, Validat

In [40]:
# compare different aggregate methods
agg_methods = ["mean", "max", "sum"]

for i in range(3):

    with open("./q2_{}.txt".format(agg_methods[i]), "r") as file:
        lines = file.readlines() 
        lines_to_read = lines[1:-2] 
        items = [line.split() for line in lines_to_read]
        train_loss = [float(x) for x in (line[1] for line in items)]
        avg1 = "{:.3f}".format(sum(train_loss) / len(train_loss))
        max1 = "{:.3f}".format(max(train_loss))
        val_accuracy = [float(x) for x in (line[2] for line in items)]
        avg2 = "{:.3f}".format(sum(val_accuracy) / len(val_accuracy))
        max2 = "{:.3f}".format(max(val_accuracy))
        test_line = lines[-1]
        test_acc = test_line.split()[-1]
    print("For {}\n\tAverage train loss: {}\n\tMax train loss: {}\n\tAverage validation accuracy: {}\n\tMax validation accuracy: {}\n\tTest accuracy: {}\n".format(agg_methods[i], avg1, max1, avg2, max2, test_acc))

# so SUM is the best aggregate method...?

For mean
	Average train loss: 0.040
	Max train loss: 0.050
	Average validation accuracy: 0.634
	Max validation accuracy: 0.758
	Test accuracy: 0.772

For max
	Average train loss: 0.038
	Max train loss: 0.050
	Average validation accuracy: 0.647
	Max validation accuracy: 0.758
	Test accuracy: 0.794

For sum
	Average train loss: 0.036
	Max train loss: 0.050
	Average validation accuracy: 0.710
	Max validation accuracy: 0.808
	Test accuracy: 0.812



In [60]:
# to print out validation accuracies for sum training

file_acc = open("./q2_sum_val_accuracies.txt", "a")

with open("./q2_sum.txt".format(agg_methods[i]), "r") as file:
    lines = file.readlines() 
    lines_to_read = lines[1:-2] 

    for line in lines_to_read:
        items = line.split()
        epoch = items[0]
        val_accuracy = float(items[2])
        print(f"Epoch {epoch}: {val_accuracy}", file=file_acc)


file.close()
