In [1]:
# Import necessary libraries
import json
from nltk import word_tokenize
import numpy as np
import glob
from tqdm import tqdm
import nltk

In [2]:
# Visualization
from torch.utils.tensorboard import SummaryWriter

In [3]:
# You should upload the training data and GLoVe Embedding to your Google Drive

# Connect this Google Colab to your Google Drive storage (follow the instruction)
'''
from google.colab import drive
drive.mount('/content/drive')
'''
# Commented out because I am running on my own machine

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

In [4]:
# Check if an Nvidia GPU is available
# If not, read this tutorial to enable GPU on Google Colab
# https://www.tutorialspoint.com/google_colab/google_colab_using_free_gpu.htm
!nvidia-smi

Fri May 28 19:34:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 466.47       Driver Version: 466.47       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:65:00.0  On |                  N/A |
| 41%   50C    P0    47W / 215W |    784MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
def load_glove(path, dim=300):
    """
    GLoVe embedding is a way to map a word into a fixed-dimension vector.
    This function load the GLoVe embedding
    :param path:
    :param dim: dimesion of the word vector
    :return: a 2D numpy matrix and a dictionary that maps a word into index in the numpy matrix
    """
    matrix = []
    word_index = dict()

    # Add a zero vector of the same size as "<PAD>" token, index of 0
    # Add a random vector of the same size as "<UNK>" token, index of 1

    matrix.append([0.] * dim)
    matrix.append([0.] * dim)
    word_index['<PAD>'] = 0
    word_index['<UNK>'] = 1
    # Load from glove
    #
    with open(path, encoding='latin-1') as f:
        lines = f.readlines()
        for l in lines:
            parts = l.split(' ')
            vector = [float(x) for x in parts[1:]]
            matrix.append(vector)
            word_index[parts[0]] = len(word_index)

    matrix = np.array(matrix, dtype=np.float)
    return matrix, word_index

In [6]:
# Actually call the function to load the GLoVe
import os
matrix, word_index = load_glove('C:/Users/warre/hw4/glove.6B.50d.txt', 50)

In [7]:
# More libraries and download data for the word tokenizer.

import torch
from torch.utils.data import Dataset
import nltk
#nltk.download('punkt')
from nltk import word_tokenize

In [8]:
# A "Dataset" class manage the loading/preprocessing/sampling/packaging of input data
# It is used by a "DataLoader" in the later part of the code
class ImdbDataset(Dataset):

    def __init__(self, data_file_path, word_index, max_length):
        super(ImdbDataset, self).__init__()
        self.word_index = word_index
        # Paragraph max length
        self.ML = max_length
        # Load data from data_file_path
        self.data = load_json(data_file_path)
        # Target is an integer representing a class
        # E.g. label="positive" -> target=1
        #      label="negative" -> target=0
        target_map ={
            'positive': 1,
            'negative': 0
        }
        # Tokenize and initialize the target for each data point.
        for i, d in enumerate(self.data):
          # Tokenize paragraphs into words and punctuations
          # Each of the splitted string is called a "token"
          tokens = word_tokenize(d['text'].lower())

          # Indices stores the index of the token in the GLoVe embedding matrix
          indices = []
          for x in tokens:
            if x in word_index:
                indices.append(word_index[x])
            else:
                indices.append(word_index['<UNK>'])
          
          # Gather everything, and store them into self.data
          self.data[i]['token'] = tokens
          self.data[i]['indices'] = indices
          self.data[i]['target'] = target_map[d['label']]


    def __len__(self):
        # Return the length of the dataset, basically, the number of data points
        return len(self.data)

    def all_targets(self):
        # Return all the targets of the dataset, orderly.
        return [x['target'] for x in self.data]

    def __getitem__(self, idx):
        """
        :param idx: an index of a data point from the dataset.
        """
        # Just pick it from self.data
        item = self.data[idx]

        # Crop the sentence upto a certain length
        indices = item['indices'][:self.ML]

        # Pad sentence: append <pad_token_index> to the sentence which is shorter than maximum length.
        l = len(indices)
        if l < self.ML:
            indices += [0 for _ in range(self.ML - l)] # 0 is the index of a dummy pad token
            # Make sure that the sentence is cropped and padded correctly
        assert len(indices) == self.ML
        return {
            'indices': indices,
            'target': item['target']
        }

    @staticmethod
    def pack(items):
        """
        :param items: list of items, each item is an object returned from __getitem__ function
        :return:
        """
        # Pack item into batch
        # Each batch is a dictionary (similar to each item)
        batch = {
            'indices': torch.LongTensor([x['indices'] for x in items]),
            'target': torch.LongTensor([x['target'] for x in items])
        }
        return batch

In [9]:
import torch.nn as nn

# This is a simple version of the CNN for text classification proposed by Yoon Kim
# Access the paper here: https://arxiv.org/pdf/1408.5882.pdf
class BaseModel(nn.Module):

    def __init__(self, embedding_matrix, args):
        """
        :param: embedding_matrix: the GLoVe embedding matrix
        :param: args: an object of "Argument" class, this class is defined in the later part of the code
        """

        super(BaseModel, self).__init__()
        self.device = args.device
        hidden_size = args.hidden_size

        # create an embedding module
        N, D = embedding_matrix.shape
        self.embedding = nn.Embedding(N, D, _weight=torch.FloatTensor(embedding_matrix))

        # Disable gradient update of embedding
        self.embedding.weight.requires_grad = False
        self.embedding_dim = D


        # Define the layers
        self.conv = nn.Conv1d(D, hidden_size, kernel_size=args.kernel_size)
        self.max_pool = nn.MaxPool1d(args.max_length - args.kernel_size + 1)
        self.fc = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(args.dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(args.dropout),
            nn.Linear(args.hidden_size, args.n_class)
        )

    def forward(self, batch):
        # B denotes batch_size
        # L denotes sentence length
        # D denotes vector dimension

        # Get embedding
        embedding = self.embedding(batch['indices'].to(self.device))  # size of (B x L x D)
        # print('| embedding', tuple(embedding.shape))
        x = embedding.transpose(dim0=1, dim1=2)  # B x D x L
        # Feed through the neural network
        conv_x = self.conv(x)  # B x D x L
        # print('| conv_x', tuple(conv_x.shape))

        max_pool = self.max_pool(conv_x) # B x D x 1

        logits = max_pool.squeeze(dim=2)  # B x D
        # print('| logits', tuple(logits.shape))

        # Calculate the prediction
        predictions = torch.argmax(logits, dim=-1)

        return logits, predictions

In [10]:
# Calculate the accuracy score
# Do not change this
from sklearn.metrics import accuracy_score
def metrics(predictions: list, targets: list):
    """

    :param predictions:
    :param targets:
    :return:
    """
    return accuracy_score(targets, predictions)


In [11]:
def train_and_evaluate(model, train_dl, dev_dl, optimizer, args):
    """
    Implementation of stochastic gradient decent
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    writer = SummaryWriter('./logs/experiment_F1_Adadelta_hs_11_ReLU_ml_1024')
    localMax = 0
    lowerTime = 0
    
    for e in range(args.epoch):

        # Training
        model.train()
        train_targets, train_preds = [], []
        for batch in train_dl:
            optimizer.zero_grad()
            logits, preds = model(batch)
            train_preds += preds.detach().cpu().numpy().tolist()
            targets = batch['target'].numpy().tolist()
            train_targets += targets
            loss = loss_fn(logits, batch['target'].to(args.device))
            loss.backward()
            optimizer.step()
        train_acc = metrics(train_preds, train_targets)

        # Evaluation
        model.eval()
        dev_targets, dev_preds = [], []
        for batch in dev_dl:
            s, preds = model(batch)
            dev_preds += preds.detach().cpu().numpy().tolist()
            targets = batch['target'].numpy().tolist()
            dev_targets += targets
        dev_acc = metrics(dev_preds, dev_targets)
        dev_loss = (1 / len(dev_preds)) * sum(((dev_targets[i] - dev_preds[i]) ** 2) for i in range (len(dev_preds)))
        train_loss = (1 / len(train_preds)) * sum(((train_targets[i] - train_preds[i]) ** 2) for i in range (len(train_preds)))
        
        # Early terminat
        
        if dev_acc > localMax: 
            localMax = dev_acc
            lowerTime = 0
        else:
            lowerTime += 1
        if lowerTime > 50:
            print('Early terminated')
            break
        
        
        
        # Logging the epoch and scores
        if e % 5 == 0:
            print(f'Epoch {e} Train={train_acc:.4f} Dev={dev_acc:.4f} Train_loss={train_loss:.4f} Dev_loss={dev_loss:.4f}')
        
        writer.add_scalars('acc', {'train': train_acc}, e)
        writer.add_scalars('acc', {'dev': dev_acc}, e)
        writer.add_scalars('loss', {'train': train_loss}, e)
        writer.add_scalars('loss', {'dev': dev_loss}, e)
        
    writer.close()
    print('Max acc: ', localMax)
        

In [12]:
def load_json(path):
    """
    Load a json file, return the data
    :param path:
    :return:
    """
    print('Loading', path, end=' ')
    with open(path, 'r', encoding='latin-1') as f:
        data = json.load(f)
    print(len(data))
    return data

In [13]:
CONST_MAX_LENGTH=1024    # Shorter = faster training, Longer=(possibly) higher accuracy
train_dataset = ImdbDataset('C:/Users/warre/hw4/train.json', word_index, CONST_MAX_LENGTH)

Loading C:/Users/warre/hw4/train.json 25000


In [14]:
dev_dataset = ImdbDataset('C:/Users/warre/hw4/dev.json', word_index, CONST_MAX_LENGTH)

Loading C:/Users/warre/hw4/dev.json 25000


In [15]:
# Define hyperparameters
# Feel free to define as much as you need
# This help the finetuning more organized

# ADJUST THESE HYPERPARAMETERS TO COMPLETE THE HOMEWORK

class Argument:
  n_class= 2    # Number of classes (dont change this)
  max_length=CONST_MAX_LENGTH #If you change this, you have to reload the train_dataset and dev_dataset

  glove= 'C:/Users/warre/hw4/glove.6B.300d.txt'  # GLoVe embedding version, try all given versions
  
  # Model arguments
  dropout= 0.5          # Dropout rate          Try [0.2:0.8]
  hidden_size= 100      # Hidden layer size     Try [64:512]
  kernel_size= 11        # CNN kernel size       Try [3,5,7,9,11]

  # Training arguments
  epoch= 1000           # Number of training epochs.  Try [20:200]
  lr= 0.01              # Learning rate               Try [1e-2:1e-4]
  batch_size= 50        # Batch size                  Try [32:128]
  optimizer= torch.optim.Adadelta        # Optimizer       Try [SGD, Adam, Adadelta]

args = Argument

# Setup the CUDA device
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
args = Argument
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Make sure the cuda is used
print('Using device: ', args.device)

Using device:  cuda


In [16]:
# Create a dataloader object
from torch.utils.data import DataLoader
train_dl = DataLoader(train_dataset, 
                      batch_size=args.batch_size, # Mini-batch
                      shuffle=True,               # Stochastic 
                      #num_workers=4,              # 4 external processes dedicated for preprocessing data
                      collate_fn=ImdbDataset.pack)# Pack separate samples into a batch
dev_dl = DataLoader(dev_dataset, 
                    batch_size=args.batch_size, 
                    shuffle=False,                # Don't shuffle in evaluation
                    #num_workers=4,
                    collate_fn=ImdbDataset.pack)
# Load GLoVe embedding
embedding_matrix, _ = load_glove(args.glove, dim=300)

# Create the model object
model = BaseModel(embedding_matrix, args)
# Send the model to GPU
model.to(args.device)

# Select all trainable parameters
params = [x for x in model.parameters() if x.requires_grad == True]
# Create an optimizer object
optimizer = torch.optim.Adadelta(params, lr=args.lr)


In [17]:
# Just print out to see the model architecture
print(model)
# Actual training
train_and_evaluate(model, train_dl, dev_dl, optimizer, args)
print('Done')

BaseModel(
  (embedding): Embedding(400002, 300)
  (conv): Conv1d(300, 100, kernel_size=(11,), stride=(1,))
  (max_pool): MaxPool1d(kernel_size=1014, stride=1014, padding=0, dilation=1, ceil_mode=False)
  (fc): Sequential(
    (0): ReLU()
    (1): Dropout(p=0.5, inplace=False)
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
    (5): Linear(in_features=100, out_features=2, bias=True)
  )
)
Epoch 0 Train=0.4960 Dev=0.5004 Train_loss=150.6640 Dev_loss=0.4996
Epoch 5 Train=0.6053 Dev=0.5866 Train_loss=0.3947 Dev_loss=0.4134
Epoch 10 Train=0.6390 Dev=0.6161 Train_loss=0.3610 Dev_loss=0.3839
Epoch 15 Train=0.6697 Dev=0.6488 Train_loss=0.3303 Dev_loss=0.3512
Epoch 20 Train=0.7092 Dev=0.6894 Train_loss=0.2908 Dev_loss=0.3106
Epoch 25 Train=0.7430 Dev=0.7296 Train_loss=0.2570 Dev_loss=0.2704
Epoch 30 Train=0.7713 Dev=0.7574 Train_loss=0.2287 Dev_loss=0.2426
Epoch 35 Train=0.7916 Dev=0.7809 Train_loss=0.2084 Dev_loss=0.2191
Epo