In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Loading the dataset & pre-trained word embeddings

In [None]:
import os
import sys
import torch
import torchtext.data
import torchtext.datasets
import numpy as np 
import time
import inspect

# Resolve the directory of the currently executing code and put its parent directory at the
# front of sys.path. Presumably this is what makes the `cs236781` package imported below
# resolvable from this notebook — confirm against the repository layout.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 

from cs236781.train_results import FitResult
from cs236781 import plot

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device =", device)

Load the dataset:

In [None]:
# Choose the dataset root directory. The Linux branch points at a machine-specific
# project folder; every other platform uses a folder under the user's home directory.
# `sys.platform` is used directly (sys is imported at the top of the notebook) instead of
# reaching it through `os.sys`, which is an implementation detail of the os module.
if sys.platform == 'linux':
    data_dir = os.path.expanduser('~/HW4/project/GloVe')
else:
    data_dir = os.path.expanduser('~/.pytorch-datasets')

# torchtext Field objects parse text (e.g. a review) and create a tensor representation

# This Field object will be used for tokenizing the movie reviews text:
# lower-cased, spaCy-tokenized, wrapped in <sos>/<eos> markers.
review_parser = torchtext.data.Field(
    sequential=True, use_vocab=True, lower=True,
    init_token='<sos>', eos_token='<eos>', dtype=torch.long,
    tokenize='spacy', tokenizer_language='en_core_web_sm'
)

# This Field object converts the text labels into numeric values (0,1,2)
label_parser = torchtext.data.Field(
    is_target=True, sequential=False, unk_token=None, use_vocab=True
)

# Load SST, tokenize the samples and labels
# ds_X are Dataset objects which will use the parsers to return tensors
ds_train, ds_valid, ds_test = torchtext.datasets.SST.splits(
    review_parser, label_parser, root=data_dir
)

n_train = len(ds_train)
print(f'Number of training   samples: {n_train}')
print(f'Number of validation samples: {len(ds_valid)}')
print(f'Number of test       samples: {len(ds_test)}')

As required, we'll use the pre-trained GloVe 6B word embeddings.

In [None]:
# Parse the GloVe 6B vectors (the 200-dimensional file, per its name).
# Each line has the form "<word> <v1> <v2> ... <vN>"; after this cell `vocab[i]` is the
# word whose vector is `embeddings[i]`.
vocab, embeddings = [], []
with open('./GloVe/glove.6B.200d.txt', 'rt', encoding='utf8') as fi:
    # Stream the file line by line rather than reading the whole text and keeping a
    # second full list of lines in memory; each line is split only once.
    for line in fi:
        line = line.rstrip('\n')
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        i_word, *i_values = line.split(' ')
        vocab.append(i_word)
        embeddings.append([float(val) for val in i_values])
    

Add the padding and unknown tokens to the vocabulary and embeddings arrays:

In [None]:
# Add the padding and the unknown tokens to the vocab and embeddings arrays.
# After this cell, row 0 is '<pad>' (all zeros) and row 1 is '<unk>' (mean of all vectors).

vocab = np.asarray(vocab)
embeddings = np.asarray(embeddings)

# The cell rebinds `vocab` and `embeddings`, so without this guard a second execution would
# prepend the special rows again (and compute the '<unk>' mean over the already padded matrix).
if vocab.size < 2 or vocab[0] != '<pad>' or vocab[1] != '<unk>':
    unk_emb = np.mean(embeddings, axis=0, keepdims=True)
    pad_emb = np.zeros((1, embeddings.shape[1]), dtype=embeddings.dtype)

    vocab = np.insert(vocab, 0, ['<pad>', '<unk>'])
    embeddings = np.vstack((pad_emb, unk_emb, embeddings))

print(embeddings.shape)
print(vocab[:10])

## Train loop 

In [None]:
def train(model, optimizer, loss_fn, dl_train, dl_test, max_epochs=100,
          num_batches=400, print_every=1, save_path=None):
    """Train `model` and evaluate it after every epoch.

    Args:
        model: Callable module; `model(X)` must return a pair whose second element holds
            the per-class scores fed to `loss_fn` (argmax over dim 1 is the prediction).
        optimizer: Optimizer updating the model's parameters.
        loss_fn: Callable `loss_fn(scores, y)` returning a scalar loss tensor.
        dl_train: Iterable of batches exposing `.text` and `.label`; must yield at least
            one batch per epoch.
        dl_test: Iterable of batches passed to `test_epoch` after every epoch.
        max_epochs: Number of epochs to run.
        num_batches: Upper bound on the number of training batches used per epoch.
        print_every: Print progress every this many epochs.
        save_path: If given, the whole model is saved there (via `torch.save`) whenever
            the evaluation accuracy improves on the best value seen so far.

    Returns:
        The `FitResult` whose `train_loss`, `train_acc`, `test_loss` and `test_acc`
        lists received one entry per epoch.
    """
    import os  # local import: only needed to create the checkpoint directory

    best_test_acc = 0
    res = FitResult(max_epochs, [], [], [], [])

    for epoch_idx in range(max_epochs):
        total_loss, num_correct = 0, 0
        num_samples = 0
        batches_done = 0
        start_time = time.time()

        # Do not rely on the caller (or on test_epoch) having left the model in training mode.
        model.train()

        for batch_idx, batch in enumerate(dl_train):
            X, y = batch.text, batch.label

            # Forward pass
            _, y_pred_log_proba = model(X)
            # Backward pass
            optimizer.zero_grad()
            loss = loss_fn(y_pred_log_proba, y)
            loss.backward()

            # Weight updates
            optimizer.step()

            # Calculate accuracy
            total_loss += loss.item()
            y_pred = torch.argmax(y_pred_log_proba, dim=1)
            num_samples += y_pred.shape[0]
            num_correct += torch.sum(y_pred == y).float().item()
            batches_done += 1

            if batch_idx == num_batches - 1:
                break

        curr_test_loss, curr_test_acc = test_epoch(
            model, loss_fn, dl_test, print_acc=(epoch_idx % print_every == 0))
        res.test_loss.append(curr_test_loss)
        res.test_acc.append(curr_test_acc)

        # Average over the batches that were actually processed. `num_batches` is only a
        # cap: when the loader yields fewer batches, dividing by the cap under-reports the loss.
        curr_train_loss = total_loss / batches_done
        curr_train_acc = num_correct / num_samples
        res.train_loss.append(curr_train_loss)
        res.train_acc.append(curr_train_acc)

        if epoch_idx % print_every == 0:
            print(f"Epoch #{epoch_idx}, loss={curr_train_loss:.3f}, accuracy={curr_train_acc:.3f}, elapsed={time.time()-start_time:.1f} sec")

        if save_path and curr_test_acc > best_test_acc:
            if epoch_idx % print_every == 0:
                print("---saving model ---")
            # Create the target directory on demand so a finished epoch is not lost to a
            # missing folder.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save(model, save_path)
            best_test_acc = curr_test_acc

    return res

In [None]:
def test_epoch(model, loss_fn, dataloader, print_acc=False):
    model.eval()
    total_loss = 0
    num_correct = 0 
    num_batches = 0
    num_samples = 0

    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            num_batches = batch_idx
            X, y = batch.text, batch.label

            _, y_test = model(X)
            loss = loss_fn(y_test, y)
            total_loss += loss.item()
            
            # Calculate accuracy
            y_pred = torch.argmax(y_test, dim=1)
            num_correct += torch.sum(y_pred == y).float().item()
            num_samples += y_pred.shape[0]

        num_batches += 1   
        
        test_loss = total_loss /(num_batches)
        test_acc = num_correct /(num_samples)
        if print_acc:
            print("Test Accuracy is ----", test_acc, "-----")

    model.train()
    return test_loss, test_acc

## Baseline Model - Sentiment Analysis using RNN - GRU

For the first part of our experiment, we train a GRU-based RNN as the baseline sentiment classifier.


In [None]:
# Hyperparameters of the baseline GRU model and of its optimizer.
BATCH_SIZE = 128
HIDDEN_SIZE = 100
NUM_LAYERS = 2
DROPOUT = 0.5
FREEZE_EMBEDDINGS = False

# Per-class weights intended for the loss function.
LOSS_WEIGHTS = [1., 1., 1.]

WEIGHT_DECAY = 0  # 1e-5
LEARNING_RATE = 2e-4
BETAS = (0.95, 0.98)  # found best for stable training; torch.optim.Adam's default is (0.9, 0.999)

In [None]:
# Batch iterators for the three splits; `device` is forwarded to the iterators.
dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=BATCH_SIZE,
    shuffle=True, device=device)
# Build the token and label vocabularies from the training split only.
# NOTE(review): token ids presumably follow the order of the vocabulary built here, while the
# rows of `embeddings` follow the GloVe file order (with '<pad>'/'<unk>' prepended) — confirm
# that the models map between the two, otherwise the pre-trained rows will not line up with
# the tokens they are looked up for.
review_parser.build_vocab(ds_train)
label_parser.build_vocab(ds_train)

In [None]:
# defining the baseline model
from RNN import SentimentGRU

model = SentimentGRU(embeddings,
                     hidden_size=HIDDEN_SIZE,
                     num_layers=NUM_LAYERS,
                     dropout=DROPOUT,
                     freeze_embedding=FREEZE_EMBEDDINGS).to(device)

optimizer = torch.optim.Adam(model.parameters(),
                             lr=LEARNING_RATE,
                             weight_decay=WEIGHT_DECAY,
                             betas=BETAS)

# Honour the LOSS_WEIGHTS hyperparameter (it was defined but never passed to the loss),
# mirroring how the self-attention setup builds its loss. With all weights equal to 1
# this is numerically the same as the unweighted NLLLoss.
loss_fn = torch.nn.NLLLoss(weight=torch.tensor(LOSS_WEIGHTS).to(device))


In [None]:
# Train the baseline; train() saves the model to save_path whenever the accuracy on the
# evaluation split improves.
# NOTE(review): the evaluation split passed here is dl_test, so checkpoint selection is driven
# by the test set, and dl_valid is not used in this cell — confirm this is intended.
fit_res = train(model, optimizer, loss_fn, dl_train, dl_test, max_epochs=100,
      num_batches=500, save_path = "./models/sentimentGRU.pt")

In [None]:
# Plot the baseline training results with the project's plotting helper.
plot.plot_fit(fit_res)

In [None]:
# Load the best model achieved during training.
# map_location keeps the load working when the checkpoint was written on a different device
# than the current one (e.g. saved on a GPU, reloaded on a CPU-only machine).
# NOTE: torch.load unpickles the file — only load checkpoints you created yourself.
model = torch.load("./models/sentimentGRU.pt", map_location=device)
print(model)

# Self Attention Model

In [None]:
# Self-attention hyperparameters:

BATCH_SIZE = 128 # best so far
NUM_HEADS = 1 # does not matter a lot
D_MODEL = 50 
TWO_LAYERS = False
FREEZE_EMB = False # passed as freeze_embedding; False presumably means the embeddings are fine-tuned
DROPOUT = 0.8 # best with the combination of weight decay
KQV_DROPOUT = 0 # no dropout there in the paper
DENSE_DROPOUT = 0

LOSS_WEIGHT = [1., 1., 1.] # per-class weights passed to NLLLoss

LEARNING_RATE = 1e-4  # best so far, don't change
WEIGHT_DECAY = 3e-3  # best so far, don't change
BETAS = (0.95, 0.98)  # found best for stable training; torch.optim.Adam's default is (0.9, 0.999)

## Run before training a new model from scratch:

In [None]:
from SelfAttention import SentimentSelfAttention

# Fresh batch iterators for the three splits, then (re)build both vocabularies
# from the training split.
dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test),
    batch_size=BATCH_SIZE,
    shuffle=True,
    device=device,
)
review_parser.build_vocab(ds_train)
label_parser.build_vocab(ds_train)

# Self-attention classifier configured from the hyperparameter cell.
model_attention = SentimentSelfAttention(
    embeddings,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT,
    kqv_dropout=KQV_DROPOUT,
    two_layers=TWO_LAYERS,
    freeze_embedding=FREEZE_EMB,
    dense_dropout=DENSE_DROPOUT,
).to(device)

att_optimizer = torch.optim.Adam(
    model_attention.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    betas=BETAS,
)

# Class-weighted negative log-likelihood, with the weights living on the training device.
loss_fn = torch.nn.NLLLoss(weight=torch.tensor(LOSS_WEIGHT, device=device))

# Count only the parameters that will receive gradient updates.
print("trainable params:",
      sum(p.numel() for p in filter(lambda p: p.requires_grad, model_attention.parameters())))


Run only when training a model that has been saved:

In [None]:
# Train the self-attention model; train() saves the model to save_path whenever the accuracy
# on the evaluation split improves.
# NOTE(review): as with the baseline, the evaluation split is dl_test — confirm this is intended.
Attention_res = train(model_attention, att_optimizer, loss_fn, dl_train, dl_test, max_epochs=200,
          num_batches=500, save_path="./models/selfAttentionGlove200.pt")

In [None]:
# Reload the saved self-attention model and create a fresh optimizer for its parameters.
# map_location keeps the load working when the checkpoint was written on a different device
# than the current one (e.g. saved on a GPU, reloaded on a CPU-only machine).
# NOTE: torch.load unpickles the file — only load checkpoints you created yourself.
model_attention = torch.load("./models/selfAttentionGlove200.pt", map_location=device)
att_optimizer = torch.optim.Adam(model_attention.parameters(), lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY, betas=BETAS)
# Optional tweaks when continuing training (left disabled):
# loss_fn = torch.nn.NLLLoss(weight=torch.tensor([1., 1., 2.]).to(device))
# model_attention.embedding_layer.weight.requires_grad = False

In [None]:
# Plot the self-attention training results with the project's plotting helper.
plot.plot_fit(Attention_res)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix, ConfusionMatrixDisplay

# get the best predictions from the model
def get_preds_and_labels(model, dl, device):
    """Collect the predicted class and the ground-truth label of every sample in `dl`.

    Args:
        model: Module whose call returns a pair; the second element holds the per-class
            scores (argmax over dim 1 is taken as the prediction).
        dl: Iterable of batches, each unpackable as `(X, y)`.
        device: Device on which the returned tensors are placed.

    Returns:
        `(all_preds, GT_labels)`: two 1-D tensors, in iteration order. Predictions are
        integer class indices (torch.long); labels keep the dtype supplied by `dl`.
        Both are empty torch.long tensors when `dl` yields no batches.

    The model is switched to eval mode while predicting and is always put back into
    training mode afterwards, including when prediction raises.
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    try:
        # No gradients are needed for inference, whether or not the caller disabled them.
        with torch.no_grad():
            for X, y in dl:
                _, scores = model(X)
                pred_chunks.append(torch.argmax(scores, dim=1))
                label_chunks.append(y)
    finally:
        model.train()

    if not pred_chunks:
        empty = torch.empty(0, dtype=torch.long, device=device)
        return empty, empty.clone()

    # Concatenate once at the end: this keeps the integer dtype (the previous float
    # accumulator turned class indices into floats) and avoids re-copying on every batch.
    all_preds = torch.cat(pred_chunks, dim=0).to(device)
    GT_labels = torch.cat(label_chunks, dim=0).to(device)
    return all_preds, GT_labels

# Imported here because this cell is the only place that needs it.
from sklearn.metrics import confusion_matrix

with torch.no_grad():
    # baseline VS SelfAttention predictions on the test set
    bl_best_preds, bl_GT_preds = get_preds_and_labels(model, dl_test, device)
    se_best_preds, se_GT_preds = get_preds_and_labels(model_attention, dl_test, device)

# Now lets demonstrate the confusion matrix of the test set.
# NOTE(review): assumes label ids 0/1/2 correspond to these names in this order — confirm
# against label_parser.vocab.
class_names = ['Positive','Negative','Neutral']
# Fix the label set so the matrix is always len(class_names) square, even if a class is
# missing from both the labels and the predictions.
class_ids = list(range(len(class_names)))

fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(25, 10))

# Ground truth goes in as y_true and the model output as y_pred. Previously the two were
# passed in swapped positions through a fake estimator, which transposed the matrices
# relative to the "True label" / "Predicted label" axes.
for ax, title, y_true, y_pred in (
    (ax1, 'Baseline Confusion Matrix\n', bl_GT_preds, bl_best_preds),
    (ax2, 'Self-Attention Confusion Matrix\n', se_GT_preds, se_best_preds),
):
    cm = confusion_matrix(y_true.cpu().long().numpy(),
                          y_pred.cpu().long().numpy(),
                          labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues, ax=ax)
    disp.ax_.set_title(title)

# normalize='true',    

## Experiments:


Test the effect of model size:

In [None]:
# Train one self-attention model per width and plot each run.
for d_model in [50, 70, 100, 150, 200]:
    dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
        (ds_train, ds_valid, ds_test), batch_size=BATCH_SIZE,
        shuffle=True, device=device)
    review_parser.build_vocab(ds_train)
    label_parser.build_vocab(ds_train)

    # Use the loop variable: passing the D_MODEL constant made every iteration train the
    # same model size, so the sweep compared nothing.
    model_attention = SentimentSelfAttention(embeddings,
                                             d_model=d_model,
                                             num_heads=NUM_HEADS,
                                             dropout=DROPOUT,
                                             kqv_dropout=KQV_DROPOUT,
                                             two_layers=TWO_LAYERS,
                                             freeze_embedding=FREEZE_EMB).to(device)

    att_optimizer = torch.optim.Adam(model_attention.parameters(), lr=LEARNING_RATE,
                                     weight_decay=WEIGHT_DECAY, betas=BETAS)

    loss_fn = torch.nn.NLLLoss(weight=torch.tensor([1., 1., 1.5]).to(device))

    # Label each figure with the width it belongs to.
    plot.plot_fit(train(model_attention, att_optimizer, loss_fn, dl_train, dl_test, max_epochs=81,
          num_batches=500, print_every=10), legend="d-model = "+str(d_model))

Test different numbers of heads:

In [None]:
# Train one self-attention model per head count and plot each run.
# NOTE(review): if SentimentSelfAttention requires d_model to be divisible by num_heads,
# D_MODEL has to be chosen accordingly (50 is not divisible by 4 or 20) — confirm against
# the model implementation.
for h in [1, 2, 4, 10, 20]:
    dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
        (ds_train, ds_valid, ds_test), batch_size=BATCH_SIZE,
        shuffle=True, device=device)
    review_parser.build_vocab(ds_train)
    label_parser.build_vocab(ds_train)

    # Use the loop variable: passing the NUM_HEADS constant made every iteration train
    # the same configuration, so the sweep compared nothing.
    model_attention = SentimentSelfAttention(embeddings,
                                             d_model=D_MODEL,
                                             num_heads=h,
                                             dropout=DROPOUT,
                                             kqv_dropout=KQV_DROPOUT,
                                             two_layers=TWO_LAYERS,
                                             freeze_embedding=FREEZE_EMB).to(device)

    att_optimizer = torch.optim.Adam(model_attention.parameters(), lr=LEARNING_RATE,
                                     weight_decay=WEIGHT_DECAY, betas=BETAS)

    loss_fn = torch.nn.NLLLoss(weight=torch.tensor([1., 1., 1.]).to(device))

    plot.plot_fit(train(model_attention, att_optimizer, loss_fn, dl_train, dl_test, max_epochs=81,
          num_batches=500, print_every=10))

## Utils

In [None]:
# Print the gradient of every trainable parameter, keyed by its qualified name.
# `name` from named_parameters() is the identifier to show; `p.name` is not the
# parameter's name in the module.
for name, p in model.named_parameters():
    if p.requires_grad:
        print(name, p.grad)

In [None]:
# Inspect the current attention model and the class weights of the active loss function.
print(model_attention)
print(loss_fn.weight)

In [None]:
# Drop the notebook's reference to the attention model and run a garbage-collection pass.
# NOTE: `del` raises NameError if model_attention is not currently defined (for example when
# this cell is executed twice in a row).
import gc
del model_attention
#del model
gc.collect()

Test the effect of model size:

In [None]:
# Train one self-attention model per width and plot each run, labelled with its width.
for d_model in [50, 70, 100, 150, 200]:
    dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
        (ds_train, ds_valid, ds_test), batch_size=BATCH_SIZE,
        shuffle=True, device=device)
    review_parser.build_vocab(ds_train)
    label_parser.build_vocab(ds_train)

    # Use the loop variable: passing the D_MODEL constant made every iteration train the
    # same model size while the legend claimed otherwise.
    model_attention = SentimentSelfAttention(embeddings,
                                             d_model=d_model,
                                             num_heads=NUM_HEADS,
                                             dropout=DROPOUT,
                                             kqv_dropout=KQV_DROPOUT,
                                             two_layers=TWO_LAYERS,
                                             freeze_embedding=FREEZE_EMB).to(device)

    att_optimizer = torch.optim.Adam(model_attention.parameters(), lr=LEARNING_RATE,
                                     weight_decay=WEIGHT_DECAY, betas=BETAS)

    loss_fn = torch.nn.NLLLoss(weight=torch.tensor([1., 1., 1.5]).to(device))

    plot.plot_fit(train(model_attention, att_optimizer, loss_fn, dl_train, dl_test, max_epochs=81,
          num_batches=500, print_every=10), legend="d-model = "+str(d_model))

Test different numbers of heads:

In [None]:
# Train one self-attention model per head count and plot each run.
# NOTE(review): if SentimentSelfAttention requires d_model to be divisible by num_heads,
# D_MODEL has to be chosen accordingly (50 is not divisible by 4 or 20) — confirm against
# the model implementation.
for h in [1, 2, 4, 10, 20]:
    dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
        (ds_train, ds_valid, ds_test), batch_size=BATCH_SIZE,
        shuffle=True, device=device)
    review_parser.build_vocab(ds_train)
    label_parser.build_vocab(ds_train)

    # Use the loop variable: passing the NUM_HEADS constant made every iteration train
    # the same configuration, so the sweep compared nothing.
    model_attention = SentimentSelfAttention(embeddings,
                                             d_model=D_MODEL,
                                             num_heads=h,
                                             dropout=DROPOUT,
                                             kqv_dropout=KQV_DROPOUT,
                                             two_layers=TWO_LAYERS,
                                             freeze_embedding=FREEZE_EMB).to(device)

    att_optimizer = torch.optim.Adam(model_attention.parameters(), lr=LEARNING_RATE,
                                     weight_decay=WEIGHT_DECAY, betas=BETAS)

    loss_fn = torch.nn.NLLLoss(weight=torch.tensor([1., 1., 1.]).to(device))

    plot.plot_fit(train(model_attention, att_optimizer, loss_fn, dl_train, dl_test, max_epochs=81,
          num_batches=500, print_every=10))

## Utils

In [None]:
# Print the gradient of every trainable parameter, keyed by its qualified name.
# `name` from named_parameters() is the identifier to show; `p.name` is not the
# parameter's name in the module.
for name, p in model.named_parameters():
    if p.requires_grad:
        print(name, p.grad)

In [None]:
# Inspect the current attention model and the class weights of the active loss function.
print(model_attention)
print(loss_fn.weight)

In [None]:
# Drop the notebook's reference to the attention model and run a garbage-collection pass.
# NOTE: `del` raises NameError if model_attention is not currently defined (for example when
# this cell is executed twice in a row).
import gc
del model_attention
#del model
gc.collect()