In [24]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the dataset & Pre trained word embeddings

In [25]:
import os
import torch
import torchtext.data
import torchtext.datasets
import numpy as np 
import time

from RNN import SentimentGRU

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device =", device)

device = cuda


load the data set:

In [26]:

data_dir = os.path.expanduser('~/.pytorch-datasets')

# torchtext Field objects parse text (e.g. a review) and create a tensor representation

# This Field object will be used for tokenizing the movie reviews text
review_parser = torchtext.data.Field(
    sequential=True, use_vocab=True, lower=True,
    init_token='<sos>', eos_token='<eos>', dtype=torch.long,
    tokenize='spacy', tokenizer_language='en_core_web_sm'
)

# This Field object converts the text labels into numeric values (0,1,2)
label_parser = torchtext.data.Field(
    is_target=True, sequential=False, unk_token=None, use_vocab=True
)

# Load SST, tokenize the samples and labels
# ds_X are Dataset objects which will use the parsers to return tensors
ds_train, ds_valid, ds_test = torchtext.datasets.SST.splits(
    review_parser, label_parser, root=data_dir
)

n_train = len(ds_train)
print(f'Number of training   samples: {n_train}')
print(f'Number of validation samples: {len(ds_valid)}')
print(f'Number of test       samples: {len(ds_test)}')

Number of training   samples: 8544
Number of validation samples: 1101
Number of test       samples: 2210


As required, we'll use the pre-trained word embeddings of glove 6B.

In [27]:
#Vocabulary size is 40k, Embedding chosen size in 50
vocab, embeddings = [],[]
with open('./GloVe/glove.6B.50d.txt','rt',encoding='utf8') as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)
    

add padding and unknown tokens to the embeddings array:

In [28]:
# Add the padding and the unknown tokens to the vocab and embeddings arrays

vocab = np.array(vocab) 
embeddings = np.array(embeddings)
vocab = np.insert(vocab, 0, '<pad>')
vocab = np.insert(vocab, 1, '<unk>')

unk_emb = np.mean(embeddings, axis=0, keepdims=True)
pad_emb = np.zeros_like(embeddings[0]).reshape(1,-1)


embeddings = np.vstack((pad_emb, unk_emb, embeddings))

print(vocab[:10])

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']


## Baseline Model - Sentiment Analysis using RNN - LSTM

As for the first part in our experiment




In [29]:
def train(model, optimizer, loss_fn, dataloader, max_epochs=100,
          num_batches=400, save_path=None):
    train_losses = []
    train_acc = []
    test_losses = []
    test_acc = []
    
    for epoch_idx in range(max_epochs):
        total_loss, num_correct = 0, 0
        start_time = time.time()

        for batch_idx, batch in enumerate(dataloader):
            X, y = batch.text, batch.label

            # Forward pass
            _, y_pred_log_proba = model(X)

            # Backward pass
            optimizer.zero_grad()
            loss = loss_fn(y_pred_log_proba, y)
            loss.backward()

            # Weight updates
            optimizer.step()

            # Calculate accuracy
            total_loss += loss.item()
            y_pred = torch.argmax(y_pred_log_proba, dim=1)
            num_correct += torch.sum(y_pred == y).float().item()

            if batch_idx == num_batches-1:
                break
        
        curr_train_loss = total_loss /(num_batches)
        curr_train_acc = num_correct /(num_batches*BATCH_SIZE)
        train_losses.append(curr_train_loss)
        train_acc.append(curr_train_acc)
        
        
        print(f"Epoch #{epoch_idx}, loss={curr_train_loss:.3f}, accuracy={curr_train_acc:.3f}, elapsed={time.time()-start_time:.1f} sec")
        
        if save_path:
            torch.save(model, save_path)

In [30]:
EMBEDDING_DIM = 50
BATCH_SIZE = 32
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT = 0
LEARNING_RATE = 1e-3

In [31]:
dl_train, dl_valid, dl_test = torchtext.data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=BATCH_SIZE,
    shuffle=True, device=device)
review_parser.build_vocab(ds_train)
label_parser.build_vocab(ds_train)

In [35]:

model = SentimentGRU(embeddings, hidden_size=HIDDEN_SIZE,
                     num_layers=NUM_LAYERS, dropout=DROPOUT).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

loss_fn = torch.nn.NLLLoss()


In [33]:
model = torch.load("./models/sentimentGRU.pt")
print(model)

SentimentGRU(
  (embedding_layer): Embedding(400002, 50)
  (gru): GRU(50, 128)
  (dense_linear): Linear(in_features=128, out_features=3, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)


In [36]:
train(model, optimizer, loss_fn, dl_train, max_epochs=100,
      num_batches=500, save_path = "./models/sentimentGRU.pt")

Epoch #0, loss=0.561, accuracy=0.221, elapsed=1.5 sec
Epoch #1, loss=0.560, accuracy=0.223, elapsed=1.3 sec
Epoch #2, loss=0.560, accuracy=0.224, elapsed=1.3 sec
Epoch #3, loss=0.560, accuracy=0.225, elapsed=1.3 sec
Epoch #4, loss=0.560, accuracy=0.225, elapsed=1.3 sec
Epoch #5, loss=0.560, accuracy=0.226, elapsed=1.4 sec
Epoch #6, loss=0.560, accuracy=0.223, elapsed=1.4 sec
Epoch #7, loss=0.560, accuracy=0.222, elapsed=1.3 sec
Epoch #8, loss=0.557, accuracy=0.229, elapsed=1.3 sec
Epoch #9, loss=0.554, accuracy=0.242, elapsed=1.3 sec
Epoch #10, loss=0.548, accuracy=0.252, elapsed=1.3 sec
Epoch #11, loss=0.536, accuracy=0.270, elapsed=1.3 sec
Epoch #12, loss=0.513, accuracy=0.295, elapsed=1.3 sec
Epoch #13, loss=0.475, accuracy=0.322, elapsed=1.3 sec
Epoch #14, loss=0.413, accuracy=0.355, elapsed=1.4 sec
Epoch #15, loss=0.336, accuracy=0.391, elapsed=1.4 sec
Epoch #16, loss=0.269, accuracy=0.420, elapsed=1.4 sec
Epoch #17, loss=0.208, accuracy=0.449, elapsed=1.3 sec
Epoch #18, loss=0.13