<a href="https://colab.research.google.com/github/sunny9sinha/TSAI_Session_7/blob/main/Session7_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import sys
import pandas as pd
import numpy as np
import csv

In [3]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [60]:
sentences = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Session5/stanfordSentimentTreebank/datasetSentences.txt',sep='\t')
sentences.head()

Unnamed: 0,sentence_index,sentence
0,1,The Rock is destined to be the 21st Century 's...
1,2,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha..."


In [None]:
sentences.shape

(11855, 2)

In [61]:
sentiment = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Session5/stanfordSentimentTreebank/sentiment_labels.txt',sep='|',skiprows=[0],names=['phrase_id','sentiment_values'])
sentiment.head()

Unnamed: 0,phrase_id,sentiment_values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [62]:
sentiment_class = []
for i in sentiment['sentiment_values'] :
  if i >=0 and i<0.2:
    sentiment_class.append(1)
  elif i>=0.2 and i<0.4:
    sentiment_class.append(2)
  elif i>=0.4 and i<0.6:
    sentiment_class.append(3)
  elif i>=0.6 and i<0.8:
    sentiment_class.append(4)
  else:
    sentiment_class.append(5)

sentiment['sentiment_values'] = sentiment_class


In [63]:
sentiment.describe()
#sentiment = pd.DataFrame(sentiment)

Unnamed: 0,phrase_id,sentiment_values
count,239232.0,239232.0
mean,119615.5,3.062391
std,69060.474137,0.911298
min,0.0,1.0
25%,59807.75,3.0
50%,119615.5,3.0
75%,179423.25,4.0
max,239231.0,5.0


In [64]:
dictionary = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Session5/stanfordSentimentTreebank/dictionary.txt',sep='|', names=['phrase','phrase_id'])
dictionary.head()

Unnamed: 0,phrase,phrase_id
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [65]:
def pre_process_sentences(string):
  string=string.replace('-LRB-','(')
  string=string.replace('-RRB-',')')
  string=string.replace('Â', '')
  string=string.replace('Ã©', 'e')
  string=string.replace('Ã¨', 'e')
  string=string.replace('Ã¯', 'i')
  string=string.replace('Ã³', 'o')
  string=string.replace('Ã´', 'o')
  string=string.replace('Ã¶', 'o')
  string=string.replace('Ã±', 'n')
  string=string.replace('Ã¡', 'a')
  string=string.replace('Ã¢', 'a')
  string=string.replace('Ã£', 'a')
  string=string.replace('\xc3\x83\xc2\xa0', 'a')
  string=string.replace('Ã¼', 'u')
  string=string.replace('Ã»', 'u')
  string=string.replace('Ã§', 'c')
  string=string.replace('Ã¦', 'ae')
  string=string.replace('Ã­', 'i')
  string=string.replace('\xa0', ' ')
  string=string.replace('\xc2', '')
  return string

In [66]:
sentences['sentence'] = sentences['sentence'].apply(pre_process_sentences)

In [67]:
def pre_process_phrases(string):
    string=string.replace('é','e')
    string=string.replace('è','e')
    string=string.replace('ï','i')
    string=string.replace('í','i')
    string=string.replace('ó','o')
    string=string.replace('ô','o')
    string=string.replace('ö','o')
    string=string.replace('á','a')
    string=string.replace('â','a')
    string=string.replace('ã','a')
    string=string.replace('à','a')
    string=string.replace('ü','u')
    string=string.replace('û','u')
    string=string.replace('ñ','n')
    string=string.replace('ç','c')
    string=string.replace('æ','ae')
    string=string.replace('\xa0', ' ')
    string=string.replace('\xc2', '')    
    return string

In [68]:
dictionary['phrase'] = dictionary['phrase'].apply(pre_process_phrases)

In [71]:
dataset = pd.merge(sentiment,dictionary,on='phrase_id')
dataset = dataset.drop(columns=['phrase_id'])
print(dataset.head())

   sentiment_values               phrase
0                 3                    !
1                 3                    '
2                 3                  ' (
3                 3    ' ( the cockettes
4                 3  ' ( the cockettes )


In [73]:
dataset_1 = pd.merge(left = dataset, right = sentences,left_on='phrase',right_on='sentence')
dataset_1 = dataset_1.drop(columns=['phrase'])
dataset_1.rename(columns = {'sentiment_values':'sentiments'}, inplace = True)
print(dataset_1.shape)
print(dataset_1.head())
print(dataset.columns)


(11854, 3)
   sentiments  ...                                           sentence
0           1  ...              ... a bland murder-on-campus yawner .
1           2  ...  ... a hollow joke told by a cinematic gymnast ...
2           3  ...  ... the picture 's cleverness is ironically mu...
3           5  ...      classic cinema served up with heart and humor
4           2  ...              entertaining enough , but nothing new

[5 rows x 3 columns]
Index(['sentiment_values', 'phrase'], dtype='object')


In [74]:
def smallCase(data):
  for i in data.index:
    data['sentence'][i] = data['sentence'][i].lower()
  return data

In [75]:
import re
def cleanText(data):
  data_small_case = smallCase(data)
  for i in data_small_case.index:
    data_small_case.sentence[i] = re.sub("[^-9A-Za-z ]", "" , data_small_case.sentence[i])

  return data_small_case

dataset = cleanText(dataset_1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [77]:
dataset = dataset.drop(columns=['sentence_index'])
dataset.head(10)

Unnamed: 0,sentiments,sentence
0,1,a bland murder-on-campus yawner
1,2,a hollow joke told by a cinematic gymnast hav...
2,3,the picture s cleverness is ironically muted ...
3,5,classic cinema served up with heart and humor
4,2,entertaining enough but nothing new
5,5,insightfully written delicately performed
6,1,ordinary melodrama that is heavy on religious ...
7,5,a roller-coaster ride of a movie
8,5,there are enough moments of heartbreaking hon...
9,4,friends couples miles and all the pabst ...


In [78]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.3)
train = train.reset_index().drop(columns=['index'])
test = test.reset_index().drop(columns=['index'])

In [79]:
print(train.shape)
print(test.shape)

(8297, 2)
(3557, 2)


In [80]:
test.sentence[1:10]

1    niccol the filmmaker merges his collaborators ...
2                                       it s a trifle 
3    it aimlessly and unsuccessfully attempts to fu...
4                                      fun and nimble 
5                       originality is sorely lacking 
6                                  viva le resistance 
7                        a well-executed spy-thriller 
8    the inherent limitations of using a video game...
9    it s a rollicking adventure for you and all yo...
Name: sentence, dtype: object

In [81]:
# Import Library
import random
import torch, torchtext
from torchtext import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd1c1873690>

In [82]:
sentence = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
sentiments = torchtext.legacy.data.Field(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [83]:
fields = [('sentence', sentence),('sentiments',sentiments)]

In [84]:
example = [torchtext.legacy.data.Example.fromlist([train.sentence[i],train.sentiments[i]], fields) for i in range(train.shape[0])] 

In [85]:
trainDataset = torchtext.legacy.data.Dataset(example, fields)

In [86]:
example = [torchtext.legacy.data.Example.fromlist([test.sentence[i],test.sentiments[i]], fields) for i in range(test.shape[0])] 

In [87]:
testDataset = torchtext.legacy.data.Dataset(example, fields)

In [88]:
vars(trainDataset.examples[10])

{'sentence': ['a',
  'visually',
  'flashy',
  'but',
  'narratively',
  'opaque',
  'and',
  'emotionally',
  'vapid',
  'exercise',
  'in',
  'style',
  'and',
  'mystification'],
 'sentiments': 2}

In [89]:
sentence.build_vocab(trainDataset)
sentiments.build_vocab(trainDataset)

In [90]:
print('Size of input vocab : ', len(sentence.vocab))
print('Size of label vocab : ', len(sentiments.vocab))
print('Top 10 words appreared repeatedly :', list(sentence.vocab.freqs.most_common(10)))
print('Labels : ', sentiments.vocab.stoi)

Size of input vocab :  15092
Size of label vocab :  6
Top 10 words appreared repeatedly : [(' ', 9361), ('the', 7219), ('a', 5155), ('and', 4418), ('of', 4337), ('to', 3006), ('-', 2739), ('s', 2512), ('is', 2489), ('it', 2446)]
Labels :  defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7fd1652bdad0>>, {'<unk>': 0, 4: 1, 2: 2, 3: 3, 5: 4, 1: 5})


In [91]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [92]:
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits((trainDataset, testDataset), batch_size = 32, 
                                                            sort_key = lambda x: len(x.sentence),
                                                            sort_within_batch=True, device = device)

In [93]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(sentence.vocab.stoi, tokens)

In [94]:
# Define hyperparameters
size_of_vocab = len(sentence.vocab)
embedding_dim = 200
num_hidden_nodes = 300
num_output_nodes = 6
num_layers = 4
dropout = 0.4

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [95]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(15092, 200)
  (encoder): LSTM(200, 300, num_layers=4, batch_first=True, dropout=0.4)
  (fc): Linear(in_features=300, out_features=6, bias=True)
)
The model has 5,789,806 trainable parameters


In [96]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def categorical_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [97]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        # retrieve text and no. of words
        sentence, sentence_lengths = batch.sentence  
        # convert to 1D tensor
        predictions = model(sentence, sentence_lengths).squeeze()  
        # compute the loss
        loss = criterion(predictions, batch.sentiments)        
        # compute the binary accuracy
        acc = categorical_accuracy(predictions, batch.sentiments)   
        # backpropage the loss and compute the gradients
        loss.backward()       
        # update the weights
        optimizer.step()      
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [98]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            sentence, sentence_lengths = batch.sentence
            
            # convert to 1d tensor
            predictions = model(sentence, sentence_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.sentiments)
            acc = categorical_accuracy(predictions, batch.sentiments)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [99]:
N_EPOCHS = 10
best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    
    # save the best model
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}% \n')

	Train Loss: 1.741 | Train Acc: 26.47%
	 Test. Loss: 1.715 |  Test. Acc: 30.17% 

	Train Loss: 1.706 | Train Acc: 32.45%
	 Test. Loss: 1.724 |  Test. Acc: 29.19% 

	Train Loss: 1.681 | Train Acc: 35.17%
	 Test. Loss: 1.694 |  Test. Acc: 32.52% 

	Train Loss: 1.651 | Train Acc: 38.59%
	 Test. Loss: 1.680 |  Test. Acc: 34.54% 

	Train Loss: 1.623 | Train Acc: 41.69%
	 Test. Loss: 1.681 |  Test. Acc: 34.20% 

	Train Loss: 1.594 | Train Acc: 45.05%
	 Test. Loss: 1.669 |  Test. Acc: 35.90% 

	Train Loss: 1.559 | Train Acc: 49.01%
	 Test. Loss: 1.677 |  Test. Acc: 33.77% 

	Train Loss: 1.526 | Train Acc: 52.89%
	 Test. Loss: 1.668 |  Test. Acc: 35.68% 

	Train Loss: 1.489 | Train Acc: 56.87%
	 Test. Loss: 1.663 |  Test. Acc: 36.29% 

	Train Loss: 1.456 | Train Acc: 60.22%
	 Test. Loss: 1.681 |  Test. Acc: 34.09% 



In [100]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_sentence(sentence):
    
    #categories = {0: "very negative", 1:"negative", 2:"neutral", 3:"positive", 4:"very positive"}
    #'<unk>': 0, 3: 1, 2: 2, 4: 3, 5: 4, 1: 5
    categories = {0: "unknown", 1:"neutral", 2:"negative", 3:"positive", 4:"very positive", 5:"very negative"}
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [101]:
for i in range(10):
  print(test.sentence[i])
  print("predicted: ", classify_sentence(test.sentence[i]))
  print("actual: ", test.sentiments[i])

at the film s centre is a precisely layered performance by an actor in his mid-seventies  michel piccoli 
predicted:  very positive
actual:  3
niccol the filmmaker merges his collaborators  symbolic images with his words  insinuating  for example  that in hollywood  only god speaks to the press
predicted:  negative
actual:  4
it s a trifle 
predicted:  very positive
actual:  3
it aimlessly and unsuccessfully attempts to fuse at least three dull plots into one good one 
predicted:  negative
actual:  1
fun and nimble 
predicted:  neutral
actual:  5
originality is sorely lacking 
predicted:  negative
actual:  2
viva le resistance 
predicted:  negative
actual:  4
a well-executed spy-thriller 
predicted:  positive
actual:  5
the inherent limitations of using a video game as the source material movie are once again made all too clear in this schlocky horroraction hybrid 
predicted:  negative
actual:  2
it s a rollicking adventure for you and all your mateys  regardless of their ages 
predict