In [1]:
pip install torchsummary 

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import re
import string
from time import time 

import torch
import torch.nn as nn
from tqdm import tqdm
from torchsummary import summary

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

### IMPORTING THE DATASET

In [3]:
nltk.download('stopwords')
punct = string.punctuation
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data = pd.read_csv("/kaggle/input/imdb-movie-reviews/imdb.csv")     
data.head(10)

Unnamed: 0,text,polarity
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1
5,"memory ""The Last Hunt"" stuck since saw 1956 13...",1
6,"Shakespeare fan, appreciate Ken Branagh done b...",0
7,privilege watching Scarface big screen beautif...,1
8,real classic. shipload sailors trying get town...,1
9,Serials short subjects originally shown theate...,1


In [5]:
negative_data = data[data['polarity']==0]
positive_data = data[data['polarity']==1]
dataset = pd.concat([negative_data[:50], positive_data[:50]], axis=0)

In [6]:
dataset

Unnamed: 0,text,polarity
1,"Put aside Dr. House repeat missed, Desperate H...",0
3,watched horrid thing TV. Needless say one movi...,0
6,"Shakespeare fan, appreciate Ken Branagh done b...",0
10,strange sex comedy there`s little comedy whole...,0
11,"many problems film, worst continuity; re-editi...",0
...,...,...
81,Summer Phoenix great performance really feel s...,1
83,Domini Enfilren (Marlene Dietrich) spent life ...,1
84,"movie all, action, fighting, dancing, bull rid...",1
85,"enjoyed watching Cliffhanger, beginning woman ...",1


**I will perform the task only on 50 positive and 50 negative tweets, so that my notebook doesn't run out of memory allocated to the sparse vectors**

### TEXT PREPOCESSSING AND VOCAB BUILDING
(I am ignoring for the specific requirements of preprocessing for this task)

In [7]:
def preprocessing(string, stopwords, stemmer):
    '''We can do all the preprocessing in just one step by creating a pipeline
    First, we have to make all the words in lowercase,
    then we have to tokenize the string,
    then we have to remove stopwords and 
    finally we have to stem all the words.
    This is how it will be ready to be analyzed further'''
    string  = string.lower()
    tokens = re.split('\s|(?<!\d)[,.](?!\d)', string)
    clean_tokens = []
    for word in tokens:
        if word not in stopwords:
            clean_tokens.append(word)
    
    stemmed_words = []
    for word in clean_tokens:
        stemmed_words.append(stemmer.stem(word))
    
    preprocessed_array = []
    for word in stemmed_words:
        if word!='':
            preprocessed_array.append(word)
            
    return preprocessed_array
#########################################
dataset['preprocessed'] = dataset['text'].apply(lambda x:preprocessing(x, stopwords_english, stemmer))    
dataset.head(10)

Unnamed: 0,text,polarity,preprocessed
1,"Put aside Dr. House repeat missed, Desperate H...",0,"[put, asid, dr, hous, repeat, miss, desper, ho..."
3,watched horrid thing TV. Needless say one movi...,0,"[watch, horrid, thing, tv, needless, say, one,..."
6,"Shakespeare fan, appreciate Ken Branagh done b...",0,"[shakespear, fan, appreci, ken, branagh, done,..."
10,strange sex comedy there`s little comedy whole...,0,"[strang, sex, comedi, there`, littl, comedi, w..."
11,"many problems film, worst continuity; re-editi...",0,"[mani, problem, film, worst, continuity;, re-e..."
12,Rosie wasted lot TV time talking Tainos super ...,0,"[rosi, wast, lot, tv, time, talk, taino, super..."
20,"awful, awful! old room mate used watch junk dr...",0,"[aw, awful!, old, room, mate, use, watch, junk..."
24,Filmfour going lot better little snot film the...,0,"[filmfour, go, lot, better, littl, snot, film,..."
25,"60s (1999) D: Mark Piznarski. Josh Hamilton, J...",0,"[60, (1999), d:, mark, piznarski, josh, hamilt..."
26,"show suck? Unfortunately, really question, dou...",0,"[show, suck?, unfortun, realli, question, doub..."


In [8]:
def build_frequency(processed_strings, polarities):
    vocab = {}
    for string, polarity in zip(processed_strings, polarities):
        for word in string:
            pair = (word, polarity)
            if pair in vocab:
                vocab[pair]+=1
            else:
                vocab[pair] = 1
    return vocab
#####################################
vocab = build_frequency(dataset['preprocessed'], dataset['polarity'])

**First we have to figure out word2indices and indices2word dictionary for one hot encodings and form the final data**

In [9]:
def find_unique_words(vocab):
    unique_words = set()
    for key in vocab.keys():
        unique_words.add(key[0])
    return unique_words


unique_words = find_unique_words(vocab)
###################################
def word2ind_ind2word(unique_words):
    # 
    word2ind = {}
    ind2word  ={}
    for index, word in enumerate(unique_words):
        word2ind[word] = index
        ind2word[index] = word
    return word2ind, ind2word

word2ind, ind2word = word2ind_ind2word(unique_words)
########################################    

In [10]:
len(unique_words), len(word2ind), len(ind2word)

(4078, 4078, 4078)

**Now we have to form the data, first we have to form the data for a single string with the help of sliding windows and word2index dictionary**

In [11]:
# hyperparameters
C=2
intermediate_length = 300
one_hot_length = len(unique_words)

def data_for_one_processed_tweet(processed_tweet, word2ind=word2ind, in2word=ind2word, C=2, one_hot_length=one_hot_length):    
    # string will be a one token list
    data_x = []
    data_y = []
    for i in range(C, len(processed_tweet)-C):
        x = np.zeros(one_hot_length, dtype=np.float32)
        temp_list = processed_tweet[i-C:i] + processed_tweet[i+1:i+C+1]
        for word in temp_list:
           x[word2ind[word]]+= 1
        x/=(2*C)
        data_x.append(x)
        data_y.append(word2ind[processed_tweet[i]])
    return data_x, data_y

######################################
def full_data(processed_tweets):
    data_x = []
    data_y = []
    for processed_tweet in processed_tweets:
        x,y = data_for_one_processed_tweet(processed_tweet)
        data_x+= x
        data_y+= y
    data_x = np.array(data_x, dtype=np.float32)
    data_y = np.array(data_y, dtype=np.int32)
    data_y = data_y.reshape(data_y.shape[0], 1)
    return np.concatenate((data_x, data_y), axis=1) 

### **Model for CBOW**
* Now for capturing the context of the words we will apply the cbow model where c=2 and take the intermediate length of the vectors to be 300

In [12]:
class CBOW(nn.Module):
    def __init__(self, length, intermediate_len):
        super(CBOW, self).__init__()
        layers = []
        layers.append(nn.Linear(in_features=length, out_features=intermediate_len))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(in_features=intermediate_len, out_features=length))
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.model(x)

In [13]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
cbow = CBOW(length=len(unique_words), intermediate_len=300).to(device)
summary(cbow, (len(unique_words), ))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 300]       1,223,700
              ReLU-2                  [-1, 300]               0
            Linear-3                 [-1, 4078]       1,227,478
Total params: 2,451,178
Trainable params: 2,451,178
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 0.04
Params size (MB): 9.35
Estimated Total Size (MB): 9.40
----------------------------------------------------------------


### **UTILS**

In [14]:
def save_checkpoint(state, filename='my_checkpoint.pth.tar'):
    # will save model and optimizer params at every epoch
    print("-> Saving CheckPoint")
    torch.save(state, filename)

In [15]:
def load_checkpoint(checkpoint, model):
    # it will just load, we can train it further, make changes to the architecture
    # and simply use it to predict
    print("-> Loading CheckPoint")
    model.load_state_dict(checkpoint["state_dict"])

In [16]:
def train(loader, model, optimizer, loss_fn, scaler, device=device):
    '''
    it is the training procedure for one epoch of the network
    '''
    num_batches = len(loader)
    batches = tqdm(loader) # tqdm will be used to generate progress bars
    acc = 0
    for idx, batch in enumerate(batches, 0):
        inp = batch[:, :-1].type(torch.float32).to(device)  # shape is (32, len(unique_words))
        target = batch[:, -1].type(torch.LongTensor).to(device) # shape is (32,)
        
        # forward
        #with torch.cuda.amp.autocast(): # for gradient underflowing and overflowing and it makes training faster by converting all floats to float16
        pred = model(inp)
        loss = loss_fn(pred, target) 
            
        optimizer.zero_grad()  # making all the previous gradients zero 
        #scaler.scale(loss).backward()
        loss.backward()
        #scaler.step(optimizer)
        #scaler.update()
        optimizer.step()
        
        
        model.eval()
        with torch.no_grad():
            acc += (torch.argmax(pred, dim=1)==target).sum().item()/batch_size
        batches.set_postfix(loss = loss.item()) # loss of this current batch on current iteration 
        model.train()

    acc/=num_batches
    print(f"Validation accuracy is {acc*100}")

### **DRIVER CODE**

In [17]:
# hyperparameters
lr = 1e-3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 32
num_workers = 2
num_epochs = 20 # just for testing if loop works fine else i am gonna set it to 100

# set up the input data
train_data = full_data(dataset['preprocessed']) 
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True,
                                           num_workers=num_workers)


#setting up the model, scaler, optimizer, loss_fn
model = CBOW(length=len(unique_words), intermediate_len=300).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scaler = torch.cuda.amp.GradScaler()

In [18]:
# now training 
for epoch in range(num_epochs):
    train(train_loader, model, optimizer, loss_fn, scaler)
    
    # save checkpoints 
    checkpoint = {
        'state_dict': model.state_dict(),
        'optimizer':optimizer.state_dict()
    }
    save_checkpoint(checkpoint)
    

100%|██████████| 350/350 [00:02<00:00, 132.19it/s, loss=7.94]


Validation accuracy is 2.0
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 121.53it/s, loss=7.24]


Validation accuracy is 3.9553571428571432
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 136.26it/s, loss=7.8] 


Validation accuracy is 4.803571428571429
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 129.61it/s, loss=7.21]


Validation accuracy is 4.866071428571429
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 117.23it/s, loss=7.78]


Validation accuracy is 4.883928571428572
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 119.40it/s, loss=6.82]


Validation accuracy is 5.348214285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 128.00it/s, loss=5.86]


Validation accuracy is 5.866071428571429
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 135.72it/s, loss=5.12]


Validation accuracy is 6.964285714285714
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 125.39it/s, loss=5.71]


Validation accuracy is 9.883928571428571
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 131.48it/s, loss=4.26]


Validation accuracy is 16.178571428571427
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 128.16it/s, loss=4.05]


Validation accuracy is 25.455357142857142
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 135.92it/s, loss=3.6] 


Validation accuracy is 35.732142857142854
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 131.61it/s, loss=2.94]


Validation accuracy is 49.6875
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 123.45it/s, loss=1.79]


Validation accuracy is 65.17857142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 136.42it/s, loss=1.45] 


Validation accuracy is 80.15178571428572
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 132.20it/s, loss=0.89] 


Validation accuracy is 88.16071428571428
-> Saving CheckPoint


100%|██████████| 350/350 [00:03<00:00, 105.27it/s, loss=0.526]


Validation accuracy is 92.91964285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 128.84it/s, loss=0.367]


Validation accuracy is 95.54464285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 135.81it/s, loss=0.2]  


Validation accuracy is 96.95535714285714
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 136.44it/s, loss=0.342]


Validation accuracy is 97.97321428571428
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 120.50it/s, loss=0.32] 


Validation accuracy is 98.625
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 128.62it/s, loss=0.138] 


Validation accuracy is 99.00892857142857
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 130.42it/s, loss=0.175] 


Validation accuracy is 99.21428571428571
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 134.26it/s, loss=0.127] 


Validation accuracy is 99.42857142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 118.83it/s, loss=0.0912]


Validation accuracy is 99.50892857142857
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 136.40it/s, loss=0.0456]


Validation accuracy is 99.61607142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 134.86it/s, loss=0.0298]


Validation accuracy is 99.61607142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:03<00:00, 110.30it/s, loss=0.0289]


Validation accuracy is 99.63392857142857
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 124.63it/s, loss=0.0362]


Validation accuracy is 99.6875
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 133.73it/s, loss=0.0261]


Validation accuracy is 99.72321428571429
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 136.31it/s, loss=0.0277]


Validation accuracy is 99.70535714285714
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 126.74it/s, loss=0.0186]


Validation accuracy is 99.75
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 123.73it/s, loss=0.0241]


Validation accuracy is 99.73214285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 135.14it/s, loss=0.0149] 


Validation accuracy is 99.73214285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 135.54it/s, loss=0.0171] 


Validation accuracy is 99.73214285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 129.81it/s, loss=0.00922]


Validation accuracy is 99.74107142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 119.97it/s, loss=0.0175] 


Validation accuracy is 99.75892857142857
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 133.96it/s, loss=0.00674]


Validation accuracy is 99.72321428571429
-> Saving CheckPoint


100%|██████████| 350/350 [00:03<00:00, 109.15it/s, loss=0.00707]


Validation accuracy is 99.74107142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 132.21it/s, loss=0.00574]


Validation accuracy is 99.75
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 128.39it/s, loss=0.0079] 


Validation accuracy is 99.74107142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 133.69it/s, loss=0.00312]


Validation accuracy is 99.73214285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 130.78it/s, loss=0.00739]


Validation accuracy is 99.75892857142857
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 134.93it/s, loss=0.00917]


Validation accuracy is 99.71428571428571
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 125.88it/s, loss=0.00234]


Validation accuracy is 99.73214285714286
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 127.94it/s, loss=0.000932]


Validation accuracy is 99.75
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 134.53it/s, loss=0.00506] 


Validation accuracy is 99.72321428571429
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 128.51it/s, loss=0.00156] 


Validation accuracy is 99.74107142857143
-> Saving CheckPoint


100%|██████████| 350/350 [00:02<00:00, 132.33it/s, loss=0.00123] 


Validation accuracy is 99.75
-> Saving CheckPoint


100%|██████████| 350/350 [00:03<00:00, 110.76it/s, loss=0.00757] 


Validation accuracy is 99.73214285714286
-> Saving CheckPoint


In [31]:
w1 = model.state_dict()['model.0.weight']
w2 = model.state_dict()['model.2.weight']
print(w1.shape, w2.shape)
weights = w1 + w2.T
print(weights.shape)

torch.Size([300, 4078]) torch.Size([4078, 300])
torch.Size([300, 4078])


## **Now we have to simply assign the embeddings to their words**

In [38]:
def embeddings(weights, ind2word = ind2word):
    weights = weights.detach().cpu().numpy()
    embed = {}
    for key in ind2word.keys():
        embed[ind2word[key]] = weights[:, key]
    return embed

######################
word_embeddings = embeddings(weights)

### **TESTING THE QUALITY OF WORD EMBEDDINGS THROUGH VISUALIZATION AND ANALOGIES**
- We can apply many things here, first of all we can simply check if the classes are predicted correct or not on a given example.
- Then we can apply PCA to the embddings and plot them to visulaize correlations between the words and get a understanding of how correct the embddings are.
- As embeddings are built for use for some task, the ultimate check of them is to apply to that particular task and then get feedback from there, like if i wished to design embeddings for question-answering then i will train model and then how good the model is doing in question-answering, this will ultimately tell us the quality of our embedding