In [47]:
import pandas as pd 
import numpy as np
import nltk.tokenize as nt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical  
from collections import Counter, OrderedDict
import re
from itertools import chain
import torch
import torch.nn as nn
import math
import random
from torch.optim import SGD,Adam
from torch.autograd import Variable

In [2]:
# IMDB movies review dataset 
df = pd.read_csv("IMDBDataset.csv")
df = df[:300]

In [3]:
# Extract movies reviews from the dataset in an array
sentences = df["review"].fillna("DUMMY_VALUES").values
filtered_sentence = []
word_index = {}
index_word = {}
lemma_function = WordNetLemmatizer()
idx = 1
print("Tokenize and creating word index....")
for sentence in sentences:
    # Removing useless charecters from the string 
    s = sentence.replace('<br />', '')
    filtered_sentence.append(s)
    
    # Tokenize every sentence in dataset
    tok_sentence = nt.word_tokenize(s.lower())
    
    # Lemmatizing the words in every sentence and creaing a word index dictionay
    # This Dictionary will maps numeric index values to words which can pe used later in the model 
    for word in tok_sentence:
        #word = lemma_function.lemmatize(word)
        if word not in word_index:
            word_index[word]=idx
            index_word[idx] = word
            idx+=1
print("Done")
print("Found %d words in the dataset" %len(word_index))
print("Found %d index in the dataset" %len(index_word))

Tokenize and creating word index....
Done
Found 10257 words in the dataset
Found 10257 index in the dataset


In [4]:
def flatten(l):
    r_l = []
    for i in l:
        if len(i)>0:
            temp = i.replace("<br />", "")
            r_l.append(temp.lower())
    return r_l

def count(sentences):
    f_sentences = flatten(sentences)
    counts = Counter()
    for i in f_sentences:
        counts.update(re.findall('\w+',i))
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    c = {}
    for i in counts:
        c[i[0]]=i[1]
    return c,counts
count_dict, count_set = count(sentences) 

In [5]:
CONTEXT_SIZE = 2 
EMB_DIM = 50
TABLE_SIZE = 1e8
alpha = 0.75
K = 5

In [6]:
def prob_frequency(count, t = 1e-5):
    prob = {}
    N = sum(x[1] for x in count)
    print ("Total number of words: ",N)
    for k,v in count:
        p = v/N
#         val = np.sqrt(t * N /v)
#         pk = val * (1 + val)
        pk = 1- np.sqrt(t/p)
        #pk = (np.sqrt(p/t) + 1) * (t/p)
        prob[k] = pk
    return prob

def subsampling(probs,filtered_sentence):
    train = []
    t= set()
    for sentence in filtered_sentence:
        tok_filtered_sentence = nt.word_tokenize(sentence.lower())
        temp = ""
        for word in tok_filtered_sentence:
            try:
                if np.random.random() < (1-probs[word]):
                    temp+=word + " "
                else:
                    t.add(word) 
            except KeyError:
                continue
        train.append(temp)
    print("Number of words removed: ",len(t))
    return train

def negative_sampling(train,alpha,TABLE_SIZE,K):
    l = []
    for i in train:
        tok = nt.word_tokenize(i)
        unigrams = ngrams(tok,1)
        for b in unigrams:
            l.append(b)
    c = Counter(l)
    TABLE_SIZE = int(TABLE_SIZE)
    s = sum(v for k,v in c.items())
    p_sum = sum((v/s)**alpha for k,v in c.items())
    neg_dict = {}
    for k,v in c.items():
        p = ((v/s)**alpha)/p_sum
        neg_dict[k[0]]=p
    table_count = []
    for k,v in neg_dict.items():
        count = np.round(v*TABLE_SIZE)
        table_count.append((k,count))
    idx = 0 
    inc= 0
    unigram_table = []
    N=len(table_count)
    for a in range(TABLE_SIZE):
        unigram_table.append(table_count[idx][0])
        if(inc == int(table_count[idx][1]) and idx < N):
            inc = 0
            idx+=1
        if(idx==N):
            break
        inc+=1
    return unigram_table
       # pass


probs = prob_frequency(count_set)
su = subsampling(probs, filtered_sentence)
neg_s = negative_sampling(su,alpha, TABLE_SIZE,K) 
    

Total number of words:  70717
Number of words removed:  4410


In [7]:
def index_lookup(word, w_i = word_index):
    return w_i[word]

def word_lookup(idx, i_w = index_word):
    return i_w[idx]

def get_neg_samples(unigram_table, k, TABLE_SIZE):
    rand = random.choices(range(TABLE_SIZE), k=K)
    neg_list =[]
    for i in rand:
        neg_list.append(index_lookup(unigram_table[i]))
    return neg_list

N = len(word_index)
def train_preprocess(sentence,N,unigram_table,K,TABLE_SIZE=int(TABLE_SIZE),CONTEXT_SIZE = CONTEXT_SIZE):
    input_target_pair = []
    neg_v = []
    for  s in sentence:
        tok_s = nt.word_tokenize(s)
        for i, word in enumerate(tok_s):
            for cont_ran in range(-CONTEXT_SIZE,CONTEXT_SIZE+1):
                try:
                    if cont_ran!=0 and i+cont_ran>=0:
                        i_word = index_lookup(word)
                        i_target = index_lookup(tok_s[i+cont_ran])
                        negs = get_neg_samples(unigram_table, K, TABLE_SIZE)
#                         temp_w = to_categorical(i_word,N)
#                         temp_n = to_categorical(i_neri,N)
                        temp = (torch.from_numpy(np.asarray(i_word)), torch.from_numpy(np.asarray(i_target)), torch.from_numpy(np.asarray(negs)))
#                         #print(temp)
                        input_target_pair.append(temp)   
                except (IndexError or TypeError):
                    continue
    return input_target_pair

tr = train_preprocess(su,N,neg_s,K)
tr


[(tensor(5, dtype=torch.int32),
  tensor(29, dtype=torch.int32),
  tensor([1405, 4927, 1695, 5455,  227], dtype=torch.int32)),
 (tensor(5, dtype=torch.int32),
  tensor(13, dtype=torch.int32),
  tensor([7483, 3304,   87, 6187, 8380], dtype=torch.int32)),
 (tensor(29, dtype=torch.int32),
  tensor(5, dtype=torch.int32),
  tensor([3303,   49,  126, 8651,  808], dtype=torch.int32)),
 (tensor(29, dtype=torch.int32),
  tensor(13, dtype=torch.int32),
  tensor([ 197,  702, 9656,  790, 3564], dtype=torch.int32)),
 (tensor(29, dtype=torch.int32),
  tensor(39, dtype=torch.int32),
  tensor([1568, 5378, 5318, 7541, 3626], dtype=torch.int32)),
 (tensor(13, dtype=torch.int32),
  tensor(5, dtype=torch.int32),
  tensor([8229, 8738,  893, 4907, 2256], dtype=torch.int32)),
 (tensor(13, dtype=torch.int32),
  tensor(29, dtype=torch.int32),
  tensor([8735, 7385, 2246,  412, 8954], dtype=torch.int32)),
 (tensor(13, dtype=torch.int32),
  tensor(39, dtype=torch.int32),
  tensor([ 7270, 10147,  7543,    89,  832

In [78]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.in_embeddings = nn.Embedding(vocab_size, emb_size)
        self.out_embeddings = nn.Embedding(vocab_size, emb_size)
        self.log_sigmoid = nn.LogSigmoid()

        
    def forward(self,word,target,negs):
        u = self.in_embeddings(word)
        v = self.out_embeddings(target)
        batch_size, embed_size = u.shape
        input_vectors = u.view(batch_size, embed_size, 1)
        output_vectors = v.view(batch_size, 1, embed_size)
        print(batch_size,embed_size)
#         print("word:\n",word)
        print("u:\n",u)
        print("input_vectors:\n",input_vectors)
        print("v:\n",v)
        print("output_vectors:\n",output_vectors)
        pos_vals = torch.bmm(output_vectors , input_vectors).sigmoid().log()
        pos_vals = pos_vals.squeeze()
        v_hat = self.out_embeddings(negs)
        neg_vector = v_hat.view(batch_size,5,embed_size)
        
        print("v_hat:\n",v_hat.shape)
        print("neg_vector:\n",neg_vector)
        neg_vals =  torch.bmm(neg_vector.neg(), input_vectors).sigmoid().log()
        neg_vals = neg_vals.squeeze().sum(1)
        print ("Pos vals:\n ",pos_vals)
#         print ("neg vals:\n ",neg_vals)
        l = -(pos_vals + neg_vals).mean()
       # print("l:\n",l)
#         return l

In [79]:
CUDA_LAUNCH_BLOCKING=1
trainloader = torch.utils.data.DataLoader(tr, batch_size=10)
net = SkipGram(N,EMB_DIM)
optimizer = Adam(net.parameters(), lr=0.003)
net=net.cuda()
# for i, data in enumerate(trainloader):
#     inputs, labels, negs = data
#     print(inputs)

In [80]:
for epoch in range(2):  # loop over the dataset multiple times
    avg_loss = 0
    for i, data in enumerate(trainloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels, negs = data
        inputs = Variable(inputs.long())
        labels = Variable(labels.long())
        negs = Variable(negs.long())
        inputs = inputs.cuda()
        labels = labels.cuda()
        negs = negs.cuda()
        #print(inputs,labels,negs)
        loss = net(inputs,labels,negs)
        # zero the parameter gradients 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
        # print statistics
        if i % 50 == 49:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, avg_loss))
            avg_loss = 0.0

print('Finished Training')

10 50
u:
 tensor([[ 0.1870, -0.6711, -0.9459, -0.7857, -0.1201, -0.3533, -1.6952,  2.3592,
          0.0613, -0.8563, -0.3167,  0.0788, -0.1293,  0.0631,  1.4945,  0.1612,
         -1.1706, -0.8008,  0.8246,  1.2182,  1.6526, -0.3058, -0.8200, -1.2762,
         -1.4063,  1.0117, -0.6833,  1.0025, -0.2844, -0.0775, -0.6223, -0.3936,
         -0.0482,  1.2170,  2.6426,  1.1085, -1.8926, -0.3598,  0.1249, -0.6727,
          1.5577,  0.0044, -1.0148, -1.1417,  0.2606,  0.6348,  0.0302,  0.0808,
         -0.3932, -1.1034],
        [ 0.1870, -0.6711, -0.9459, -0.7857, -0.1201, -0.3533, -1.6952,  2.3592,
          0.0613, -0.8563, -0.3167,  0.0788, -0.1293,  0.0631,  1.4945,  0.1612,
         -1.1706, -0.8008,  0.8246,  1.2182,  1.6526, -0.3058, -0.8200, -1.2762,
         -1.4063,  1.0117, -0.6833,  1.0025, -0.2844, -0.0775, -0.6223, -0.3936,
         -0.0482,  1.2170,  2.6426,  1.1085, -1.8926, -0.3598,  0.1249, -0.6727,
          1.5577,  0.0044, -1.0148, -1.1417,  0.2606,  0.6348,  0.0302,

Pos vals:
  tensor([-8.1671e-01, -1.9929e+01, -7.2876e+00, -2.3161e+01,  0.0000e+00,
        -4.6955e-01, -1.6120e-01, -3.3879e-01, -1.6689e-06, -7.5102e-06],
       device='cuda:0', grad_fn=<SqueezeBackward0>)


AttributeError: 'NoneType' object has no attribute 'backward'