In [1]:
import pandas as pd 
import numpy as np
import nltk.tokenize as nt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical  
from collections import Counter, OrderedDict
import re
from itertools import chain
import torch
import torch.nn as nn
import math
import random

Using TensorFlow backend.


In [2]:
# IMDB movies review dataset 
df = pd.read_csv("IMDBDataset.csv")
df = df[:100]

In [3]:
# Extract movies reviews from the dataset in an array
sentences = df["review"].fillna("DUMMY_VALUES").values
filtered_sentence = []
word_index = {}
lemma_function = WordNetLemmatizer()
idx = 1
print("Tokenize and creating word index....")
for sentence in sentences:
    # Removing useless charecters from the string 
    s = sentence.replace('<br />', '')
    filtered_sentence.append(s)
    
    # Tokenize every sentence in dataset
    tok_sentence = nt.word_tokenize(s.lower())
    
    # Lemmatizing the words in every sentence and creaing a word index dictionay
    # This Dictionary will maps numeric index values to words which can pe used later in the model 
    for word in tok_sentence:
        #word = lemma_function.lemmatize(word)
        if word not in word_index:
            word_index[word]=idx
            idx+=1
print("Done")
print("Found %d words in the dataset" %len(word_index))

Tokenize and creating word index....
Done
Found 5021 words in the dataset


In [5]:
def flatten(l):
    r_l = []
    for i in l:
        if len(i)>0:
            temp = i.replace("<br />", "")
            r_l.append(temp.lower())
    return r_l

def count(sentences):
    f_sentences = flatten(sentences)
    counts = Counter()
    for i in f_sentences:
        counts.update(re.findall('\w+',i))
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    c = {}
    for i in counts:
        c[i[0]]=i[1]
    return c,counts
count_dict, count_set = count(sentences) 

In [12]:
CONTEXT_SIZE = 2 
EMB_DIM = 50
TABLE_SIZE = 1e8
alpha = 0.75
K = 5

In [6]:

input_target_pair = []

for sentence in filtered_sentence:
    tok_filtered_sentence = nt.word_tokenize(sentence.lower())
    for i, word in enumerate(tok_filtered_sentence):
        #print(word)
        for cont_ran in range(-CONTEXT_SIZE,CONTEXT_SIZE+1):
            try:
                if cont_ran!=0 and i+cont_ran>=0:
                    i_word = index_lookup(word)
                    i_neri = index_lookup( tok_filtered_sentence[i+cont_ran])
                    temp_w = to_categorical(i_word,N)
                    temp_n = to_categorical(i_neri,N)
                    temp = (torch.from_numpy(np.asarray(temp_w)), torch.from_numpy(np.asarray(temp_n)))
                    #print(temp)
                    input_target_pair.append(temp)   
            except IndexError:
                continue  
    
    
input_target_pair[:5]

[(tensor([0., 1., 0.,  ..., 0., 0., 0.]),
  tensor([0., 0., 1.,  ..., 0., 0., 0.])),
 (tensor([0., 1., 0.,  ..., 0., 0., 0.]),
  tensor([0., 0., 0.,  ..., 0., 0., 0.])),
 (tensor([0., 0., 1.,  ..., 0., 0., 0.]),
  tensor([0., 1., 0.,  ..., 0., 0., 0.])),
 (tensor([0., 0., 1.,  ..., 0., 0., 0.]),
  tensor([0., 0., 0.,  ..., 0., 0., 0.])),
 (tensor([0., 0., 1.,  ..., 0., 0., 0.]),
  tensor([0., 0., 0.,  ..., 0., 0., 0.]))]

In [43]:
def prob_frequency(count, t = 1e-5):
    prob = {}
    N = sum(x[1] for x in count)
    print ("Total number of words: ",N)
    for k,v in count:
        p = v/N
        val = np.sqrt(t * N /v)
        pk = val * (1 + val)
        #pk = (np.sqrt(p/t) + 1) * (t/p)
        prob[k] = pk
    return prob

def subsampling(probs,filtered_sentence):
    train = []
    t= set()
    for sentence in filtered_sentence:
        tok_filtered_sentence = nt.word_tokenize(sentence.lower())
        temp = ""
        for word in tok_filtered_sentence:
            try:
                if np.random.random() < (probs[word]):
                    temp+=word + " "
                else:
                    t.add(word) 
            except KeyError:
                continue
        train.append(temp)
    print("Number of words removed: ",len(t))
    return train

def negative_sampling(train,alpha,TABLE_SIZE,K):
    l = []
    for i in train:
        tok = nt.word_tokenize(i)
        bigrams = ngrams(tok,1)
        for b in bigrams:
            l.append(b)
    c = Counter(l)
    TABLE_SIZE = int(TABLE_SIZE)
    s = sum(v for k,v in c.items())
    p_sum = sum((v/s)**alpha for k,v in c.items())
    neg_dict = {}
    for k,v in c.items():
        p = ((v/s)**alpha)/p_sum
        neg_dict[k[0]]=p
    table_count = []
    for k,v in neg_dict.items():
        count = np.round(v*TABLE_SIZE)
        table_count.append((k,count))
    idx = 0 
    inc= 0
    unigram_table = []
    N=len(table_count)
    for a in range(TABLE_SIZE):
        unigram_table.append(table_count[idx][0])
        if(inc == int(table_count[idx][1]) and idx < N):
            inc = 0
            idx+=1
        if(idx==N):
            break
        inc+=1
       # pass
    rand = random.choices(range(TABLE_SIZE), k=K)
    neg_list =[]
    for i in rand:
        print(unigram_table[i])
        neg_list.append(unigram_table[i])
    return neg_list


probs = prob_frequency(count_set)
su = subsampling(probs, filtered_sentence)
neg_s = negative_sampling(su,alpha, TABLE_SIZE,K)
neg_s  
    

Total number of words:  23490
Number of words removed:  2457
serial
previous
sex
seeking
floating


['serial', 'previous', 'sex', 'seeking', 'floating']

In [45]:
def index_lookup(word, w_i = word_index):
    return w_i[word]

N = len(word_index)
def train_preprocess(sentence,N,neg_list,CONTEXT_SIZE = CONTEXT_SIZE):
    input_target_pair = []
    neg_v = []
    for  s in sentence:
        tok_s = nt.word_tokenize(s)
        for i, word in enumerate(tok_s):
            for cont_ran in range(-CONTEXT_SIZE,CONTEXT_SIZE+1):
                try:
                    if cont_ran!=0 and i+cont_ran>=0:
                        i_word = index_lookup(word)
                        i_neri = index_lookup(tok_s[i+cont_ran])
                        temp_w = to_categorical(i_word,N)
                        temp_n = to_categorical(i_neri,N)
                        temp = (torch.from_numpy(np.asarray(temp_w)), torch.from_numpy(np.asarray(temp_n)))
                        #print(temp)
                        input_target_pair.append(temp)   
                except (IndexError or TypeError):
                    continue
    for n in neg_list:
        print(n)
        i_word = index_lookup(n)
        temp_w = to_categorical(i_word,N)
        t = torch.from_numpy(np.asarray(temp_w))
        neg_v.append(t)
    return input_target_pair, neg_v
t,neg = train_preprocess(su,N,neg_s)
neg

serial
previous
sex
seeking
floating


[tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.])]

In [47]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.u_embeddings = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.n_negs = n_negs
        self.init_emb()
        
    def init_emb(self):
        """Initialize embedding weight like word2vec.
            The u_embedding is a uniform distribution in [-0.5/em_size, 0.5/emb_size], and the elements of v_embedding are zeroes.
            Returns:None
        """
        initrange = 0.5 / self.emb_size
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        #print(self.u_embeddings.weight.data.uniform_(-initrange, initrange))

In [49]:
trainloader = torch.utils.data.DataLoader(t, batch_size=10, shuffle=True)
SkipGram(N,EMB_DIM)

SkipGram(
  (u_embeddings): Embedding(5021, 50, sparse=True)
  (v_embeddings): Embedding(5021, 50, sparse=True)
)