In [123]:
import pandas as pd 
import numpy as np
import nltk.tokenize as nt
from nltk.stem.wordnet import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical  
from collections import Counter, OrderedDict
import re
import torch
import torch.nn as nn
import math
import random

In [133]:
# IMDB movies review dataset 
df = pd.read_csv("IMDBDataset.csv")
df = df[:1000]

In [134]:
# Extract movies reviews from the dataset in an array
sentences = df["review"].fillna("DUMMY_VALUES").values
filtered_sentence = []
word_index = {}
lemma_function = WordNetLemmatizer()
idx = 1
print("Tokenize and creating word index....")
for sentence in sentences:
    # Removing useless charecters from the string 
    s = sentence.replace('<br />', '')
    filtered_sentence.append(s)
    
    # Tokenize every sentence in dataset
    tok_sentence = nt.word_tokenize(s.lower())
    
    # Lemmatizing the words in every sentence and creaing a word index dictionay
    # This Dictionary will maps numeric index values to words which can pe used later in the model 
    for word in tok_sentence:
        #word = lemma_function.lemmatize(word)
        if word not in word_index:
            word_index[word]=idx
            idx+=1
print("Done")
print("Found %d words in the dataset" %len(word_index))

Tokenize and creating word index....
Done
Found 20896 words in the dataset


In [135]:
def flatten(l):
    r_l = []
    for i in l:
        if len(i)>0:
            temp = i.replace("<br />", "")
            r_l.append(temp.lower())
    return r_l

def count(sentences):
    f_sentences = flatten(sentences)
    counts = Counter()
    for i in f_sentences:
        counts.update(re.findall('\w+',i))
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    return counts
c = count(sentences)
    

In [136]:
c

[('the', 13430),
 ('and', 6474),
 ('a', 6382),
 ('of', 5910),
 ('to', 5316),
 ('is', 4069),
 ('in', 3727),
 ('it', 3667),
 ('i', 3592),
 ('this', 3102),
 ('that', 2731),
 ('s', 2394),
 ('was', 1928),
 ('as', 1797),
 ('movie', 1787),
 ('with', 1773),
 ('for', 1719),
 ('but', 1638),
 ('film', 1559),
 ('you', 1535),
 ('on', 1352),
 ('t', 1296),
 ('not', 1232),
 ('his', 1187),
 ('are', 1176),
 ('he', 1120),
 ('have', 1084),
 ('be', 1011),
 ('one', 1007),
 ('all', 979),
 ('at', 966),
 ('by', 858),
 ('they', 857),
 ('who', 850),
 ('an', 846),
 ('from', 824),
 ('so', 819),
 ('like', 804),
 ('there', 770),
 ('just', 758),
 ('about', 720),
 ('or', 697),
 ('out', 673),
 ('if', 669),
 ('what', 639),
 ('has', 620),
 ('her', 605),
 ('can', 580),
 ('some', 580),
 ('very', 550),
 ('more', 545),
 ('no', 543),
 ('which', 540),
 ('when', 540),
 ('good', 540),
 ('even', 527),
 ('see', 525),
 ('up', 514),
 ('their', 510),
 ('would', 497),
 ('my', 492),
 ('she', 479),
 ('me', 475),
 ('story', 462),
 ('time

# Skip-Gram Dataset Creation

In [81]:
CONTEXT_SIZE = 2 
EMB_DIM = 50



In [137]:
def index_lookup(word, w_i = word_index):
    return w_i[word]

N = len(word_index)
input_target_pair = []

for sentence in filtered_sentence:
    tok_filtered_sentence = nt.word_tokenize(sentence.lower())
    for i, word in enumerate(tok_filtered_sentence):
        #print(word)
        for cont_ran in range(-CONTEXT_SIZE,CONTEXT_SIZE+1):
            try:
                if cont_ran!=0 and i+cont_ran>=0:
                    i_word = index_lookup(word)
                    i_neri = index_lookup( tok_filtered_sentence[i+cont_ran])
                    temp_w = to_categorical(i_word,N)
                    temp_n = to_categorical(i_neri,N)
                    temp = (torch.from_numpy(np.asarray(temp_w)), torch.from_numpy(np.asarray(temp_n)))
                    #print(temp)
                    input_target_pair.append(temp)   
            except IndexError:
                continue  
    
    
input_target_pair[:5],tok_filtered_sentence

MemoryError: Unable to allocate 81.6 KiB for an array with shape (1, 20896) and data type float32

In [144]:
def prob_frequency(count, t = 1e-5):
    prob = {}
    N = sum(x[1] for x in count)
    print (N)
    for k,v in count:
        p = (v/N)**0.75
        pk = 1-np.sqrt(t/p)
        prob[k] = pk
    return prob

def subsampling(probs,filtered_sentence):
    train = []
    for sentence in filtered_sentence:
        tok_filtered_sentence = nt.word_tokenize(sentence.lower())
        temp = []
        for word in tok_filtered_sentence:
            try:
                if random.random() < (1-probs[word]):
                    temp.append(word)
            except KeyError:
                continue
        train.append(temp)
    return train

def train_preprocess(sentence, CONTEXT_SIZE = CONTEXT_SIZE):
    input_target_pair = []
    for i, word in enumerate(sentence):
        #print(word)
        for cont_ran in range(-CONTEXT_SIZE,CONTEXT_SIZE+1):
            try:
                if cont_ran!=0 and i+cont_ran>=0:
                    print(word)
                    i_word = index_lookup(word)
                    i_neri = index_lookup(sentence[i+cont_ran])
                    temp_w = to_categorical(i_word,N)
                    temp_n = to_categorical(i_neri,N)
                    temp = (torch.from_numpy(np.asarray(temp_w)), torch.from_numpy(np.asarray(temp_n)))
                    #print(temp)
                    input_target_pair.append(temp)   
            except (IndexError or TypeError):
                continue
    return input_target_pair
probs = prob_frequency(c)
s = subsampling(probs, filtered_sentence)
s
#train_preprocess(s)
      
    

235382


[['exactly',
  'struck',
  'oz',
  'brutality',
  'punches',
  'to',
  'hardcore',
  'use',
  'oz',
  'given',
  'maximum',
  'security',
  'state',
  'it',
  'focuses',
  'section',
  'face',
  'so',
  'privacy',
  'em',
  'italians',
  'scuffles',
  'shady',
  'painted',
  'surreal',
  'taste',
  'guards',
  'nickel',
  'on',
  'order',
  'mannered',
  'inmates',
  'is',
  'viewing',
  'thats'],
 ['entire',
  'polari',
  'by',
  'but',
  'comedy',
  'fantasy',
  'knowledge',
  'senses',
  'sets',
  'flat',
  'halliwell',
  'murals'],
 ['witty',
  'likable',
  'match',
  'control',
  'years',
  'dare',
  'decade',
  'young',
  'jewel',
  'devil',
  'wears'],
 ['boy',
  'thinks',
  'jake',
  'arguing',
  'totally',
  'similar',
  'playing',
  'dialogs'],
 ['time',
  'vivid',
  'portrait',
  'and',
  'different',
  'encounter',
  'being',
  'all',
  'and',
  'or',
  'own',
  'the',
  'sincere',
  'rosario',
  'dawson',
  'imperioli',
  'mattei',
  'anxiously'],
 ['selflessness',
  'nobl

# Skip Gram Model

In [106]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.u_embeddings = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.init_emb()
        
    def init_emb(self):
        """Initialize embedding weight like word2vec.
            The u_embedding is a uniform distribution in [-0.5/em_size, 0.5/emb_size], and the elements of v_embedding are zeroes.
            Returns:None
        """
        initrange = 0.5 / self.emb_size
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)
        #print(self.u_embeddings.weight.data.uniform_(-initrange, initrange))

In [107]:
trainloader = torch.utils.data.DataLoader(input_target_pair, batch_size=10, shuffle=True)
SkipGram(N,EMB_DIM)

SkipGram(
  (u_embeddings): Embedding(5007, 50, sparse=True)
  (v_embeddings): Embedding(5007, 50, sparse=True)
)