In [1]:
import pandas as pd 
import numpy as np
import nltk.tokenize as nt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical  
from collections import Counter, OrderedDict
import re
from itertools import chain
import torch
import torch.nn as nn
import math
import random
from torch.optim import SGD,Adam
from torch.autograd import Variable
from collections import defaultdict
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize

Using TensorFlow backend.


In [2]:
'''
Preprocessing mobydick text to generate data and pass it to a skipgram model
'''
# this pasrt is only requied once to convert the downloaded text to a format that can be used for the model
strings = ""
with open("mobydick.txt", encoding='utf-8') as file:
    for line in file:
        for l in re.split(r"(\. |\? |\! )",line):
            strings += l
        strings += '\n'
s = strings.replace('\n','')
c = sent_tokenize(s)

ff = open("mobydicktrim.txt", "w", encoding='utf-8')
for i in c:
    #print(i)
    if (len(i.split())>1):
        s = re.sub(r'[^\w\s]',' ',i)
        s = s.replace("_","")
        ff.write(s.lower())
        ff.write('\n')
ff.close()    

# Hyperparameters

In [3]:
# Hyperparameters 
thresh = float=1e-5
TABLE_SIZE = 1e8
alpha = 0.75
CONTEXT_SIZE = 2 
EMB_DIM = 50
lr = 0.003
batch_size = 512

# Data Preprocessing

In [5]:
idx = 0 
word_index = {}
index_word = {}
vocab = defaultdict(int)
vocab_freq = {}
total_words = 0
min_count = 5
sentence = []

f = open('mobydicktrim.txt', encoding='utf-8')
for i in f.readlines():
    tok_sentence =  i.strip().split()
    sentence.append(tok_sentence)
    for word in tok_sentence:
        vocab[word] += 1
        total_words += 1
# get word to index, index to word and frequncy of the words
for k,v in vocab.items():
    vocab_freq[k] = v
    word_index[k]=idx
    index_word[idx] = k
    idx+=1
print("Done")
print("Found %d words in the dataset" %len(word_index))
print("Found %d index in the dataset" %len(index_word))

N = len(vocab_freq)
total_words

Done
Found 26859 words in the dataset
Found 26859 index in the dataset


200821

# Sub-Sampling the data

In [6]:
'''
Removing frequent occuring words 
'''
def sub_sampling(freq, thresh = thresh):
    return 1 - ((thresh/freq)**0.5 + (thresh/freq))

freq = {k : v/total_words for k,v in vocab_freq.items()}
freq = {k : sub_sampling(v) for k,v in freq.items()}
train = []
c=0
for s in sentence:
    temp = []
    for i in s:
        if (random.random() < v):
            temp.append(i)
            c+=1
    train.append(temp)

# Negative Subsamples

In [7]:
def negs(freq,alpha = alpha, TABLE_SIZE=TABLE_SIZE):
    '''
    Calculates the probaility of the negative subsampled words and generates a unigram table based on these probs 
    '''
    unigram_table = []
    pow_freq = np.array(list(freq.values())) ** alpha
    tot_pow = sum(pow_freq)
    r = pow_freq / tot_pow
    count = np.round(r * TABLE_SIZE)
    print(count)
    for item_id, _count in enumerate(count):
        unigram_table += [item_id] * int(_count)
    unigram_table = np.array(unigram_table)
    return unigram_table

def get_neg_samples(unigram_table, k=5, TABLE_SIZE=TABLE_SIZE):
    '''
    fetches random negative samples from unigram table
    '''
    rand = random.choices(range(TABLE_SIZE), k=k)
    neg_list =[]
    for i in rand:
        neg_list.append(unigram_table[i])
    return neg_list
n = negs(vocab_freq)

[62208.  3039.  1333. ...  1333.  1333.  1333.]


# Create training data

In [8]:
def index_lookup(word, w_i = word_index):
    '''
    returns index based on word 
    '''
    return w_i[word]

def word_lookup(idx, i_w = index_word):
    '''
     returns the word based on index
    '''
    return i_w[idx]

def train_preprocess(sentence,unigram_table,K,TABLE_SIZE=int(TABLE_SIZE),CONTEXT_SIZE = CONTEXT_SIZE):
    '''
        generates word and target pairs with 5 negative subsampled values which is fed to the model 
    '''
    input_target_pair = []
    neg_v = []
    for  s in sentence:
        for i, word in enumerate(s):
            for cont_ran in range(-CONTEXT_SIZE,CONTEXT_SIZE+1):
                try:
                    if cont_ran!=0 and i+cont_ran>=0:
                        i_word = index_lookup(word)
                        i_target = index_lookup(s[i+cont_ran])
                        negs = get_neg_samples(unigram_table, K, TABLE_SIZE)
                        temp = (i_word, i_target, np.array(negs, dtype=np.int64))
                        input_target_pair.append(temp)   
                except (IndexError or TypeError):
                    continue
    return input_target_pair
tr = train_preprocess(train, n, 5)
tr[:10]

[(0, 1, array([9518, 2280,   39,  684,  905], dtype=int64)),
 (1, 0, array([1905, 4666, 2122,   39,   44], dtype=int64)),
 (2, 3, array([ 2001, 24685,   159, 13723, 26317], dtype=int64)),
 (2, 4, array([ 8654, 16336,    98,  3591, 10321], dtype=int64)),
 (3, 2, array([ 8033,  1639, 19284,  3673,  2001], dtype=int64)),
 (3, 4, array([2191,  455, 1273, 7853,  243], dtype=int64)),
 (3, 5, array([ 3158,   159,    39,    34, 10938], dtype=int64)),
 (4, 2, array([   19,    41,   359, 14882, 18089], dtype=int64)),
 (4, 3, array([11204, 10744, 20171,   138,  2167], dtype=int64)),
 (4, 5, array([26818, 21116,  3689, 16172,  8274], dtype=int64))]

In [9]:
# loads the preprocessed training data into batches 
trainloader = torch.utils.data.DataLoader(tr, batch_size=batch_size)

# Skip-Gram model

In [10]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.in_embeddings = nn.Embedding(vocab_size, emb_size,sparse=True)
        self.out_embeddings = nn.Embedding(vocab_size, emb_size,sparse=True)
        # initalize wmbedding weights
        initrange = 0.5 / self.emb_size
        self.in_embeddings.weight.data.uniform_(-initrange, initrange)
        self.out_embeddings.weight.data.uniform_(0,0)
        
    def forward(self,word,target,negs):
        u = self.in_embeddings(word)
        v = self.out_embeddings(target)
        pos_vals = torch.mul(u,v).squeeze()
        pos_vals = torch.sum(pos_vals, dim=1)
        pos_vals = F.logsigmoid(pos_vals)
        v_hat = self.out_embeddings(negs)               
        neg_vals =  torch.bmm(v_hat, u.unsqueeze(2)).squeeze()
        neg_vals = F.logsigmoid(-neg_vals)
        l = -(torch.sum(pos_vals) + torch.sum(neg_vals))
        return l

In [13]:
def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
    """ Returns the cosine similarity of validation words with words in the embedding matrix.
        Here, embedding should be a PyTorch embedding module.
    """
    
    # Here we're calculating the cosine similarity between some random words and 
    # our embedding vectors. With the similarities, we can look at what words are
    # close to our random words.
    
    embed_vectors = embedding.weight
    
    # magnitude of embedding vectors, |b|
    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)
    
    # pick N words from our ranges (0,window) and (1000,1000+window). lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples,
                               random.sample(range(1000,1000+valid_window), valid_size//2))
    valid_examples = torch.LongTensor(valid_examples).to(device)
    
    valid_vectors = embedding(valid_examples)
    similarities = torch.mm(valid_vectors, embed_vectors.t())/magnitudes
        
    return valid_examples, similarities

In [14]:
net = SkipGram(N,EMB_DIM)
optimizer = SGD(net.parameters(), lr=lr)
net = net.cuda()

# Train the model

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
for epoch in range(10):  # loop over the dataset multiple times
    avg_loss = 0
    j = 0
    for i, data in enumerate(trainloader):
        inputs, labels, negs = data
        inputs = Variable(torch.LongTensor(inputs))
        labels = Variable(torch.LongTensor(labels))
        negs = Variable(torch.LongTensor(negs))
        inputs = inputs.cuda()
        labels = labels.cuda()
        negs = negs.cuda()
        loss = net(inputs,labels,negs)
        # zero the parameter gradients 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
        # print statistics
        if i % 1400 == 1399:    # print every 1400 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, avg_loss/(1399*batch_size)))
            valid_examples, valid_similarities = cosine_similarity(net.in_embeddings, device=device)
            _, closest_idxs = valid_similarities.topk(6)
            avg_loss = 0.0
            valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
            for ii, valid_idx in enumerate(valid_examples):
                closest_words = [word_lookup(idx.item()) for idx in closest_idxs[ii]][1:]
                print(word_lookup(valid_idx.item()) + " | " + ', '.join(closest_words))
            print("...\n")
print('Finished Training')

[1,  1400] loss: 4.127
warehouses | pressure, heigh, whereonthey, uncomfortableinfliction, insure
upper | iron, way, ve, even, world
especially | large, play, might, oil, sort
watery | next, involving, pictorial, used, heat
that | he, as, so, but, s
would | have, it, are, i, not
before | as, s, but, like, and
strong | drops, main, iclaim, november, memorable
weary | crew, stood, aloft, sound, standing
brought | nor, off, leviathan, harpoon, last
windows | higherthan, mingled, harbourless, intervals, rapacious
ofa | stress, monstrouscabinet, swordbetween, transferring, fair
pieces | beat, skeleton, waters, view, heart
wisdom | depth, breathest, gardenny, thedantean, aresome
boots | pipe, frombuffalo, thickens, mindfor, avalor
inches | intervening, weare, witharsacidean, hearses, leaping
...

[2,  1400] loss: 3.791
years | then, can, or, stubb, who
it | some, this, their, him, an
knocking | readily, dumb, vital, drawn, thedeck
andmethodically | wideintervals, ofqueens, symphony, arepecki