In [None]:
import progressbar
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
%matplotlib inline



In [None]:
# Maximum number of neighbouring tokens taken on each side of a centre word
# (passed as `context_size` to gather_training_data).
CONTEXT_SIZE = 10
# Width of each word vector (second argument to SkipGram).
EMBEDDING_DIM = 200
# Number of full passes over training_data made by the training loop.
EPOCHS = 5

# Reading a file

In [None]:
# Path of the text corpus (presumably a placeholder to be replaced with a real file).
filename = "your_file.txt"
# Read raw bytes and decode explicitly as UTF-8, then trim surrounding whitespace.
with open(filename, "rb") as file:
        raw_text = file.read().decode("utf-8").strip()
# NOTE(review): raw_text is not referenced by any later cell visible here - confirm it is still needed.
#raw_text

Function for reading a file and converting it into a list of tokens

In [None]:
def read_file(file_name):
    """Read a UTF-8 text file and split its contents into word tokens.

    Args:
        file_name: Path of the file to read.

    Returns:
        The list of tokens produced by ``word_tokenize`` on the decoded,
        whitespace-stripped file contents.
    """
    # Open the path given by the caller rather than the notebook-level
    # `filename`, so the function works for any file.
    with open(file_name, "rb") as fh:
        text = fh.read().decode("utf-8").strip()
    return word_tokenize(text)
# Tokenise the corpus once; the token list is used below to build the
# vocabulary and the training pairs.
processed_text = read_file(filename)
#processed_text

## Preprocessing

In [None]:
def mini_processing(processed_text):
    """Build the vocabulary counts and the word <-> index lookup tables.

    Args:
        processed_text: Iterable of tokens (hashable values).

    Returns:
        A tuple ``(vocab, ix_to_word, word_to_ix)`` where
        ``vocab`` maps each distinct token to its occurrence count (a float),
        ``ix_to_word`` maps an index to its token, and
        ``word_to_ix`` maps a token to its index. Indices are assigned in
        order of first appearance, starting at 0.
    """
    vocab = {}
    ix_to_word = {}
    word_to_ix = {}
    for word in processed_text:
        if word not in word_to_ix:
            # First sighting: the next free index is the current table size.
            index = len(word_to_ix)
            ix_to_word[index] = word
            word_to_ix[word] = index
        # Counts are kept as floats, matching what downstream code receives.
        vocab[word] = vocab.get(word, 0) + 1.0
    return vocab, ix_to_word, word_to_ix
# Build the count table and both lookup tables from the token list.
vocab,ix_to_word,word_to_ix = mini_processing(processed_text)
#print("Vocabulary:", vocab)
#print("ix_to_word",ix_to_word)
#print("word_to_ix",word_to_ix)

# Generating training data

In [None]:
# Show the full word -> count table as the cell output.
vocab

In [None]:

def gather_training_data(processed_text, word_to_ix, context_size):
    """Pair every token with each neighbour inside its context window.

    For the token at each position, neighbours are visited first to the left
    (nearest first), then to the right (nearest first), up to ``context_size``
    tokens per side and never past the ends of the text.

    Args:
        processed_text: Sequence of tokens.
        word_to_ix: Mapping from token to integer index.
        context_size: Maximum number of neighbours taken on each side.

    Returns:
        A list of ``([centre_index], neighbour_index)`` tuples.
    """
    n_tokens = len(processed_text)
    pairs = []
    for centre_pos, centre_word in enumerate(processed_text):
        # Left side: centre_pos-1 down to the window edge (or position 0).
        left_stop = max(centre_pos - context_size, 0) - 1
        # Right side: centre_pos+1 up to the window edge (or the last token).
        right_stop = min(centre_pos + context_size, n_tokens - 1) + 1
        neighbour_positions = (
            list(range(centre_pos - 1, left_stop, -1))
            + list(range(centre_pos + 1, right_stop))
        )
        pairs.extend(
            ([word_to_ix[centre_word]], word_to_ix[processed_text[pos]])
            for pos in neighbour_positions
        )
    return pairs
# Build the ([centre index], neighbour index) pairs that the training loop iterates over.
training_data = gather_training_data(processed_text, word_to_ix, CONTEXT_SIZE)
#training_data

# Model

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SkipGram(nn.Module):
    """Embedding lookup followed by a linear layer and log-softmax over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size: Number of distinct words (rows of the embedding table
                and width of the output layer).
            embedding_dim: Length of each word vector.
        """
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for one input word.

        Args:
            inputs: LongTensor of word indices. The looked-up vectors are
                flattened into a single row, so the linear layer only accepts
                the result when exactly one index is given.

        Returns:
            Tensor of shape ``(1, vocab_size)`` holding log-probabilities.
        """
        embeds = self.embeddings(inputs).view((1, -1))
        out = self.linear(embeds)
        # `out` is (1, vocab_size); normalise across the vocabulary axis
        # explicitly instead of relying on the deprecated implicit `dim`.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [None]:
# One embedding row and one output unit per distinct word in the corpus.
model = SkipGram(len(vocab),EMBEDDING_DIM)


# Loss and optimization


In [None]:
# Echo the model so the notebook shows its layers as the cell output.
model

In [None]:
class NEGLoss(nn.Module):
    """Negative log-likelihood loss whose class weights come from sampled words.

    On every call a weight vector is built in which each target index and each
    randomly drawn "negative" index is incremented by one; that vector is passed
    as ``weight`` to ``F.nll_loss``.

    NOTE(review): ``F.nll_loss`` only reads the log-probabilities at the target
    indices, so the sampled negatives appear to influence the result only
    through the weights of the targets - confirm this is the intended objective.
    """

    def __init__(self, ix_to_word, word_freqs, num_negative_samples=5,):
        """
        Args:
            ix_to_word: Mapping from index ``0..V-1`` to word.
            word_freqs: Mapping from word to its frequency.
            num_negative_samples: Number of negatives drawn per forward call.
        """
        super(NEGLoss, self).__init__()
        self.num_negative_samples = num_negative_samples
        self.num_words = len(ix_to_word)
        # Size the distribution from ix_to_word so it always lines up with the
        # weight vector built in sample().
        freqs = torch.tensor(
            [float(word_freqs[ix_to_word[i]]) for i in range(self.num_words)],
            dtype=torch.float,
        )
        # Frequencies raised to the 0.75 power, then L2-normalised; used as
        # sampling weights for torch.multinomial.
        self.distr = F.normalize(freqs.pow(0.75), dim=0)

    def sample(self, num_samples, positives=()):
        """Build the weight column for one loss evaluation.

        Args:
            num_samples: Number of negative indices to draw.
            positives: Iterable of integer word indices that must not be drawn
                as negatives; each occurrence adds 1.0 to that index's weight.

        Returns:
            Float tensor of shape ``(num_words, 1)``.

        Raises:
            ValueError: If negatives are requested but every word index is a
                positive, so no negative could ever be drawn.
        """
        positive_ids = [int(p) for p in positives]
        excluded = set(positive_ids)
        if num_samples > 0 and len(excluded) >= self.num_words:
            raise ValueError(
                "cannot draw negative samples: every word index is a positive"
            )
        weights = torch.zeros((self.num_words, 1))
        for ix in positive_ids:
            weights[ix] += 1.0
        for _ in range(num_samples):
            # Redraw until the candidate is not one of the positives.
            candidate = int(torch.multinomial(self.distr, 1))
            while candidate in excluded:
                candidate = int(torch.multinomial(self.distr, 1))
            weights[candidate] += 1.0
        return weights

    def forward(self, input, target):
        """Compute the weighted negative log-likelihood.

        Args:
            input: Log-probabilities of shape ``(N, num_words)``.
            target: LongTensor of ``N`` target word indices.

        Returns:
            Scalar loss tensor from ``F.nll_loss``.
        """
        positives = target.detach().reshape(-1).tolist()
        weights = self.sample(self.num_negative_samples, positives=positives)
        # nll_loss expects a 1-D weight with one entry per class, on the same
        # device and with the same dtype as the log-probabilities.
        weights = weights.reshape(-1).to(device=input.device, dtype=input.dtype)
        return F.nll_loss(input, target, weight=weights)

In [None]:
# The loss draws its negative samples from the corpus word counts in `vocab`.
loss_function = NEGLoss(ix_to_word,vocab)
# Plain SGD over every parameter of the model.
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Training

In [None]:
# Train for EPOCHS passes over training_data, taking one optimizer step per
# pair. `losses` collects the summed loss of each epoch (one 1-element tensor
# per epoch).
losses = []
for epoch in range(EPOCHS):
    total_loss = torch.Tensor([0])
    print("Beginning epoch %d" % epoch)
    progress_bar = progressbar.ProgressBar()
    # Each pair is ([centre word index], neighbour index): `context` is the
    # model input and `target` is the neighbouring word to be predicted.
    for context, target in progress_bar(training_data):
        # Plain tensors take part in autograd directly; the deprecated
        # autograd.Variable wrapper is not needed.
        context_var = torch.LongTensor(context)
        target_var = torch.LongTensor([target])
        model.zero_grad()
        log_probs = model(context_var)
        loss = loss_function(log_probs, target_var)
        loss.backward()
        optimizer.step()
        # detach() keeps the running total out of the autograd graph.
        total_loss += loss.detach()
    print("Epoch %d Loss: %.5f" % (epoch, total_loss.item()))
    losses.append(total_loss)

In [None]:
# NOTE(review): `indices` is not used anywhere in this cell - confirm whether a later cell needs it.
indices = np.random.choice(np.arange(len(vocab)), size=100, replace=True)

## The commented-out tuples show which words in each flat list are meant as pairs.

#syns = [('chand','kuch'),('dhool','matti'),('australia','international'),('baarisho','water'),('workers','mazdoor'),
 #    ('diye','paish'),('mehnat','taraqqi'),('chairman','chancellor'),('officer','chairman'),('maqam','elaqon'),('senetor',
    #'wazir'), ('zimmay','akhrajaat'),('kyunki','kyunkay'),('hamaray','hum'),('gaind','baal'),('London','Bartanwi')]

syns = ['chand','kuch','dhool','matti','Australia','international','baarisho','water','workers','mazdoor',
     'diye','paish','mehnat','taraqqi','chairman','chancellor','officer','chairman','maqam','elaqon','senetor','wazir',
       'zimmay','akhrajaat','kyunki','kyunkay','hamaray','hum','gaind','baal','London','Bartanwi']

#analogies = [('australia','international'),('water','atlantic'),('mayor','belgium'),('university','oxford'),
#             ('darkhwast','article'),('pakhton','Afghan'),('relief','water'),('mumalik','belgium'),('mayor','himayat'),
#             ('college','principal'),('news','ghalat'),('naam','Ahmed'),('wet','water')]

analogies = ['Australia','international','water','atlantic','mayor','belgium','university','oxford',
             'darkhwast','article','pakhton','Afghan','relief','water','mumalik','belgium','mayor','himayat',
             'college','principal','naam','Ahmed']

#misspelled = [('lout','out'),('elaqon','ilaqon'),('Diasel','diesel'),('kyunki','kyunkay'),('neh','nay','ne'),('hai','hain')]

misspelled =['lout','out','elaqon','ilaqon','Diasel','diesel','kyunki','kyunkay','neh','nay','ne','hai','hain']


# Word group to plot: set this to `analogies`, `syns` or `misspelled`.
words_to_plot = analogies

for word in words_to_plot:
    # Every word must be in the corpus vocabulary; a missing word raises KeyError here.
    word_ix = torch.LongTensor([word_to_ix[word]])
    vec = model.embeddings(word_ix).detach()[0]
    # Only the first two embedding dimensions are used as plot coordinates;
    # they are converted to Python floats before being handed to matplotlib.
    x, y = vec[0].item(), vec[1].item()
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2),textcoords='offset points', ha='right', va='bottom')

# Write the finished figure once instead of re-saving it after every point;
# kept before plt.show(), as in the original ordering.
plt.savefig("graph.png")

plt.show()