# In [None]:
import numpy as np
from collections import defaultdict
np.random.seed(1)
NUM = "NNNUMMM"

def readFile(fstream):
    """Collect the first field of every data line into one space-joined string.

    Each line of ``fstream`` is stripped of surrounding whitespace. Blank
    lines and lines starting with ``-DOCSTART-`` contribute no token; they
    only flush the tokens gathered so far into the result. For every other
    line, only the first whitespace-separated field is kept, and a field
    consisting solely of digits (``str.isdigit``) is replaced by the
    module-level ``NUM`` placeholder.

    Args:
        fstream: Any iterable of text lines (for example an open file).

    Returns:
        All kept tokens, in input order, joined by single spaces. An input
        without data lines yields the empty string.
    """
    collected = []
    pending = []
    for raw_line in fstream:
        stripped = raw_line.strip()
        # Separator line: move the buffered tokens to the output and restart.
        if not stripped or stripped.startswith("-DOCSTART-"):
            collected += pending
            pending = []
            continue
        first_field = stripped.split()[0]
        pending.append(NUM if first_field.isdigit() else first_field)
    # Input may end without a trailing separator line.
    collected += pending
    return " ".join(collected)

# Read the training file into one space-joined token string. The context
# manager closes the file handle as soon as reading is done (the previous
# bare open() call never closed it).
with open('train.txt', 'r') as train_file:
    text = readFile(train_file)
# The whole file is treated as a single "sentence": one list of lower-cased
# tokens wrapped in an outer list. Lower-casing also applies to the NUM
# placeholder that readFile substitutes for all-digit tokens.
corpus = [[word.lower() for word in text.split()]]


# Hyperparameters read by word2vec.__init__.
settings = {
    'window_size': 1,       # context words taken on each side of the target
    'n': 50,                # width of the hidden layer, i.e. the embedding size
    'epochs': 20,           # number of full passes over the training pairs
    'learning_rate': 0.01,  # factor applied to each gradient before the update
}
# window_size > 1 takes more training time


class word2vec():
    """Small skip-gram style word2vec model trained with a full softmax.

    The model has two weight matrices: ``w1`` (vocabulary x n) maps a one-hot
    target word to a hidden vector, and ``w2`` (n x vocabulary) maps that
    hidden vector to one score per vocabulary word. Training pushes the
    softmax of those scores towards the context words of each target word.
    """

    def __init__(self, config=None):
        """Store the hyperparameters.

        Args:
            config: Optional mapping with the keys ``'n'``, ``'learning_rate'``,
                ``'epochs'`` and ``'window_size'``. When omitted, the
                module-level ``settings`` dict is used.
        """
        cfg = settings if config is None else config
        self.n = cfg['n']
        self.lr = cfg['learning_rate']
        self.epochs = cfg['epochs']
        self.window = cfg['window_size']

    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Sets ``v_count``, ``words_list``, ``word_index`` and ``index_word`` on
        the instance; word ids follow first-occurrence order in ``corpus``.

        Args:
            settings: Unused; kept for call compatibility. The window size is
                taken from ``self.window``.
            corpus: Iterable of sentences, each a sequence of word strings.

        Returns:
            An object array of shape ``(N, 2)`` with one row per token. Column
            0 holds the one-hot list of the target word, column 1 the list of
            one-hot lists of the words within ``self.window`` positions of it
            (fewer near the sentence edges, possibly none).
        """
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        self.v_count = len(word_counts)
        self.words_list = list(word_counts)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        samples = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                # Clamp the window to the sentence instead of testing every j.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                samples.append((self.word2onehot(word), w_context))

        # The pairs are ragged (edge tokens have fewer context words), so
        # np.array(samples) would raise on current NumPy. Fill an object
        # array cell by cell so every cell keeps its original Python list.
        training_data = np.empty((len(samples), 2), dtype=object)
        for row_id, (w_target, w_context) in enumerate(samples):
            training_data[row_id, 0] = w_target
            training_data[row_id, 1] = w_context
        return training_data

    def word2onehot(self, word):
        """Return the one-hot encoding of ``word`` as a list of ``v_count`` ints.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        word_vec = [0] * self.v_count
        word_vec[self.word_index[word]] = 1
        return word_vec

    def train(self, training_data):
        """Fit ``w1`` and ``w2`` with per-sample gradient descent.

        Both matrices are re-initialised uniformly in [-1, 1) on every call.
        ``self.loss`` holds the summed loss of the most recent epoch.

        Args:
            training_data: Iterable of ``(target_onehot, context_onehots)``
                pairs as produced by ``generate_training_data``.

        Returns:
            The tuple ``(w1, w2)`` of trained weight matrices.
        """
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
        for _ in range(self.epochs):
            self.loss = 0
            for w_t, w_c in training_data:
                # A target without context words carries no error signal and
                # adds nothing to the loss, so skip it explicitly.
                if len(w_c) == 0:
                    continue
                y_pred, h, u = self.forward_pass(w_t)
                # Prediction error summed over all context words.
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                self.backprop(EI, h, w_t)

                # Stable log-sum-exp: shift by the max score so np.exp cannot
                # overflow, mirroring what softmax() does.
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
                # argmax finds the hot position for lists and arrays alike.
                context_ids = [int(np.argmax(word)) for word in w_c]
                self.loss += -np.sum(u[context_ids]) + len(w_c) * log_norm

        return self.w1, self.w2

    def forward_pass(self, x):
        """Run one input vector through the network.

        Args:
            x: Input vector of length ``v_count`` (a one-hot word in training).

        Returns:
            ``(y_c, h, u)``: softmax probabilities, hidden vector and raw
            output scores.
        """
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0.

        The maximum is subtracted first to avoid overflow in ``np.exp``.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: Summed prediction error over the context words.
            h: Hidden vector from the forward pass.
            x: Input vector that produced ``h``.
        """
        # Both gradients are computed from the pre-update weights.
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def word_vec(self, word):
        """Return the row of ``w1`` that belongs to ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w


# Build the model from the module-level `settings`, turn the corpus into
# (target, context) one-hot training pairs, then fit the two weight matrices.
w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)
# w1 has one row per vocabulary word (word2vec.word_vec reads from it);
# w2 is the hidden-to-output matrix.
w1, w2 = w2v.train(training_data)