In [0]:
from __future__ import division
import argparse
import pandas as pd
import spacy as sp
from tqdm import tqdm
# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize

In [0]:
nlp = sp.load("en_core_web_sm")

In [0]:
def text2sentences(path):
    sentences = []
    string=''
    with open(path,encoding="utf8") as f:
        content=f.read()
        docs_raw = content.splitlines()
        for l in tqdm(docs_raw):
            x=nlp(l.lower())
            string_tokens = [token.orth_ for token in x if not token.is_punct]
            sentences.append(string_tokens)
        return sentences

In [0]:

path='/content/news.en-00001-of-00100'
sentences = text2sentences(path)

100%|██████████| 15128/15128 [02:24<00:00, 104.92it/s]


In [0]:
sentences = sentences[:5000]

In [0]:
class SkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize = 5, minCount = 5):
        self.vocab={}
        for line in sentences:
          for word in line:
            if word not in self.vocab:
                self.vocab[word]=1
        self.w2id = dict((i,word) for word,i in enumerate(self.vocab))
        self.trainset=dict((i,line) for i,line in enumerate(sentences))
        self.weight_1=np.random.uniform(-1,1,(len(self.vocab.keys()),nEmbed))
        self.weight_2=np.random.uniform(-1,1,(nEmbed,len(self.vocab.keys())))
        self.error = 0
        self.train() 
        
    def onehotcode(self,word):
      word_onehot= []
      count=len(self.vocab.keys())
      vec=np.zeros(count)
      pos=self.w2id[word]
      vec[pos]=1
      return vec

    def train(self):
        for counter,sentence in tqdm(self.trainset.items()):
            sentence = list(filter(lambda word: word in self.vocab, sentence))#check if all words of sentence are in vocab
            for wpos,word in tqdm(enumerate(sentence)):
                
                wIdx = self.w2id[word]
                winsize = 2 
                start = max(0, wpos - winsize)
                end = min(wpos + winsize + 1, len(sentence))
                word_vec=self.onehotcode(word)
                context_vec=[]
                train_vec=[]
                
                for context_word in sentence[start:end]: 
                    ctxtId = self.w2id[context_word]
                    if ctxtId == wIdx: continue
                    context_vec.append(self.onehotcode(context_word))
                train_vec.append([word_vec,context_vec])               
                self.trainWord(train_vec)#call here
    
    def trainWord(self, train_vec):
        for i in (range(5)):
            self.error = 0
            for word,context in train_vec:
                pred,h,o=self.forward(word)
                for contextvec in context:
                    subarray=pred-contextvec
                    self.error+=subarray
                self.backprop(h,word,self.error)
        
                    
    def backprop(self,h,word,error):
        up1=np.outer(h,error)
        up2=np.outer(word,np.dot(self.weight_2,error.T))
        self.weight_1=self.weight_1-(0.2*up2)
        self.weight_2=self.weight_2-(0.2*up1)
                
    def similarity(self,word1,word2):
        vec1=self.weight_1[self.w2id[word1]]
        vec2=self.weight_1[self.w2id[word2]]
        vec_sum=np.dot(vec1,vec2)
        vec_norm=np.linalg.norm(vec1)*np.linalg.norm(vec2)
        cosine_dist=vec_sum/vec_norm
        return(cosine_dist)                                     
                              
    def forward(self,w):
        hidden=np.dot(self.weight_1.T,w)
        output=np.dot(self.weight_2.T,hidden)
        pred=self.softmax(output)
        return(pred,hidden,output)
        
        
    def softmax(self,x):
        z=np.exp(x-np.max(x))
        return(z/z.sum())                
    

In [0]:
sg = SkipGram(sentences)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m





24it [00:02,  8.74it/s][A[A[A[A[A[A





25it [00:02,  8.74it/s][A[A[A[A[A[A





26it [00:02,  8.66it/s][A[A[A[A[A[A





27it [00:03,  8.65it/s][A[A[A[A[A[A





28it [00:03,  8.43it/s][A[A[A[A[A[A





29it [00:03,  8.47it/s][A[A[A[A[A[A





30it [00:03,  8.50it/s][A[A[A[A[A[A





31it [00:03,  8.64it/s][A[A[A[A[A[A





32it [00:03,  8.77it/s][A[A[A[A[A[A





33it [00:03,  8.90it/s][A[A[A[A[A[A





34it [00:03,  8.93it/s][A[A[A[A[A[A





35it [00:03,  9.02it/s][A[A[A[A[A[A





36it [00:04,  9.01it/s][A[A[A[A[A[A





37it [00:04,  8.84it/s][A[A[A[A[A[A





[A[A[A[A[A[A




 15%|█▌        | 758/5000 [31:29<3:03:16,  2.59s/it][A[A[A[A[A





0it [00:00, ?it/s][A[A[A[A[A[A





1it [00:00,  8.92it/s][A[A[A[A[A[A





2it [00:00,  8.96it/s][A[A[A[A[A[A





3it [00:00,  8.99it/s][A[A[A[A[A[A



AttributeError: ignored