In [1]:
import pandas as pd 
import numpy as np
import re
from nltk.tokenize import word_tokenize
from collections import defaultdict
import random
from ast import literal_eval

# Pre-coded

In [16]:
class SkipGramNumpy():
    def __init__ (self):
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']
    
    
    # GENERATE TRAINING DATA
    def generate_training_data(self, settings, corpus):
        # GENERATE WORD COUNTS
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        self.v_count = len(word_counts.keys())

        # GENERATE LOOKUP DICTIONARIES
        self.words_list = sorted(list(word_counts.keys()),reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

        training_data = []
        # CYCLE THROUGH EACH SENTENCE IN CORPUS
        for sentence in corpus:
            sent_len = len(sentence)

            # CYCLE THROUGH EACH WORD IN SENTENCE
            for i, word in enumerate(sentence):
                
                #w_target  = sentence[i]
                w_target = self.word2onehot(sentence[i])

                # CYCLE THROUGH CONTEXT WINDOW
                w_context = []
                for j in range(i-self.window, i+self.window+1):
                    if j!=i and j<=sent_len-1 and j>=0:
                        w_context.append(self.word2onehot(sentence[j]))
                training_data.append([w_target, w_context])
        return np.array(training_data)

    # CONVERT WORD TO ONE HOT ENCODING
    def word2onehot(self, word):
        word_vec = [0 for i in range(0, self.v_count)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec


    # FORWARD PASS
    def forward_pass(self, x):
        #### FILL THE BLANK(S) ####
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = softmax(u)
        return y_c, h, u
        ############## END
                

    # BACKPROPAGATION
    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)  
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        ############## YOUR CODE HERE
        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)
        ############## END


    # TRAIN W2V model
    def train(self, training_data):
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix
        
        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:
                
                #### FILL THE BLANK(S) ####             
                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)
                
                
                # CALCULATE ERROR
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
                
                ############## END
            print('EPOCH:',i, 'LOSS:', self.loss)


    # input a word, returns a vector (if available)
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w


    # input word, returns top [n] most similar words
    def word_sim(self, word, top_n):
        
        w1_index = self.word_index[word]
        v_w1 = self.w1[w1_index]

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            
            #### Cosine distance
            v_w2 = self.w1[i]
            cosine_sim = cosine_similarity(v_w1, v_w2)
            
            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = [(k,v) for k, v in sorted(word_sim.items(), key=lambda item: item[1], reverse=True)]
        for word, sim in words_sorted[:top_n]:
            print(word, sim)

# To fill in 

In [18]:
settings = {}
settings['n'] = 5                   # dimension of word embeddings
settings['window_size'] = 2         # context window +/- center word
settings['min_count'] = 0           # minimum word count
settings['epochs'] = 5000           # number of training epochs
settings['neg_samp'] = 10           # number of negative words to use during training
settings['learning_rate'] = 0.01    # learning rate
np.random.seed(0)                   # set the seed for reproducibility

corpus = [["learning", "how", "to" "code", "word2vec", "from", "scratch", "is", "fun"]]

# INITIALIZE W2V MODEL
w2v = SkipGramNumpy()

# generate training data
training_data = w2v.generate_training_data(settings, corpus)

# train word2vec model
w2v.train(training_data)

EPOCH: 0 LOSS: 56.10975693201351
EPOCH: 1 LOSS: 55.64365621693948
EPOCH: 2 LOSS: 55.20063301855599
EPOCH: 3 LOSS: 54.77855531413216
EPOCH: 4 LOSS: 54.37554041060348
EPOCH: 5 LOSS: 53.98992111137397
EPOCH: 6 LOSS: 53.620217186485725
EPOCH: 7 LOSS: 53.26511120910848
EPOCH: 8 LOSS: 52.923428001962314
EPOCH: 9 LOSS: 52.59411708071632
EPOCH: 10 LOSS: 52.276237595669215
EPOCH: 11 LOSS: 51.96894536437658
EPOCH: 12 LOSS: 51.671481661210905
EPOCH: 13 LOSS: 51.383163488909744
EPOCH: 14 LOSS: 51.10337510493809
EPOCH: 15 LOSS: 50.83156061427385
EPOCH: 16 LOSS: 50.56721747183114
EPOCH: 17 LOSS: 50.30989076359163
EPOCH: 18 LOSS: 50.059168156743674
EPOCH: 19 LOSS: 49.814675426625556
EPOCH: 20 LOSS: 49.57607248274033
EPOCH: 21 LOSS: 49.34304982812193
EPOCH: 22 LOSS: 49.11532539633568
EPOCH: 23 LOSS: 48.89264171875748
EPOCH: 24 LOSS: 48.67476338178312
EPOCH: 25 LOSS: 48.46147473951138
EPOCH: 26 LOSS: 48.25257785241188
EPOCH: 27 LOSS: 48.04789062668627
EPOCH: 28 LOSS: 47.847245132586195
EPOCH: 29 LOSS: 

EPOCH: 297 LOSS: 32.45476058848301
EPOCH: 298 LOSS: 32.44927297296244
EPOCH: 299 LOSS: 32.44384103701361
EPOCH: 300 LOSS: 32.43846401114689
EPOCH: 301 LOSS: 32.43314113911417
EPOCH: 302 LOSS: 32.42787167763514
EPOCH: 303 LOSS: 32.422654896130595
EPOCH: 304 LOSS: 32.417490076462244
EPOCH: 305 LOSS: 32.41237651267893
EPOCH: 306 LOSS: 32.40731351076914
EPOCH: 307 LOSS: 32.402300388419434
EPOCH: 308 LOSS: 32.39733647477886
EPOCH: 309 LOSS: 32.39242111022891
EPOCH: 310 LOSS: 32.38755364615911
EPOCH: 311 LOSS: 32.38273344474783
EPOCH: 312 LOSS: 32.37795987874841
EPOCH: 313 LOSS: 32.373232331280235
EPOCH: 314 LOSS: 32.36855019562472
EPOCH: 315 LOSS: 32.36391287502612
EPOCH: 316 LOSS: 32.35931978249685
EPOCH: 317 LOSS: 32.35477034062738
EPOCH: 318 LOSS: 32.35026398140048
EPOCH: 319 LOSS: 32.34580014600967
EPOCH: 320 LOSS: 32.34137828468188
EPOCH: 321 LOSS: 32.336997856503984
EPOCH: 322 LOSS: 32.33265832925338
EPOCH: 323 LOSS: 32.32835917923231
EPOCH: 324 LOSS: 32.32409989110586
EPOCH: 325 LOSS

EPOCH: 545 LOSS: 31.873917584828007
EPOCH: 546 LOSS: 31.872976679805987
EPOCH: 547 LOSS: 31.872040536577693
EPOCH: 548 LOSS: 31.871109121051866
EPOCH: 549 LOSS: 31.87018239945051
EPOCH: 550 LOSS: 31.869260338305338
EPOCH: 551 LOSS: 31.868342904454337
EPOCH: 552 LOSS: 31.86743006503837
EPOCH: 553 LOSS: 31.86652178749783
EPOCH: 554 LOSS: 31.865618039569263
EPOCH: 555 LOSS: 31.864718789282165
EPOCH: 556 LOSS: 31.863824004955717
EPOCH: 557 LOSS: 31.86293365519557
EPOCH: 558 LOSS: 31.862047708890778
EPOCH: 559 LOSS: 31.861166135210595
EPOCH: 560 LOSS: 31.860288903601486
EPOCH: 561 LOSS: 31.85941598378401
EPOCH: 562 LOSS: 31.85854734574994
EPOCH: 563 LOSS: 31.85768295975923
EPOCH: 564 LOSS: 31.856822796337113
EPOCH: 565 LOSS: 31.85596682627125
EPOCH: 566 LOSS: 31.855115020608913
EPOCH: 567 LOSS: 31.854267350654077
EPOCH: 568 LOSS: 31.853423787964765
EPOCH: 569 LOSS: 31.852584304350273
EPOCH: 570 LOSS: 31.851748871868452
EPOCH: 571 LOSS: 31.85091746282307
EPOCH: 572 LOSS: 31.85009004976117
EP

EPOCH: 818 LOSS: 31.7236365990767
EPOCH: 819 LOSS: 31.723320534430165
EPOCH: 820 LOSS: 31.72300547632412
EPOCH: 821 LOSS: 31.72269142017162
EPOCH: 822 LOSS: 31.722378361412765
EPOCH: 823 LOSS: 31.722066295514594
EPOCH: 824 LOSS: 31.721755217970756
EPOCH: 825 LOSS: 31.72144512430142
EPOCH: 826 LOSS: 31.72113601005305
EPOCH: 827 LOSS: 31.720827870798225
EPOCH: 828 LOSS: 31.72052070213542
EPOCH: 829 LOSS: 31.720214499688886
EPOCH: 830 LOSS: 31.719909259108377
EPOCH: 831 LOSS: 31.71960497606906
EPOCH: 832 LOSS: 31.719301646271266
EPOCH: 833 LOSS: 31.718999265440324
EPOCH: 834 LOSS: 31.71869782932642
EPOCH: 835 LOSS: 31.718397333704374
EPOCH: 836 LOSS: 31.718097774373483
EPOCH: 837 LOSS: 31.717799147157372
EPOCH: 838 LOSS: 31.717501447903764
EPOCH: 839 LOSS: 31.71720467248437
EPOCH: 840 LOSS: 31.716908816794692
EPOCH: 841 LOSS: 31.716613876753833
EPOCH: 842 LOSS: 31.716319848304376
EPOCH: 843 LOSS: 31.71602672741218
EPOCH: 844 LOSS: 31.715734510066255
EPOCH: 845 LOSS: 31.715443192278574
EPO

EPOCH: 1161 LOSS: 31.653956550701597
EPOCH: 1162 LOSS: 31.653827408513333
EPOCH: 1163 LOSS: 31.653698544575793
EPOCH: 1164 LOSS: 31.653569958025706
EPOCH: 1165 LOSS: 31.653441648003266
EPOCH: 1166 LOSS: 31.653313613652188
EPOCH: 1167 LOSS: 31.65318585411957
EPOCH: 1168 LOSS: 31.653058368556003
EPOCH: 1169 LOSS: 31.652931156115454
EPOCH: 1170 LOSS: 31.6528042159553
EPOCH: 1171 LOSS: 31.6526775472363
EPOCH: 1172 LOSS: 31.652551149122623
EPOCH: 1173 LOSS: 31.652425020781706
EPOCH: 1174 LOSS: 31.652299161384377
EPOCH: 1175 LOSS: 31.65217357010477
EPOCH: 1176 LOSS: 31.652048246120298
EPOCH: 1177 LOSS: 31.651923188611697
EPOCH: 1178 LOSS: 31.651798396762935
EPOCH: 1179 LOSS: 31.651673869761254
EPOCH: 1180 LOSS: 31.651549606797115
EPOCH: 1181 LOSS: 31.651425607064205
EPOCH: 1182 LOSS: 31.651301869759433
EPOCH: 1183 LOSS: 31.65117839408288
EPOCH: 1184 LOSS: 31.65105517923778
EPOCH: 1185 LOSS: 31.65093222443059
EPOCH: 1186 LOSS: 31.650809528870823
EPOCH: 1187 LOSS: 31.650687091771186
EPOCH: 118

EPOCH: 1430 LOSS: 31.627065927780627
EPOCH: 1431 LOSS: 31.626988892400455
EPOCH: 1432 LOSS: 31.62691198907549
EPOCH: 1433 LOSS: 31.626835217478913
EPOCH: 1434 LOSS: 31.62675857728489
EPOCH: 1435 LOSS: 31.626682068168716
EPOCH: 1436 LOSS: 31.626605689806674
EPOCH: 1437 LOSS: 31.62652944187611
EPOCH: 1438 LOSS: 31.626453324055404
EPOCH: 1439 LOSS: 31.626377336024
EPOCH: 1440 LOSS: 31.626301477462313
EPOCH: 1441 LOSS: 31.626225748051816
EPOCH: 1442 LOSS: 31.62615014747503
EPOCH: 1443 LOSS: 31.626074675415445
EPOCH: 1444 LOSS: 31.625999331557598
EPOCH: 1445 LOSS: 31.62592411558703
EPOCH: 1446 LOSS: 31.625849027190256
EPOCH: 1447 LOSS: 31.625774066054852
EPOCH: 1448 LOSS: 31.625699231869348
EPOCH: 1449 LOSS: 31.625624524323264
EPOCH: 1450 LOSS: 31.625549943107153
EPOCH: 1451 LOSS: 31.62547548791251
EPOCH: 1452 LOSS: 31.625401158431835
EPOCH: 1453 LOSS: 31.625326954358606
EPOCH: 1454 LOSS: 31.625252875387247
EPOCH: 1455 LOSS: 31.6251789212132
EPOCH: 1456 LOSS: 31.625105091532838
EPOCH: 1457 

EPOCH: 1728 LOSS: 31.608801295803673
EPOCH: 1729 LOSS: 31.608752699955836
EPOCH: 1730 LOSS: 31.608704171942215
EPOCH: 1731 LOSS: 31.608655711625822
EPOCH: 1732 LOSS: 31.60860731887002
EPOCH: 1733 LOSS: 31.60855899353853
EPOCH: 1734 LOSS: 31.608510735495436
EPOCH: 1735 LOSS: 31.608462544605175
EPOCH: 1736 LOSS: 31.608414420732558
EPOCH: 1737 LOSS: 31.608366363742718
EPOCH: 1738 LOSS: 31.60831837350116
EPOCH: 1739 LOSS: 31.60827044987377
EPOCH: 1740 LOSS: 31.60822259272673
EPOCH: 1741 LOSS: 31.60817480192663
EPOCH: 1742 LOSS: 31.608127077340356
EPOCH: 1743 LOSS: 31.608079418835196
EPOCH: 1744 LOSS: 31.608031826278747
EPOCH: 1745 LOSS: 31.60798429953895
EPOCH: 1746 LOSS: 31.607936838484118
EPOCH: 1747 LOSS: 31.6078894429829
EPOCH: 1748 LOSS: 31.60784211290428
EPOCH: 1749 LOSS: 31.607794848117585
EPOCH: 1750 LOSS: 31.60774764849249
EPOCH: 1751 LOSS: 31.607700513898997
EPOCH: 1752 LOSS: 31.607653444207457
EPOCH: 1753 LOSS: 31.607606439288567
EPOCH: 1754 LOSS: 31.60755949901333
EPOCH: 1755 L

EPOCH: 1993 LOSS: 31.597938146053387
EPOCH: 1994 LOSS: 31.597903645928174
EPOCH: 1995 LOSS: 31.59786918710625
EPOCH: 1996 LOSS: 31.597834769515966
EPOCH: 1997 LOSS: 31.597800393085855
EPOCH: 1998 LOSS: 31.5977660577446
EPOCH: 1999 LOSS: 31.597731763421052
EPOCH: 2000 LOSS: 31.597697510044227
EPOCH: 2001 LOSS: 31.597663297543292
EPOCH: 2002 LOSS: 31.59762912584756
EPOCH: 2003 LOSS: 31.597594994886535
EPOCH: 2004 LOSS: 31.59756090458987
EPOCH: 2005 LOSS: 31.59752685488736
EPOCH: 2006 LOSS: 31.59749284570897
EPOCH: 2007 LOSS: 31.597458876984838
EPOCH: 2008 LOSS: 31.59742494864522
EPOCH: 2009 LOSS: 31.597391060620595
EPOCH: 2010 LOSS: 31.597357212841516
EPOCH: 2011 LOSS: 31.59732340523876
EPOCH: 2012 LOSS: 31.597289637743216
EPOCH: 2013 LOSS: 31.597255910285966
EPOCH: 2014 LOSS: 31.597222222798212
EPOCH: 2015 LOSS: 31.597188575211334
EPOCH: 2016 LOSS: 31.59715496745684
EPOCH: 2017 LOSS: 31.597121399466428
EPOCH: 2018 LOSS: 31.59708787117193
EPOCH: 2019 LOSS: 31.597054382505327
EPOCH: 2020 

EPOCH: 2221 LOSS: 31.59101713059183
EPOCH: 2222 LOSS: 31.590990467050762
EPOCH: 2223 LOSS: 31.590963831937813
EPOCH: 2224 LOSS: 31.590937225209053
EPOCH: 2225 LOSS: 31.59091064682061
EPOCH: 2226 LOSS: 31.590884096728697
EPOCH: 2227 LOSS: 31.59085757488964
EPOCH: 2228 LOSS: 31.590831081259857
EPOCH: 2229 LOSS: 31.590804615795818
EPOCH: 2230 LOSS: 31.5907781784541
EPOCH: 2231 LOSS: 31.59075176919139
EPOCH: 2232 LOSS: 31.59072538796441
EPOCH: 2233 LOSS: 31.590699034730036
EPOCH: 2234 LOSS: 31.59067270944516
EPOCH: 2235 LOSS: 31.590646412066828
EPOCH: 2236 LOSS: 31.59062014255212
EPOCH: 2237 LOSS: 31.59059390085823
EPOCH: 2238 LOSS: 31.59056768694242
EPOCH: 2239 LOSS: 31.59054150076207
EPOCH: 2240 LOSS: 31.590515342274603
EPOCH: 2241 LOSS: 31.590489211437564
EPOCH: 2242 LOSS: 31.590463108208553
EPOCH: 2243 LOSS: 31.590437032545303
EPOCH: 2244 LOSS: 31.59041098440555
EPOCH: 2245 LOSS: 31.590384963747184
EPOCH: 2246 LOSS: 31.59035897052817
EPOCH: 2247 LOSS: 31.590333004706558
EPOCH: 2248 LOS

EPOCH: 2461 LOSS: 31.585343914381017
EPOCH: 2462 LOSS: 31.585322989062917
EPOCH: 2463 LOSS: 31.585302083745077
EPOCH: 2464 LOSS: 31.585281198399763
EPOCH: 2465 LOSS: 31.58526033299929
EPOCH: 2466 LOSS: 31.585239487515985
EPOCH: 2467 LOSS: 31.585218661922298
EPOCH: 2468 LOSS: 31.585197856190653
EPOCH: 2469 LOSS: 31.585177070293597
EPOCH: 2470 LOSS: 31.585156304203643
EPOCH: 2471 LOSS: 31.58513555789343
EPOCH: 2472 LOSS: 31.585114831335595
EPOCH: 2473 LOSS: 31.58509412450286
EPOCH: 2474 LOSS: 31.58507343736798
EPOCH: 2475 LOSS: 31.585052769903744
EPOCH: 2476 LOSS: 31.58503212208302
EPOCH: 2477 LOSS: 31.585011493878717
EPOCH: 2478 LOSS: 31.584990885263778
EPOCH: 2479 LOSS: 31.58497029621121
EPOCH: 2480 LOSS: 31.584949726694056
EPOCH: 2481 LOSS: 31.584929176685414
EPOCH: 2482 LOSS: 31.584908646158446
EPOCH: 2483 LOSS: 31.584888135086317
EPOCH: 2484 LOSS: 31.584867643442294
EPOCH: 2485 LOSS: 31.584847171199677
EPOCH: 2486 LOSS: 31.584826718331783
EPOCH: 2487 LOSS: 31.584806284812014
EPOCH: 

EPOCH: 2712 LOSS: 31.580654242178902
EPOCH: 2713 LOSS: 31.58063758155228
EPOCH: 2714 LOSS: 31.58062093529076
EPOCH: 2715 LOSS: 31.580604303376326
EPOCH: 2716 LOSS: 31.580587685790995
EPOCH: 2717 LOSS: 31.580571082516883
EPOCH: 2718 LOSS: 31.580554493536074
EPOCH: 2719 LOSS: 31.580537918830693
EPOCH: 2720 LOSS: 31.580521358382892
EPOCH: 2721 LOSS: 31.580504812174873
EPOCH: 2722 LOSS: 31.580488280188842
EPOCH: 2723 LOSS: 31.580471762407065
EPOCH: 2724 LOSS: 31.58045525881179
EPOCH: 2725 LOSS: 31.58043876938534
EPOCH: 2726 LOSS: 31.58042229411005
EPOCH: 2727 LOSS: 31.580405832968275
EPOCH: 2728 LOSS: 31.580389385942382
EPOCH: 2729 LOSS: 31.580372953014844
EPOCH: 2730 LOSS: 31.580356534168068
EPOCH: 2731 LOSS: 31.58034012938455
EPOCH: 2732 LOSS: 31.580323738646783
EPOCH: 2733 LOSS: 31.58030736193731
EPOCH: 2734 LOSS: 31.58029099923869
EPOCH: 2735 LOSS: 31.58027465053351
EPOCH: 2736 LOSS: 31.5802583158044
EPOCH: 2737 LOSS: 31.580241995033987
EPOCH: 2738 LOSS: 31.580225688204976
EPOCH: 2739 

EPOCH: 2992 LOSS: 31.57649190608006
EPOCH: 2993 LOSS: 31.576478658603854
EPOCH: 2994 LOSS: 31.57646542142306
EPOCH: 2995 LOSS: 31.576452194526034
EPOCH: 2996 LOSS: 31.57643897790119
EPOCH: 2997 LOSS: 31.576425771536925
EPOCH: 2998 LOSS: 31.57641257542165
EPOCH: 2999 LOSS: 31.57639938954382
EPOCH: 3000 LOSS: 31.57638621389189
EPOCH: 3001 LOSS: 31.576373048454325
EPOCH: 3002 LOSS: 31.576359893219646
EPOCH: 3003 LOSS: 31.576346748176327
EPOCH: 3004 LOSS: 31.576333613312904
EPOCH: 3005 LOSS: 31.57632048861793
EPOCH: 3006 LOSS: 31.576307374079967
EPOCH: 3007 LOSS: 31.57629426968757
EPOCH: 3008 LOSS: 31.576281175429344
EPOCH: 3009 LOSS: 31.576268091293905
EPOCH: 3010 LOSS: 31.576255017269872
EPOCH: 3011 LOSS: 31.5762419533459
EPOCH: 3012 LOSS: 31.576228899510625
EPOCH: 3013 LOSS: 31.576215855752736
EPOCH: 3014 LOSS: 31.57620282206095
EPOCH: 3015 LOSS: 31.57618979842395
EPOCH: 3016 LOSS: 31.576176784830466
EPOCH: 3017 LOSS: 31.576163781269248
EPOCH: 3018 LOSS: 31.57615078772904
EPOCH: 3019 LO

EPOCH: 3296 LOSS: 31.572890335507022
EPOCH: 3297 LOSS: 31.5728797523521
EPOCH: 3298 LOSS: 31.572869176624874
EPOCH: 3299 LOSS: 31.572858608317745
EPOCH: 3300 LOSS: 31.572848047423143
EPOCH: 3301 LOSS: 31.57283749393352
EPOCH: 3302 LOSS: 31.57282694784132
EPOCH: 3303 LOSS: 31.572816409139005
EPOCH: 3304 LOSS: 31.57280587781905
EPOCH: 3305 LOSS: 31.572795353873925
EPOCH: 3306 LOSS: 31.57278483729612
EPOCH: 3307 LOSS: 31.572774328078133
EPOCH: 3308 LOSS: 31.572763826212473
EPOCH: 3309 LOSS: 31.572753331691672
EPOCH: 3310 LOSS: 31.57274284450826
EPOCH: 3311 LOSS: 31.57273236465477
EPOCH: 3312 LOSS: 31.572721892123752
EPOCH: 3313 LOSS: 31.57271142690774
EPOCH: 3314 LOSS: 31.572700968999357
EPOCH: 3315 LOSS: 31.572690518391127
EPOCH: 3316 LOSS: 31.572680075075688
EPOCH: 3317 LOSS: 31.572669639045607
EPOCH: 3318 LOSS: 31.572659210293512
EPOCH: 3319 LOSS: 31.572648788812003
EPOCH: 3320 LOSS: 31.572638374593726
EPOCH: 3321 LOSS: 31.572627967631313
EPOCH: 3322 LOSS: 31.572617567917405
EPOCH: 332

EPOCH: 3597 LOSS: 31.570009147282306
EPOCH: 3598 LOSS: 31.57000049818113
EPOCH: 3599 LOSS: 31.569991854617868
EPOCH: 3600 LOSS: 31.56998321658739
EPOCH: 3601 LOSS: 31.56997458408452
EPOCH: 3602 LOSS: 31.569965957104117
EPOCH: 3603 LOSS: 31.569957335641035
EPOCH: 3604 LOSS: 31.569948719690146
EPOCH: 3605 LOSS: 31.56994010924634
EPOCH: 3606 LOSS: 31.569931504304456
EPOCH: 3607 LOSS: 31.56992290485941
EPOCH: 3608 LOSS: 31.569914310906086
EPOCH: 3609 LOSS: 31.569905722439366
EPOCH: 3610 LOSS: 31.569897139454184
EPOCH: 3611 LOSS: 31.569888561945426
EPOCH: 3612 LOSS: 31.569879989907996
EPOCH: 3613 LOSS: 31.569871423336856
EPOCH: 3614 LOSS: 31.56986286222689
EPOCH: 3615 LOSS: 31.569854306573077
EPOCH: 3616 LOSS: 31.569845756370327
EPOCH: 3617 LOSS: 31.569837211613585
EPOCH: 3618 LOSS: 31.569828672297813
EPOCH: 3619 LOSS: 31.56982013841797
EPOCH: 3620 LOSS: 31.56981160996903
EPOCH: 3621 LOSS: 31.56980308694594
EPOCH: 3622 LOSS: 31.569794569343706
EPOCH: 3623 LOSS: 31.56978605715728
EPOCH: 3624

EPOCH: 3901 LOSS: 31.567612970937038
EPOCH: 3902 LOSS: 31.567605793968873
EPOCH: 3903 LOSS: 31.56759862122161
EPOCH: 3904 LOSS: 31.567591452691634
EPOCH: 3905 LOSS: 31.56758428837532
EPOCH: 3906 LOSS: 31.567577128269072
EPOCH: 3907 LOSS: 31.56756997236929
EPOCH: 3908 LOSS: 31.56756282067239
EPOCH: 3909 LOSS: 31.567555673174738
EPOCH: 3910 LOSS: 31.567548529872788
EPOCH: 3911 LOSS: 31.567541390762933
EPOCH: 3912 LOSS: 31.567534255841593
EPOCH: 3913 LOSS: 31.567527125105205
EPOCH: 3914 LOSS: 31.567519998550175
EPOCH: 3915 LOSS: 31.56751287617294
EPOCH: 3916 LOSS: 31.567505757969954
EPOCH: 3917 LOSS: 31.56749864393763
EPOCH: 3918 LOSS: 31.567491534072417
EPOCH: 3919 LOSS: 31.567484428370772
EPOCH: 3920 LOSS: 31.56747732682913
EPOCH: 3921 LOSS: 31.567470229443966
EPOCH: 3922 LOSS: 31.567463136211714
EPOCH: 3923 LOSS: 31.567456047128868
EPOCH: 3924 LOSS: 31.567448962191857
EPOCH: 3925 LOSS: 31.567441881397173
EPOCH: 3926 LOSS: 31.567434804741303
EPOCH: 3927 LOSS: 31.567427732220708
EPOCH: 3

EPOCH: 4199 LOSS: 31.56564645135636
EPOCH: 4200 LOSS: 31.56564038836316
EPOCH: 4201 LOSS: 31.565634328671642
EPOCH: 4202 LOSS: 31.565628272279156
EPOCH: 4203 LOSS: 31.565622219183116
EPOCH: 4204 LOSS: 31.56561616938089
EPOCH: 4205 LOSS: 31.565610122869888
EPOCH: 4206 LOSS: 31.56560407964747
EPOCH: 4207 LOSS: 31.565598039711073
EPOCH: 4208 LOSS: 31.56559200305806
EPOCH: 4209 LOSS: 31.56558596968585
EPOCH: 4210 LOSS: 31.565579939591828
EPOCH: 4211 LOSS: 31.565573912773424
EPOCH: 4212 LOSS: 31.56556788922803
EPOCH: 4213 LOSS: 31.56556186895306
EPOCH: 4214 LOSS: 31.565555851945934
EPOCH: 4215 LOSS: 31.56554983820407
EPOCH: 4216 LOSS: 31.565543827724884
EPOCH: 4217 LOSS: 31.5655378205058
EPOCH: 4218 LOSS: 31.565531816544233
EPOCH: 4219 LOSS: 31.56552581583761
EPOCH: 4220 LOSS: 31.56551981838338
EPOCH: 4221 LOSS: 31.565513824178964
EPOCH: 4222 LOSS: 31.56550783322179
EPOCH: 4223 LOSS: 31.565501845509306
EPOCH: 4224 LOSS: 31.565495861038954
EPOCH: 4225 LOSS: 31.565489879808176
EPOCH: 4226 LOS

EPOCH: 4497 LOSS: 31.56397517082097
EPOCH: 4498 LOSS: 31.5639699867111
EPOCH: 4499 LOSS: 31.563964805229425
EPOCH: 4500 LOSS: 31.563959626373986
EPOCH: 4501 LOSS: 31.563954450142862
EPOCH: 4502 LOSS: 31.563949276534085
EPOCH: 4503 LOSS: 31.563944105545758
EPOCH: 4504 LOSS: 31.56393893717592
EPOCH: 4505 LOSS: 31.563933771422654
EPOCH: 4506 LOSS: 31.563928608284023
EPOCH: 4507 LOSS: 31.563923447758093
EPOCH: 4508 LOSS: 31.563918289842945
EPOCH: 4509 LOSS: 31.563913134536648
EPOCH: 4510 LOSS: 31.563907981837296
EPOCH: 4511 LOSS: 31.56390283174295
EPOCH: 4512 LOSS: 31.5638976842517
EPOCH: 4513 LOSS: 31.56389253936163
EPOCH: 4514 LOSS: 31.56388739707083
EPOCH: 4515 LOSS: 31.563882257377372
EPOCH: 4516 LOSS: 31.563877120279336
EPOCH: 4517 LOSS: 31.563871985774856
EPOCH: 4518 LOSS: 31.563866853861985
EPOCH: 4519 LOSS: 31.56386172453884
EPOCH: 4520 LOSS: 31.563856597803504
EPOCH: 4521 LOSS: 31.56385147365408
EPOCH: 4522 LOSS: 31.563846352088667
EPOCH: 4523 LOSS: 31.563841233105347
EPOCH: 4524 

EPOCH: 4802 LOSS: 31.56250739102384
EPOCH: 4803 LOSS: 31.562502926592785
EPOCH: 4804 LOSS: 31.562498464275535
EPOCH: 4805 LOSS: 31.56249400407066
EPOCH: 4806 LOSS: 31.562489545976682
EPOCH: 4807 LOSS: 31.562485089992144
EPOCH: 4808 LOSS: 31.562480636115595
EPOCH: 4809 LOSS: 31.56247618434558
EPOCH: 4810 LOSS: 31.562471734680667
EPOCH: 4811 LOSS: 31.562467287119386
EPOCH: 4812 LOSS: 31.562462841660277
EPOCH: 4813 LOSS: 31.562458398301896
EPOCH: 4814 LOSS: 31.562453957042823
EPOCH: 4815 LOSS: 31.562449517881593
EPOCH: 4816 LOSS: 31.562445080816765
EPOCH: 4817 LOSS: 31.562440645846884
EPOCH: 4818 LOSS: 31.562436212970525
EPOCH: 4819 LOSS: 31.562431782186238
EPOCH: 4820 LOSS: 31.562427353492588
EPOCH: 4821 LOSS: 31.562422926888132
EPOCH: 4822 LOSS: 31.56241850237145
EPOCH: 4823 LOSS: 31.562414079941096
EPOCH: 4824 LOSS: 31.562409659595634
EPOCH: 4825 LOSS: 31.562405241333625
EPOCH: 4826 LOSS: 31.56240082515365
EPOCH: 4827 LOSS: 31.56239641105429
EPOCH: 4828 LOSS: 31.562391999034087
EPOCH: 

In [None]:
SkipGramNumpy()

In [None]:
# FORWARD PASS
def forward_pass(self, x):
    h = np.dot(self.w1.T, x)
    u = np.dot(self.w2.T, h)
    y_c = softmax(u)
    return y_c, h, u
    ############## END

In [None]:
  # BACKPROPAGATION
def backprop(self, e, h, x):
    dl_dw2 = np.outer(h, e)  
    dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
    ############## YOUR CODE HERE
    # UPDATE WEIGHTS
    self.w1 = self.w1 - (self.eta * dl_dw1)
    self.w2 = self.w2 - (self.eta * dl_dw2)
    ############## EN

In [None]:
    # TRAIN W2V model
    def train(self, training_data):
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix
        
        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:
                
                #### FILL THE BLANK(S) ####             
                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)
                
                
                # CALCULATE ERROR
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
                
                ############## END
            print('EPOCH:',i, 'LOSS:', self.loss)



# From online

In [13]:
# Generate training data
import re
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

def generate_training_data(tokens, word_to_id, window_size):
    N = len(tokens)
    X, Y = [], []

    for i in range(N):
        nbr_inds = list(range(max(0, i - window_size), i)) + \
                   list(range(i + 1, min(N, i + window_size + 1)))
        for j in nbr_inds:
            X.append(word_to_id[tokens[i]])
            Y.append(word_to_id[tokens[j]])
            
    X = np.array(X)
    X = np.expand_dims(X, axis=0)
    Y = np.array(Y)
    Y = np.expand_dims(Y, axis=0)

    return X, Y


doc = "After the deduction of the costs of investing, " \
      "beating the stock market is a loser's game."
tokens = tokenize(doc)
word_to_id, id_to_word = mapping(tokens)
X, Y = generate_training_data(tokens, word_to_id, 3)
vocab_size = len(id_to_word)
m = Y.shape[1]
# turn Y into one hot encoding
Y_one_hot = np.zeros((vocab_size, m))
Y_one_hot[Y.flatten(), np.arange(m)] = 1

In [7]:
def initialize_wrd_emb(vocab_size, emb_size):
    """
    vocab_size: int. vocabulary size of your corpus or training data
    emb_size: int. word embedding size. How many dimensions to represent each vocabulary
    """
    WRD_EMB = np.random.randn(vocab_size, emb_size) * 0.01
    return WRD_EMB

def initialize_dense(input_size, output_size):
    """
    input_size: int. size of the input to the dense layer
    output_szie: int. size of the output out of the dense layer
    """
    W = np.random.randn(output_size, input_size) * 0.01
    return W

def initialize_parameters(vocab_size, emb_size):
    """
    initialize all the trianing parameters
    """
    WRD_EMB = initialize_wrd_emb(vocab_size, emb_size)
    W = initialize_dense(emb_size, vocab_size)
    
    parameters = {}
    parameters['WRD_EMB'] = WRD_EMB
    parameters['W'] = W
    
    return parameters

In [8]:
# Forward propagation
def ind_to_word_vecs(inds, parameters):
    """
    inds: numpy array. shape: (1, m)
    parameters: dict. weights to be trained
    """
    m = inds.shape[1]
    WRD_EMB = parameters['WRD_EMB']
    word_vec = WRD_EMB[inds.flatten(), :].T
    
    assert(word_vec.shape == (WRD_EMB.shape[1], m))
    
    return word_vec

def linear_dense(word_vec, parameters):
    """
    word_vec: numpy array. shape: (emb_size, m)
    parameters: dict. weights to be trained
    """
    m = word_vec.shape[1]
    W = parameters['W']
    Z = np.dot(W, word_vec)
    
    assert(Z.shape == (W.shape[0], m))
    
    return W, Z

def softmax(Z):
    """
    Z: output out of the dense layer. shape: (vocab_size, m)
    """
    softmax_out = np.divide(np.exp(Z), np.sum(np.exp(Z), axis=0, keepdims=True) + 0.001)
    
    assert(softmax_out.shape == Z.shape)

    return softmax_out

def forward_propagation(inds, parameters):
    word_vec = ind_to_word_vecs(inds, parameters)
    W, Z = linear_dense(word_vec, parameters)
    softmax_out = softmax(Z)
    
    caches = {}
    caches['inds'] = inds
    caches['word_vec'] = word_vec
    caches['W'] = W
    caches['Z'] = Z
    
    return softmax_out, caches

In [9]:
# Cost
def cross_entropy(softmax_out, Y):
    """
    softmax_out: output out of softmax. shape: (vocab_size, m)
    """
    m = softmax_out.shape[1]
    cost = -(1 / m) * np.sum(np.sum(Y * np.log(softmax_out + 0.001), axis=0, keepdims=True), axis=1)
    return cost

In [11]:
# Backward propagation
def softmax_backward(Y, softmax_out):
    """
    Y: labels of training data. shape: (vocab_size, m)
    softmax_out: output out of softmax. shape: (vocab_size, m)
    """
    dL_dZ = softmax_out - Y
    
    assert(dL_dZ.shape == softmax_out.shape)
    return dL_dZ

def dense_backward(dL_dZ, caches):
    """
    dL_dZ: shape: (vocab_size, m)
    caches: dict. results from each steps of forward propagation
    """
    W = caches['W']
    word_vec = caches['word_vec']
    m = word_vec.shape[1]
    
    dL_dW = (1 / m) * np.dot(dL_dZ, word_vec.T)
    dL_dword_vec = np.dot(W.T, dL_dZ)

    assert(W.shape == dL_dW.shape)
    assert(word_vec.shape == dL_dword_vec.shape)
    
    return dL_dW, dL_dword_vec

def backward_propagation(Y, softmax_out, caches):
    dL_dZ = softmax_backward(Y, softmax_out)
    dL_dW, dL_dword_vec = dense_backward(dL_dZ, caches)
    
    gradients = dict()
    gradients['dL_dZ'] = dL_dZ
    gradients['dL_dW'] = dL_dW
    gradients['dL_dword_vec'] = dL_dword_vec
    
    return gradients

def update_parameters(parameters, caches, gradients, learning_rate):
    vocab_size, emb_size = parameters['WRD_EMB'].shape
    inds = caches['inds']
    WRD_EMB = parameters['WRD_EMB']
    dL_dword_vec = gradients['dL_dword_vec']
    m = inds.shape[-1]
    
    WRD_EMB[inds.flatten(), :] -= dL_dword_vec.T * learning_rate

    parameters['W'] -= learning_rate * gradients['dL_dW']

In [12]:
# Train model
def skipgram_model_training(X, Y, vocab_size, emb_size, learning_rate, epochs, batch_size=256, parameters=None, print_cost=True, plot_cost=True):
    """
    X: Input word indices. shape: (1, m)
    Y: One-hot encodeing of output word indices. shape: (vocab_size, m)
    vocab_size: vocabulary size of your corpus or training data
    emb_size: word embedding size. How many dimensions to represent each vocabulary
    learning_rate: alaph in the weight update formula
    epochs: how many epochs to train the model
    batch_size: size of mini batch
    parameters: pre-trained or pre-initialized parameters
    print_cost: whether or not to print costs during the training process
    """
    costs = []
    m = X.shape[1]
    
    if parameters is None:
        parameters = initialize_parameters(vocab_size, emb_size)
    
    for epoch in range(epochs):
        epoch_cost = 0
        batch_inds = list(range(0, m, batch_size))
        np.random.shuffle(batch_inds)
        for i in batch_inds:
            X_batch = X[:, i:i+batch_size]
            Y_batch = Y[:, i:i+batch_size]

            softmax_out, caches = forward_propagation(X_batch, parameters)
            gradients = backward_propagation(Y_batch, softmax_out, caches)
            update_parameters(parameters, caches, gradients, learning_rate)
            cost = cross_entropy(softmax_out, Y_batch)
            epoch_cost += np.squeeze(cost)
            
        costs.append(epoch_cost)
        if print_cost and epoch % (epochs // 500) == 0:
            print("Cost after epoch {}: {}".format(epoch, epoch_cost))
        if epoch % (epochs // 100) == 0:
            learning_rate *= 0.98
            
    if plot_cost:
        plt.plot(np.arange(epochs), costs)
        plt.xlabel('# of epochs')
        plt.ylabel('cost')
    return parameters