In [1]:
# Dependencies for the training and plotting cells below.
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported, so figures
# are rendered to image files rather than to a GUI window.
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
# NOTE(review): wildcard imports. The later cells use skipgram,
# word2vec_sgd_wrapper, negSamplingLossAndGradient and sgd, which presumably
# come from these two modules -- confirm which module defines each name.
from word2vec import *
from sgd import *
import sys
# Refuse to run on anything other than Python 3 with minor version >= 5.
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

In [2]:
# Seed Python's RNG before the dataset is constructed.
# NOTE(review): presumably StanfordSentiment draws from `random` while
# building its tables -- confirm in utils/treebank.py.
random.seed(314)
dataset = StanfordSentiment()
# `tokens` is indexed by word later on to obtain a row index into the
# word-vector matrix; its length is used as the vocabulary size.
tokens = dataset.tokens()
nWords = len(tokens)
# Number of columns in each word vector.
dimVectors = 10
# Passed through to word2vec_sgd_wrapper; presumably the context window
# size -- TODO confirm against word2vec.py.
C = 5
# Re-seed both Python's and NumPy's RNGs so that vector initialisation and
# everything after it starts from a fixed state.
random.seed(31415)
np.random.seed(9265)

In [3]:
# Remember when the run started (wall-clock seconds).
startTime = time.time()

# Build a (2 * nWords, dimVectors) matrix: the upper nWords rows are uniform
# random values in [-0.5, 0.5) scaled by 1 / dimVectors, and the lower nWords
# rows are all zeros.
wordVectors = np.vstack([
    (np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
    np.zeros((nWords, dimVectors)),
])

In [4]:
# Train the skip-gram model with the negative-sampling loss via SGD.
# NOTE(review): the positional arguments 0.3, 5010, None, True are presumably
# the step size, iteration count, post-processing hook and a resume-from-saved
# flag -- confirm against sgd.py.
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
        negSamplingLossAndGradient),
    wordVectors, 0.3, 5010, None, True, PRINT_EVERY=10)

# Re-stack the first nWords rows and the remaining rows along axis 0. This
# keeps the same row layout (it yields a fresh copy), so indices taken from
# `tokens` still address the first nWords rows.
wordVectors = np.concatenate(
    (wordVectors[:nWords,:], wordVectors[nWords:,:]),
    axis=0)

# Words whose vectors are projected to 2-D and plotted after each training run.
visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "dumb",
    "annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
    "hail", "coffee", "tea"]

# Look up the selected rows, centre them, and project them onto the first two
# left singular vectors of their covariance matrix.
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

# Draw on a dedicated figure instead of pyplot's implicit current figure, so
# labels from other plotting cells can never end up in this image.
fig, ax = plt.subplots()
for i in range(len(visualizeWords)):
    ax.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

ax.set_xlim((np.min(coord[:,0]), np.max(coord[:,0])))
ax.set_ylim((np.min(coord[:,1]), np.max(coord[:,1])))

# Save before show(): backends that display the figure may release it in
# show(), in which case a later savefig() would write an empty canvas.
fig.savefig('word_vectors_5000.png')
plt.show()
# Close the figure so it is neither reused by later cells nor kept in memory.
plt.close(fig)

iter 10: 19.824024
iter 20: 20.136629
iter 30: 20.151500
iter 40: 20.211374
iter 50: 20.245379
iter 60: 20.300555
iter 70: 20.322475
iter 80: 20.495801
iter 90: 20.568950
iter 100: 20.531704
iter 110: 20.580190
iter 120: 20.618621
iter 130: 20.609395
iter 140: 20.585372
iter 150: 20.554925
iter 160: 20.564125
iter 170: 20.771104
iter 180: 20.853380
iter 190: 20.923904
iter 200: 20.929908
iter 210: 20.889862
iter 220: 20.905183
iter 230: 20.942612
iter 240: 21.031543
iter 250: 20.902543
iter 260: 20.856234
iter 270: 20.972356
iter 280: 21.174174
iter 290: 21.221034
iter 300: 21.318938
iter 310: 21.404313
iter 320: 21.538800
iter 330: 21.559794
iter 340: 21.541619
iter 350: 21.646352
iter 360: 21.684853
iter 370: 21.599438
iter 380: 21.617426
iter 390: 21.695497
iter 400: 21.731549
iter 410: 21.712414
iter 420: 21.686614
iter 430: 21.593469
iter 440: 21.535491
iter 450: 21.678669
iter 460: 21.692681
iter 470: 21.683106
iter 480: 21.750270
iter 490: 21.791177
iter 500: 21.730937
iter 510:

In [None]:
# Continue training the skip-gram / negative-sampling model up to iteration
# 10010. The log captured for this cell begins at iter 5010, so sgd appears to
# pick up from earlier progress rather than restart.
# NOTE(review): the trailing positional True is presumably the
# resume-from-saved flag -- confirm against sgd.py.
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
        negSamplingLossAndGradient),
    wordVectors, 0.3, 10010, None, True, PRINT_EVERY=10)

# Look up the rows for `visualizeWords` (defined in an earlier cell), centre
# them, and project them onto the first two left singular vectors of their
# covariance matrix.
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

# Use a fresh figure: drawing on pyplot's implicit current figure would place
# these labels on top of the ones left there by the previous plotting cell.
fig, ax = plt.subplots()
for i in range(len(visualizeWords)):
    ax.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

ax.set_xlim((np.min(coord[:,0]), np.max(coord[:,0])))
ax.set_ylim((np.min(coord[:,1]), np.max(coord[:,1])))

# Save before show(): backends that display the figure may release it in
# show(), in which case a later savefig() would write an empty canvas.
fig.savefig('word_vectors_10000.png')
plt.show()
# Close the figure so it is neither reused by later cells nor kept in memory.
plt.close(fig)

iter 5010: 19.024904
iter 5020: 18.951881
iter 5030: 18.913468
iter 5040: 18.937921
iter 5050: 18.926978
iter 5060: 19.003211
iter 5070: 18.796006
iter 5080: 18.761365
iter 5090: 18.755197
iter 5100: 18.602518
iter 5110: 18.525467
iter 5120: 18.428549
iter 5130: 18.444131
iter 5140: 18.346951
iter 5150: 18.211370
iter 5160: 18.170959
iter 5170: 18.201319
iter 5180: 18.115191
iter 5190: 18.021061
iter 5200: 17.948370
iter 5210: 17.922004
iter 5220: 17.763964
iter 5230: 17.602763
iter 5240: 17.642048
iter 5250: 17.665594
iter 5260: 17.547769
iter 5270: 17.506831
iter 5280: 17.497244
iter 5290: 17.312700
iter 5300: 17.282260
iter 5310: 17.285574
iter 5320: 17.293375
iter 5330: 17.235725
iter 5340: 17.091635
iter 5350: 16.967858
iter 5360: 17.036539
iter 5370: 17.025802
iter 5380: 17.187536
iter 5390: 17.326044
iter 5400: 17.301443
iter 5410: 17.221136
iter 5420: 17.170783
iter 5430: 17.230239
iter 5440: 17.128738
iter 5450: 17.086734
iter 5460: 17.126861
iter 5470: 17.188184
iter 5480: 17

In [None]:
# Continue training the skip-gram / negative-sampling model up to iteration
# 20010. The log captured for this cell begins at iter 10010, so sgd appears
# to pick up from earlier progress rather than restart.
# NOTE(review): the trailing positional True is presumably the
# resume-from-saved flag -- confirm against sgd.py.
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
        negSamplingLossAndGradient),
    wordVectors, 0.3, 20010, None, True, PRINT_EVERY=10)

# Look up the rows for `visualizeWords` (defined in an earlier cell), centre
# them, and project them onto the first two left singular vectors of their
# covariance matrix.
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

# Use a fresh figure: drawing on pyplot's implicit current figure would place
# these labels on top of the ones left there by the previous plotting cell.
fig, ax = plt.subplots()
for i in range(len(visualizeWords)):
    ax.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

ax.set_xlim((np.min(coord[:,0]), np.max(coord[:,0])))
ax.set_ylim((np.min(coord[:,1]), np.max(coord[:,1])))

# Save before show(): backends that display the figure may release it in
# show(), in which case a later savefig() would write an empty canvas.
fig.savefig('word_vectors_20000.png')
plt.show()
# Close the figure so it is neither reused by later cells nor kept in memory.
plt.close(fig)

iter 10010: 14.439691
iter 10020: 14.434166
iter 10030: 14.408895
iter 10040: 14.346396
iter 10050: 14.290566
iter 10060: 14.194201
iter 10070: 14.123921
iter 10080: 14.143321
iter 10090: 14.104206
iter 10100: 14.011838
iter 10110: 13.966096
iter 10120: 13.819112
iter 10130: 13.672750
iter 10140: 13.574072
iter 10150: 13.564489
iter 10160: 13.461721
iter 10170: 13.485541
iter 10180: 13.457385
iter 10190: 13.420606
iter 10200: 13.359894
iter 10210: 13.324715
iter 10220: 13.293154
iter 10230: 13.226915
iter 10240: 13.231094
iter 10250: 13.257420
iter 10260: 13.201018
iter 10270: 13.248663
iter 10280: 13.221749
iter 10290: 13.141112
iter 10300: 13.291796
iter 10310: 13.258647
iter 10320: 13.323671
iter 10330: 13.383169
iter 10340: 13.404074
iter 10350: 13.478826
iter 10360: 13.496350
iter 10370: 13.484071
iter 10380: 13.519458
iter 10390: 13.626474
iter 10400: 13.508596
iter 10410: 13.515684
iter 10420: 13.392330
iter 10430: 13.437986
iter 10440: 13.410067
iter 10450: 13.377870
iter 10460

In [None]:
# Continue training the skip-gram / negative-sampling model up to iteration
# 30010.
# NOTE(review): the positional arguments 0.3, 30010, None, True are presumably
# the step size, iteration count, post-processing hook and a resume-from-saved
# flag -- confirm against sgd.py.
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
        negSamplingLossAndGradient),
    wordVectors, 0.3, 30010, None, True, PRINT_EVERY=10)

# Look up the rows for `visualizeWords` (defined in an earlier cell), centre
# them, and project them onto the first two left singular vectors of their
# covariance matrix.
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

# Use a fresh figure: drawing on pyplot's implicit current figure would place
# these labels on top of the ones left there by the previous plotting cell.
fig, ax = plt.subplots()
for i in range(len(visualizeWords)):
    ax.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

ax.set_xlim((np.min(coord[:,0]), np.max(coord[:,0])))
ax.set_ylim((np.min(coord[:,1]), np.max(coord[:,1])))

# Save before show(): backends that display the figure may release it in
# show(), in which case a later savefig() would write an empty canvas.
fig.savefig('word_vectors_30000.png')
plt.show()
# Close the figure so it is neither reused by later cells nor kept in memory.
plt.close(fig)

In [None]:
# Continue training the skip-gram / negative-sampling model up to iteration
# 40010.
# NOTE(review): the positional arguments 0.3, 40010, None, True are presumably
# the step size, iteration count, post-processing hook and a resume-from-saved
# flag -- confirm against sgd.py.
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
        negSamplingLossAndGradient),
    wordVectors, 0.3, 40010, None, True, PRINT_EVERY=10)

# Look up the rows for `visualizeWords` (defined in an earlier cell), centre
# them, and project them onto the first two left singular vectors of their
# covariance matrix.
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

# Use a fresh figure: drawing on pyplot's implicit current figure would place
# these labels on top of the ones left there by the previous plotting cell.
fig, ax = plt.subplots()
for i in range(len(visualizeWords)):
    ax.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

ax.set_xlim((np.min(coord[:,0]), np.max(coord[:,0])))
ax.set_ylim((np.min(coord[:,1]), np.max(coord[:,1])))

# Save before show(): backends that display the figure may release it in
# show(), in which case a later savefig() would write an empty canvas.
fig.savefig('word_vectors_40000.png')
plt.show()
# Close the figure so it is neither reused by later cells nor kept in memory.
plt.close(fig)