In [6]:
import itertools
import time
from itertools import combinations,permutations

import nltk
import numpy as np
import pandas as pd
import scipy
import scipy.sparse.linalg
from nltk.corpus import stopwords

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/wangzh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
# Read the corpus: every line of the file becomes one list of
# whitespace-separated tokens. The rest of the notebook only reads text[0],
# i.e. the tokens of the first line.
with open('../wiki-text.txt','r') as f:
    text = [line.split() for line in f]  

In [10]:
len(text[0])

124301826

# Data preprocessing

In [11]:
from collections import Counter
# Occurrence count of every token on the first line of the corpus.
c_text = Counter(text[0])

In [12]:
# Tokens must occur strictly more often than this to be kept.
min_threshold = 500
start = time.time()
# Pass 1: drop English stop words.
text_fil1 = list(filter(lambda token: token not in stop_words, text[0]))
# Pass 2: drop rare tokens, judged by their count in the unfiltered corpus.
text_filtered = [token for token in text_fil1 if c_text[token] > min_threshold]
end = time.time()
print("time elapsed: " + str(end-start))
print(len(set(text_filtered)))


time elapsed: 28.8972709179
13201


In [13]:
#text_filtered = text_filtered[:500]
# Sorted so that every run assigns each word the same position; the iteration
# order of a plain set is not guaranteed to be stable across interpreter
# sessions, which would silently change the word -> row mapping.
# NOTE(review): an embedding matrix saved under a different vocabulary order
# has to be regenerated before it is used with this list.
vocab = sorted(set(text_filtered))

# PMI Embedding

###text_t = text_filtered[:5000]
vocab_t = list(set(text_t))
len(vocab_t)

In [16]:
# Reverse lookup: word -> its position in `vocab`.
vocab_index = {word: position for position, word in enumerate(vocab)}

## First, create a word co-occurrence count matrix

In [14]:
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix


In [19]:
def my_WW(tokens=None, index=None, window_size=5):
    """Build the windowed word/word co-occurrence count matrix.

    For every position ``i`` the slice ``tokens[i:i + window_size + 1]`` is
    taken, and each ordered pair of distinct positions inside that slice adds
    1 to the cell ``[index[first], index[second]]``.

    Parameters
    ----------
    tokens : sequence of str, optional
        Token stream to scan. Defaults to the notebook-level
        ``text_filtered``.
    index : dict, optional
        Maps every token to its row/column number. Defaults to the
        notebook-level ``vocab_index``.
    window_size : int, optional
        Number of tokens following position ``i`` that belong to its window.

    Returns
    -------
    scipy.sparse.lil_matrix
        Square float64 matrix with one row and column per entry of ``index``.

    Raises
    ------
    KeyError
        If a token is not present in ``index``.
    """
    start = time.time()
    if tokens is None:
        tokens = text_filtered
    if index is None:
        index = vocab_index
    size = len(index)

    # Accumulate in a plain dict and build the sparse matrix once at the end:
    # updating a lil_matrix one cell at a time inside the loop is far slower.
    # NOTE(review): windows overlap, so a pair of tokens d positions apart is
    # counted (window_size + 1 - d) times away from the ends of the text.
    # Confirm that this distance weighting is intended.
    pair_counts = {}
    for i in range(len(tokens)):
        # Slicing already clips at the end of the sequence.
        ids = [index[token] for token in tokens[i:i + window_size + 1]]
        for pair in itertools.permutations(ids, 2):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    n_pairs = len(pair_counts)
    rows = np.fromiter((r for r, _ in pair_counts), dtype=np.intp, count=n_pairs)
    cols = np.fromiter((c for _, c in pair_counts), dtype=np.intp, count=n_pairs)
    vals = np.fromiter(pair_counts.values(), dtype=np.float64, count=n_pairs)
    WW = csr_matrix((vals, (rows, cols)), shape=(size, size),
                    dtype=np.float64).tolil()

    end = time.time()
    print("time elapsed: "+ str(end-start))
    return WW





    
    
                

        

## Compute PMI matrix

In [20]:
start = time.time()
WW = my_WW()
# NOTE(review): SP is the number of non-zero cells (distinct co-occurring
# ordered pairs), not the total number of counted pairs (WW.sum()). Confirm
# that this is the normaliser the PMI definition in use calls for.
SP = WW.count_nonzero()
WW = WW.toarray()
# Row totals of the raw counts, taken before the +1 smoothing below.
Ni = WW.sum(axis = 1)
# M[i, j] = log((WW[i, j] + 1) * SP / (Ni[i] * Ni[j])).
# Row/column scaling is done by broadcasting and in place, which avoids
# materialising a dense diagonal matrix and two dense matrix products.
M = WW + 1
M *= SP
M /= Ni[:, np.newaxis]
M /= Ni[np.newaxis, :]
np.log(M, out=M)
end = time.time()
print("time elapsed: "+ str(end - start))

time elapsed: 0.127188920975
time elapsed: 0.348671913147


# (b) k-SVD

In [22]:
start = time.time()
# M is a dense array (log of strictly positive values), so it is handed to
# svds directly; wrapping it in a CSR matrix would only add a full copy plus
# index arrays. svds returns (u, s, vh), so V here is the transposed
# right-singular-vector matrix.
U,s,V = scipy.sparse.linalg.svds(M,k = 50)
end = time.time()
print("time elapsed: "+ str(end - start))

time elapsed: 0.688757896423


# (c) Word Embedding Matrix

In [24]:
start = time.time()
# Embedding = left singular vectors scaled by the square roots of the
# singular values.
root_sigma = np.sqrt(s)
ss = np.diag(root_sigma)
W = U.dot(ss)
end = time.time()
print("time elapsed: "+ str(end - start))

time elapsed: 0.000687837600708


# (d) Find closest words

In [27]:
def closest_word(word,W,num=6, index=None, words=None):
    """Print the words whose embeddings lie nearest to that of ``word``.

    The ``num`` rows of ``W`` with the smallest squared Euclidean distance to
    the row of ``word`` are selected (the query row itself is among them at
    distance 0), the query is skipped, and the remaining words are printed
    from nearest to farthest, followed by the elapsed time.

    Parameters
    ----------
    word : str
        Query word; must be a key of ``index``.
    W : numpy.ndarray
        2-D embedding matrix, one row per vocabulary entry.
    num : int, optional
        Number of nearest rows to select, query included. Values larger than
        the number of rows are clamped to it.
    index : dict, optional
        Word -> row number. Defaults to the notebook-level ``vocab_index``.
    words : sequence of str, optional
        Row number -> word. Defaults to the notebook-level ``vocab``.

    Raises
    ------
    KeyError
        If ``word`` is not in ``index``.
    """
    if index is None:
        index = vocab_index
    if words is None:
        words = vocab

    print("closest words to "+ word, "are: ")
    start = time.time()
    key = index[word]
    # Broadcasting subtracts the query row from every row of W.
    deltas = W - W[key]
    sq_dist = (deltas * deltas).sum(axis=1)

    total = sq_dist.shape[0]
    if num < total:
        # argpartition requires kth < total and returns an unordered top-num.
        nearest = np.argpartition(sq_dist, num)[:num]
    else:
        nearest = np.arange(total)
    # Order the selected rows by increasing distance.
    nearest = nearest[np.argsort(sq_dist[nearest])]

    for i in nearest:
        if i != key:
            print(words[i])
    end = time.time()
    print("time elapsed: "+ str(end-start))
    

In [28]:
closest_word("abuse",W,6)

('closest words to ', 'abuse', 'are: ')
wish
complete
owners
limited
possession
('time elapsed: ', '0.00200796127319')


In [None]:
closest_word("physics",W,6)

In [None]:
closest_word("republican",W,6)

In [None]:
closest_word("einstein",W,6)

In [None]:
closest_word("algebra",W,6)

In [None]:
closest_word("fish",W,6)

# (e) Solve Analogies 

In [14]:
def solve_analogies(X,Y,Z,W,num=6, index=None, words=None):
    """Print candidate answers to the analogy ``X : Y :: Z : ?``.

    The target vector is ``W[Y] - W[X] + W[Z]``. The ``num`` rows of ``W``
    with the smallest squared Euclidean distance to that target are printed
    from nearest to farthest. The input words are not excluded from the
    candidates.

    Parameters
    ----------
    X, Y, Z : str
        Analogy terms; each must be a key of ``index``.
    W : numpy.ndarray
        2-D embedding matrix, one row per vocabulary entry.
    num : int, optional
        Number of candidates to print. Values larger than the number of rows
        are clamped to it.
    index : dict, optional
        Word -> row number. Defaults to the notebook-level ``vocab_index``.
    words : sequence of str, optional
        Row number -> word. Defaults to the notebook-level ``vocab``.

    Raises
    ------
    KeyError
        If any of ``X``, ``Y``, ``Z`` is not in ``index``.
    """
    if index is None:
        index = vocab_index
    if words is None:
        words = vocab

    print("closest solution for analogy: ", X,":",Y,"::",Z,":","?")
    target = W[index[Y]] - W[index[X]] + W[index[Z]]
    # Broadcasting subtracts the target from every row of W.
    deltas = W - target
    sq_dist = (deltas * deltas).sum(axis=1)

    total = sq_dist.shape[0]
    if num < total:
        # argpartition requires kth < total and returns an unordered top-num.
        nearest = np.argpartition(sq_dist, num)[:num]
    else:
        nearest = np.arange(total)
    # Order the selected rows by increasing distance.
    nearest = nearest[np.argsort(sq_dist[nearest])]

    for i in nearest:
        print(words[i])

In [17]:
# france : paris :: england : ?
X, Y, Z = 'france', 'paris', 'england'
solve_analogies(X, Y, Z, W)

('closest solution for analogy: ', 'france', ':', 'paris', '::', 'england', ':', '?')
england
london
oxford
dublin
cambridge
edinburgh


In [10]:
# Scratch check: converting one operand to float forces true division.
i = 5
j = 199
print(float(i)/j)

0.0251256281407


In [6]:
# Scratch check: the floor division runs first, so float() only wraps the
# already-truncated integer result.
float(3//5)

0.0

In [2]:
import pickle
# Rebind W to an embedding matrix stored on disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that were produced locally and are trusted.
# NOTE(review): rows of W are addressed through vocab_index elsewhere in this
# notebook, so presumably the stored matrix must have been built with the same
# vocabulary ordering — confirm before mixing it with a fresh vocab.
with open("my_W_0","rb") as f:
    W = pickle.load(f)

In [4]:
W.shape

(13201, 50)