## Word Embedding Experiments with the Wikipedia Corpus using PMI Embeddings 

### Pre-processing 

In [3]:
import nltk
from numpy.linalg import inv
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/varshant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/varshant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaking it for the rest
# of the session.
with open("/project/cmsc25025/wikipedia/wiki-text.txt", "r") as f:
    wiki_contents = f.read()
# A set gives O(1) membership tests when filtering tokens later.
stop_words = set(stopwords.words('english'))

In [5]:
# Tokenise on single space characters only (other whitespace is not split on).
corpus = wiki_contents.split(' ')
# Work on an independent copy so later filtering leaves `corpus` untouched.
words = corpus[:]

In [6]:
# Drop English stop words; the comparison is an exact (case-sensitive) match.
words = list(filter(lambda token: token not in stop_words, words))

In [7]:
# Occurrence count of every remaining token; used to select the vocabulary.
fdist = nltk.FreqDist(words)

In [8]:
# Vocabulary: distinct tokens occurring more than 500 times, in sorted order
# so that the index assigned to each word is deterministic.
vocab_word = [token for token in set(words) if fdist[token] > 500]
vocab_word.sort()

In [9]:
# Map each vocabulary word to its row/column index in the count matrix.
index_dict = {token: position for position, token in enumerate(vocab_word)}
vocab_len = len(vocab_word)
# From here on `vocab_word` is a set, used only for fast membership tests.
vocab_word = set(vocab_word)

In [10]:
# Pad the raw token list with five blank tokens on each side.
# NOTE(review): the co-occurrence loop indexes `words`, not `corpus`, so this
# padding does not protect those window lookups — confirm whether it is needed.
corpus = [' '] * 5 + corpus + [' '] * 5

In [11]:
# Dense vocab_len x vocab_len matrix of co-occurrence counts, initially zero.
context_word_arr = np.zeros(shape=(vocab_len, vocab_len))

### Compute PMI 

In [12]:
# Count co-occurrences: context_word_arr[i, j] is incremented each time
# vocabulary word j appears within `window` positions (either side) of
# vocabulary word i in the stop-word-filtered token list `words`.
window = 4  # offsets 1..4, the same span as the original range(1, 5)
num_words = len(words)
for indx, word in enumerate(words):
    # index_dict has exactly the vocabulary words as keys, so a miss means
    # the token is out of vocabulary.
    i = index_dict.get(word)
    if i is not None:
        for k in range(1, window + 1):
            left = indx - k
            right = indx + k
            # A negative index would silently wrap around to the end of the
            # list, pairing the first tokens with the last ones.
            if left >= 0:
                j = index_dict.get(words[left])
                if j is not None:
                    context_word_arr[i, j] += 1
            # Stop at the end of the list instead of raising IndexError.
            if right < num_words:
                j = index_dict.get(words[right])
                if j is not None:
                    context_word_arr[i, j] += 1
    # Progress indicator; the call form works on both Python 2 and 3.
    if indx % 10000 == 0:
        print(indx)
        

 

In [13]:
# Number of non-zero cells, i.e. distinct (word, context) pairs observed.
# NOTE(review): PMI is usually normalised by the total co-occurrence count
# (the sum of the matrix) rather than the number of distinct pairs — confirm
# this is the intended definition.
num_pairs = np.count_nonzero(context_word_arr)
# Row sums: total context count for each vocabulary word.
totals = np.sum(context_word_arr, axis=1)

vocab_len = len(vocab_word)
# Add-one smoothed PMI,
#   M[i, j] = log((count[i, j] + 1) * num_pairs / (totals[i] * totals[j])),
# computed with whole-array operations instead of a Python loop over every
# cell. The arithmetic order matches the per-element formula, and the in-place
# updates keep the number of large temporaries down.
# A word whose row total is 0 produces a division by zero here, exactly as the
# per-element version did.
M = context_word_arr + 1
M *= num_pairs
M /= np.outer(totals, totals)
np.log(M, out=M)
        

### Truncated SVD (rank k = 50) of the PMI Matrix 

In [14]:
# Import the submodules explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` / `scipy.sparse.linalg` are loaded as attributes.
import scipy
import scipy.sparse
import scipy.sparse.linalg

# Rank-50 truncated SVD of the PMI matrix. Going by the SciPy documentation
# the third return value is the transposed right singular vectors.
# NOTE(review): every entry of M is the log of a positive number, so the
# matrix is fully dense and the CSR conversion only adds memory; svds is
# documented to accept a dense ndarray directly — consider passing M as is.
U, s, V = scipy.sparse.linalg.svds(scipy.sparse.csr_matrix(M), k=50)

In [15]:
import pickle

# Turn the vector of singular values into a diagonal matrix.
# NOTE(review): this rebinds `s`, so running the cell a second time applies
# np.diag to a 2-D array (which extracts the diagonal again) and changes W.
s = np.diag(s)
# Embeddings: left singular vectors scaled by the square root of the
# singular values.
W = U.dot(np.sqrt(s))
# np.save appends the ".npy" extension to the given name.
np.save('embeddings', W)
W.shape

(13201, 50)

### Find similarities 

In [16]:
# Inverse of index_dict: matrix row index -> vocabulary word.
index_word = {position: token for token, position in index_dict.items()}

In [17]:
def get_similar(word, n=5):
    """Print the ``n`` words whose rows of ``U`` score highest against ``word``.

    The score is the raw dot product between a word's row of the global
    matrix ``U`` and the row belonging to ``word``; the vectors are not
    length-normalised. Words are printed one per line, best first.

    Args:
        word: Query word; must be a key of the global ``index_dict``.
        n: Number of words to print.

    Raises:
        KeyError: If ``word`` is not in ``index_dict``.
        IndexError: If ``n`` exceeds the number of rows of ``U`` (raised
            after every available word has been printed).
    """
    query_vector = U[index_dict[word]]
    scores = U.dot(query_vector)
    # argsort is ascending, so reverse it to get the best match first.
    ranking = np.argsort(scores)[::-1]
    for rank in range(n):
        print(index_word[ranking[rank]])

In [18]:
# Nearest neighbours of 'physics' with get_similar's default n=5.
get_similar('physics')

physics
science
mathematics
university
theory


In [19]:
# Nearest neighbours of 'republican' with get_similar's default n=5.
get_similar('republican')

republican
election
party
senator
democratic


In [20]:
# NOTE(review): the query word itself is absent from the recorded output.
# get_similar ranks by raw dot product against rows of U without length
# normalisation, which is presumably why — confirm.
get_similar('einstein')

physics
theory
quantum
mathematics
mathematical


In [21]:
# In the recorded output the query word ranks second, not first; the ranking
# uses unnormalised dot products, so a word need not be its own best match.
get_similar('algebra')

theorem
algebra
frac
vector
finite


In [22]:
# In the recorded output the query word ranks third; the ranking uses
# unnormalised dot products, so a word need not be its own best match.
get_similar('fish')

species
food
fish
fruit
birds


### Finding linear analogies

In [23]:
def analogies(w1, w2, w3, n=5):
    """Print the ``n`` words scoring highest against ``w1 - w2 + w3``.

    The rows of the global matrix ``U`` for the three words are combined
    as ``U[w1] - U[w2] + U[w3]``. Every vocabulary word is then scored by
    the raw dot product of its row with that vector, and the best ``n``
    are printed one per line, best first. The three input words are not
    excluded from the results.

    Args:
        w1: Word whose vector is added.
        w2: Word whose vector is subtracted.
        w3: Word whose vector is added.
        n: Number of words to print. Defaults to 5; values larger than
            the vocabulary print every word, and values below 1 print
            nothing.

    Raises:
        KeyError: If any of the words is not a key of ``index_dict``.
    """
    v1 = U[index_dict[w1]]
    v2 = U[index_dict[w2]]
    v3 = U[index_dict[w3]]
    target = v1 - v2 + v3
    scores = np.dot(U, target)
    # argsort is ascending, so reverse it to get the best match first.
    ranking = scores.argsort()[::-1]

    # Slicing clamps to the vocabulary size instead of raising IndexError.
    for row in ranking[:max(n, 0)]:
        print(index_word[row])

In [24]:
# analogies(w1, w2, w3) scores against w1 - w2 + w3, so this call queries
# paris - france + england, which matches the label "a : b :: c : ?" read as
# b - a + c.
print("france : paris :: england : london")
analogies('paris', 'france', 'england' )

france : paris :: england : london
england
london
scotland
wales
st


In [25]:
# NOTE(review): analogies(w1, w2, w3) scores against w1 - w2 + w3, so this
# call queries republican - democrat + conservative. Read the same way as the
# france/paris cell, the label would correspond to
# analogies('democrat', 'republican', 'conservative') — confirm which was
# intended.
print("republican : democrat :: conservative : liberal")
analogies('republican', 'democrat', 'conservative' )

republican : democrat :: conservative : liberal
party
conservative
liberal
election
republican


In [26]:
# NOTE(review): this call queries thief - bad + child (w1 - w2 + w3), which is
# the opposite argument order from the france/paris cell relative to its
# label. The label's fourth term matches the top recorded result — confirm
# the intended order.
print("thief : bad :: child : daughter")
analogies('thief', 'bad', 'child' )

thief : bad :: child : daughter
daughter
mother
child
wife
marriage


In [27]:
# This call queries remote - nearby + native (w1 - w2 + w3), i.e. the label
# "a : b :: c : ?" read as b - a + c.
print("nearby : remote :: native : language")
analogies('remote', 'nearby', 'native' )

nearby : remote :: native : language
native
language
languages
african
people


In [28]:
# NOTE(review): this call queries happy - smile + pain (w1 - w2 + w3), which
# is the opposite argument order from the france/paris cell relative to its
# label — confirm the intended order.
print("happy : smile :: pain : disease")
analogies('happy', 'smile', 'pain' )

happy : smile :: pain : disease
disease
symptoms
pain
patients
heart


In [29]:
# NOTE(review): this call queries sad - anger + happy (w1 - w2 + w3), which is
# the opposite argument order from the france/paris cell relative to its
# label — confirm the intended order.
print("sad : anger :: happy : love")
analogies('sad', 'anger', 'happy' )

sad : anger :: happy : love
want
happy
love
girl
song
