# Word Embedding Experiments

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import time
import itertools
import math
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
from scipy.sparse import linalg
from numpy import linalg as LA
import pickle

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/wangzh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
with open('./wiki-text.txt','r') as f:
    text = [line.split() for line in f]  

In [5]:
len(text[0])

124301826

## Data preprocessing

In [6]:
from collections import Counter
c = Counter(text[0])

In [7]:
n = 500
start = time.time()
text_rm_stop_words= [word for word in text[0] if word not in stop_words]
text_filtered = [word for word in text_rm_stop_words if c[word] > n]
end = time.time()
print("time elapsed: ", end-start)

('time elapsed: ', 43.230916023254395)


In [8]:
# Tailored vocab list
vocab = list(set(text_filtered))
len(vocab)

13201

In [9]:
# Create the word to index dictionary
index = {}
for i in range(len(vocab)):
    index[vocab[i]] = i

In [10]:
index

{'croatia': 0,
 'mic': 1,
 'sorts': 2,
 'happen': 3,
 'makes': 4,
 'auction': 5,
 'dissertation': 6,
 'dedicated': 7,
 'retrieved': 8,
 'golf': 9,
 'notices': 10,
 'insignia': 11,
 'sermons': 12,
 'admiral': 13,
 'prejudice': 14,
 'cooper': 15,
 'dye': 16,
 'sauron': 17,
 'meeting': 18,
 'heather': 19,
 'sunny': 20,
 'coptic': 21,
 'manages': 22,
 'exotic': 23,
 'algebra': 24,
 'argentina': 25,
 'doubles': 26,
 'smoke': 27,
 'boer': 28,
 'patterns': 29,
 'jesus': 30,
 'offices': 31,
 'likely': 32,
 'brady': 33,
 'family': 34,
 'absent': 35,
 'jesse': 36,
 'patches': 37,
 'joseph': 38,
 'drink': 39,
 'wyoming': 40,
 'protocol': 41,
 'footage': 42,
 'turkey': 43,
 'solar': 44,
 'cobra': 45,
 'locked': 46,
 'tutorial': 47,
 'hoping': 48,
 'write': 49,
 'dense': 50,
 'conditional': 51,
 'escapes': 52,
 'transported': 53,
 'wonderful': 54,
 'onset': 55,
 'grandfather': 56,
 'textile': 57,
 'packet': 58,
 'peer': 59,
 'tell': 60,
 'wood': 61,
 'stockton': 62,
 'guiding': 63,
 'rolling': 64,


## PMI Embedding

In [10]:
len(text_filtered)

71618337

In [11]:
# Working on small text size first
text_t = text_filtered[:50000]
vocab_t = list(set(text_t))
index_t = {}
for i in range(len(vocab_t)):
    index_t[vocab_t[i]] = i

### (a) Compute PMI matrix

In [10]:
# Function to compute the PMI matrix
def PMI_matrix(text, vocab, index):
    M = lil_matrix((len(vocab), len(vocab)), dtype=np.float64)
    for i in range(len(text)-5):
        window = text[i:i+6]
        window_pairs = list(itertools.permutations(window, 2))
        for pair in window_pairs:
            M[index[pair[0]], index[pair[1]]] += 1
        if not i % 500000:
            print(i/500000)
    num_Sp = M.count_nonzero()
    N_w = M.sum(axis = 1)
    D = np.diagflat(1/N_w)
    M = (M.toarray() + 1)*num_Sp
    M = np.dot(D,M).dot(D)
    M = np.log(M)
    return M

In [11]:
MATRIX = PMI_matrix(text_filtered, vocab, index)
# Printing the progress...

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143


### (b). (c). Factorize the PMI matrix to get word embeddings

In [11]:
# Function to factorize the PMI matrix to get the word embeddings marix W
def get_embeddings(M):
    # Factorize matrix M
    U, s, V = linalg.svds(csr_matrix(M), k = 50)
    S = np.diag((np.sqrt(s)))
    W = np.dot(U, S)
    return W

In [14]:
EMBEDDING = get_embeddings(MATRIX)

In [35]:
with open('embedding_matrix', 'wb') as f:
    pickle.dump(EMBEDDING, f)

### (d). Find the 5 closest words

In [12]:
with open('embedding_matrix', 'rb') as f:
    EMBEDDING = pickle.load(f)

In [13]:
# Function to return the top (n-1) closest words, taking vector as input
def n_closest(v, n, ind):
    top = []
    v_m = np.array([v,]*len(vocab))
    diff_m = EMBEDDING - v_m
    diff_norm = LA.norm(diff_m, axis = 1)
    closest = np.argpartition(diff_norm, n)[:n]
    for i in closest:
        if i != ind:
            top.append(vocab[i])
    return top

In [14]:
# Function to return the top 5 closest words, taking words as input
def top5_closest(word):
    ind = index[word]
    v = EMBEDDING[ind]
    return n_closest(v, 6, ind)

In [15]:
top5_closest('physics')

['quantum', 'chemistry', 'mechanics', 'mathematics', 'theoretical']

In [16]:
top5_closest('republican')

['senator', 'democrat', 'democrats', 'republicans', 'presidential']

In [19]:
top5_closest('einstein')

['planck', 'physicists', 'relativity', 'paradox', 'leibniz']

In [20]:
top5_closest('algebra')

['algebraic', 'calculus', 'finite', 'theorem', 'topology']

In [21]:
top5_closest('fish')

['eat', 'eggs', 'fruit', 'meat', 'seeds']

### (e). Solve Analogy

In [22]:
# analogy = ((X,Y),Z)
def solve_analogy(analogy):
    ind_X = index[analogy[0][0]]
    ind_Y = index[analogy[0][1]]
    ind_Z = index[analogy[1]]
    vec_X = EMBEDDING[ind_X]
    vec_Y = EMBEDDING[ind_Y]
    vec_Z = EMBEDDING[ind_Z]
    v = vec_Y - vec_X + vec_Z
    top5 = n_closest(v, 5, -1)
    return top5

In [23]:
solve_analogy((('france','paris'),'england'))

['england', 'london', 'oxford', 'dublin', 'edinburgh']

In [24]:
solve_analogy((('republican','democratic'),'conservative'))

['democratic', 'conservative', 'socialist', 'liberal', 'democracy']

In [25]:
solve_analogy((('china','beijing'),'japan'))

['tokyo', 'shanghai', 'osaka', 'seoul', 'beijing']

In [26]:
solve_analogy((('female','woman'),'male'))

['child', 'woman', 'mother', 'parents', 'lover']

In [29]:
solve_analogy((('us','chicago'),'china'))

['china', 'hong', 'tokyo', 'asia', 'beijing']