In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import time

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xinyuw/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
with open('./wiki-text.txt','r') as f:
    text = [line.split() for line in f]  

In [5]:
len(text[0])

124301826

# Data preprocessing

In [6]:
from collections import Counter
c_text = Counter(text[0])

In [7]:
min_threshold = 500
#text_filter1 = [k for k, v in c_text.items() if v > min_threshold]
start = time.time()
text_fil1= [word for word in text[0] if not word in stop_words]
text_filtered = [k for k in text_fil1 if c_text[k] > min_threshold]
end = time.time()
print("time elapsed: " + str(end-start))
print(len(set(text_filtered)))


time elapsed: 21.671967029571533
13201


In [24]:
vocab = list(set(text_filtered))

In [25]:
index = {}
for i in range(len(vocab)):
    index[vocab[i]] = i

In [36]:
index

{'renewed': 0,
 'masculine': 1,
 'culturally': 2,
 'induce': 3,
 'township': 4,
 'gypsy': 5,
 'angular': 6,
 'president': 7,
 'syllable': 8,
 'eliminated': 9,
 'march': 10,
 'turns': 11,
 'accents': 12,
 'agnes': 13,
 'peaceful': 14,
 'actor': 15,
 'person': 16,
 'cookies': 17,
 'imposed': 18,
 'te': 19,
 'op': 20,
 'alpha': 21,
 'explain': 22,
 'rue': 23,
 'browns': 24,
 'minimize': 25,
 'relatively': 26,
 'compact': 27,
 'southwestern': 28,
 'natives': 29,
 'aug': 30,
 'mumbai': 31,
 'count': 32,
 'positioned': 33,
 'east': 34,
 'dana': 35,
 'treaties': 36,
 'risen': 37,
 'kate': 38,
 'eligible': 39,
 'inventor': 40,
 'greenwich': 41,
 'mercury': 42,
 'selective': 43,
 'dinosaurs': 44,
 'receive': 45,
 'black': 46,
 'successors': 47,
 'underway': 48,
 'sky': 49,
 'excluding': 50,
 'fathers': 51,
 'spent': 52,
 'loud': 53,
 'continents': 54,
 'brazil': 55,
 'bel': 56,
 'paint': 57,
 'navy': 58,
 'rectangular': 59,
 'profiles': 60,
 'establishment': 61,
 'orbital': 62,
 'campaign': 63,

# PMI Embedding

In [66]:
text_t = text_filtered[:500]
#vocab_t = set(text_t)

## First, create a word count matrix

In [10]:
s = "to be or not to be"

In [67]:
import itertools
pairs = list(itertools.permutations(vocab, 2))

In [71]:
c_pairs = Counter(pairs)

In [15]:
c_pairs

Counter({('renewed', 'masculine'): 1,
         ('renewed', 'culturally'): 1,
         ('renewed', 'induce'): 1,
         ('renewed', 'township'): 1,
         ('renewed', 'gypsy'): 1,
         ('renewed', 'angular'): 1,
         ('renewed', 'president'): 1,
         ('renewed', 'syllable'): 1,
         ('renewed', 'eliminated'): 1,
         ('renewed', 'march'): 1,
         ('renewed', 'turns'): 1,
         ('renewed', 'accents'): 1,
         ('renewed', 'agnes'): 1,
         ('renewed', 'peaceful'): 1,
         ('renewed', 'actor'): 1,
         ('renewed', 'person'): 1,
         ('renewed', 'cookies'): 1,
         ('renewed', 'imposed'): 1,
         ('renewed', 'te'): 1,
         ('renewed', 'op'): 1,
         ('renewed', 'alpha'): 1,
         ('renewed', 'explain'): 1,
         ('renewed', 'rue'): 1,
         ('renewed', 'browns'): 1,
         ('renewed', 'minimize'): 1,
         ('renewed', 'relatively'): 1,
         ('renewed', 'compact'): 1,
         ('renewed', 'southwestern'): 1,

In [72]:
# Count Np(wi,wj)
start = time.time()
for i in range(len(text_t)-5):
    window = text_t[i:i+6]
    window_pairs = list(itertools.permutations(window, 2))
    for pair in window_pairs:
        c_pairs[pair] += 1
end = time.time()
print("time elapsed: " + str(end-start))

time elapsed: 0.009650230407714844


In [76]:
c_pairs[(vocab[1209],vocab[11])]

1

In [73]:
# Count N(Sp)
Sp = [pair for pair in pairs if c_pairs[pair] > 1]
num_Sp = len(Sp)

In [74]:
num_Sp

4572

In [68]:
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
M = lil_matrix((len(vocab), len(vocab)), dtype=np.int8)

In [42]:
M.shape

(13201, 13201)

In [69]:
# Count Np(wi,wj)
start = time.time()
for i in range(len(text_t)-5):
    window = text_t[i:i+6]
    window_pairs = list(itertools.permutations(window, 2))
    for pair in window_pairs:
        M[index[pair[0]], index[pair[1]]] += 1
end = time.time()
print("time elapsed: " + str(end-start))

time elapsed: 0.13599681854248047


In [34]:
M = M.tocsr()

In [70]:
M.count_nonzero()

4584

In [47]:
nonzero = M.nonzero()

In [48]:
nonzero

(array([   11,    11,    11, ..., 13189, 13189, 13189], dtype=int32),
 array([ 1209,  1732,  2287, ...,  6473, 12685, 13038], dtype=int32))

In [49]:
M[11,1209]

5

In [55]:
M[1209,11]

5

In [56]:
N_w = M.sum(axis = 1)

In [61]:
N_w[11]

matrix([[60]], dtype=int64)