### reference
- https://multithreaded.stitchfix.com/blog/2017/10/18/stop-using-word2vec/
- http://camberbridge.github.io/2016/07/08/自己相互情報量-Pointwise-Mutual-Information-PMI-について/
- https://www.kaggle.com/hacker-news/hacker-news-corpus

In [1]:
import math
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from tqdm import tqdm

In [2]:
df = pd.read_csv("../input/hacker_news_sample.csv")

#df = df.sample(10000)
print(df.shape)
df.head()

(3659697, 14)


Unnamed: 0,title,url,text,dead,by,score,time,type,id,parent,descendants,ranking,deleted,timestamp
0,,,&gt;<i>which leads me to say why are you using...,,coldtea,,1390844000.0,comment,7131680,7127578.0,,,,2014-01-27T17:31:13Z
1,,,I would like to point out some counter-example...,,etanol,,1319396000.0,comment,3146879,3145330.0,,,,2011-10-23T18:46:40Z
2,,,,,,,1456641000.0,comment,11190089,11189361.0,,,True,2016-02-28T06:26:56Z
3,,,<i>Our msbuild implementation can now build Pr...,,Locke1689,,1407882000.0,comment,8170491,8170071.0,,,,2014-08-12T22:13:10Z
4,,,No matter how awful iPhoto is it's still bette...,,miloshadzic,,1362573000.0,comment,5330773,5327590.0,,,,2013-03-06T12:28:02Z


In [3]:
text = df["text"]
text = text[text.isnull() == False].values
len(text)

2984974

In [4]:
sentences = []
for sentence in tqdm(text):
    words = [i.lower() for i in sentence.split()]
    sentences.extend(words)

100%|██████████| 2984974/2984974 [00:36<00:00, 82163.98it/s]


In [5]:
unigram_counts = {}
for word in tqdm(sentences):
    if word not in unigram_counts.keys():
        unigram_counts[word] = 1
    else:
        unigram_counts[word] += 1

100%|██████████| 181603054/181603054 [02:07<00:00, 1425363.09it/s]


In [6]:
nb_skip = 2
skipgram_count = {}
for idx in tqdm(range(len(sentences) - nb_skip)):
    w1 = sentences[idx]
    w2 = sentences[idx+nb_skip]
    skip_pair = " ".join(sorted([w1, w2]))
    if skip_pair not in skipgram_count.keys():
        skipgram_count[skip_pair] = 1
    else:
        skipgram_count[skip_pair] += 1

100%|██████████| 181603052/181603052 [05:45<00:00, 525645.30it/s]


In [7]:
n_text = len(sentences)

def calc_pmi(word1, word2):
    bind_w = " ".join([word1, word2])
    pxy = skipgram_count[bind_w] / n_text
    px = unigram_counts[word1] / n_text
    py = unigram_counts[word2] / n_text
    pmi = math.log2(pxy / (px * py))
    return pmi

In [8]:
calc_pmi("the", "twitter")

0.3826487152819245

In [9]:
calc_pmi("and", "the")

1.323790313780776

In [10]:
min_count = 3

def min_count_filter(dic, count):
    tmp = {}
    for k, v in tqdm(dic.items()):
        if v < min_count:
            continue
        tmp[k] = v
    return tmp

In [11]:
unigram_counts = min_count_filter(unigram_counts, min_count)
skipgram_count = min_count_filter(skipgram_count, min_count)

100%|██████████| 4892981/4892981 [00:09<00:00, 504417.37it/s]
100%|██████████| 36607403/36607403 [00:13<00:00, 2699173.46it/s]


In [12]:
pmi_lst = []
for k, v in tqdm(unigram_counts.items()):
    _lst = []
    for kk, vv in unigram_counts.items():
        try:
            pmi = calc_pmi(k, kk)
        except KeyError:
            pmi = 0
        _lst.append(pmi)
    pmi_lst.append(_lst)
pmi_arr = np.array(pmi_lst)
pmi_arr

  0%|          | 99/644802 [01:10<131:07:29,  1.37it/s]

KeyboardInterrupt: 

In [None]:
U, S, V = svds(pmi_arr, k=256) 

In [None]:
w_vec_dic = {}
for v, w in zip(U, unigram_counts.keys()):
    w_vec_dic[w] = v

In [None]:
similarities = np.dot(U, w_vec_dic["facebook"])
[list(unigram_counts.keys())[i] for i in np.argsort(similarities)[:3]]

In [None]:
similarities = np.dot(U, w_vec_dic["haskell"])
[list(unigram_counts.keys())[i] for i in np.argsort(similarities)[:3]]