In [4]:
import numpy as np
import pandas as pd
import os

from sklearn.cluster import DBSCAN, KMeans
from wb_nlp import dir_manager
import umap

In [5]:
from gensim.models import Word2Vec

In [6]:
# Load the saved Word2Vec model file from the project's WORD2VEC model directory.
w2v = Word2Vec.load(
    os.path.join(
        dir_manager.get_model_dir('WORD2VEC'),
        'wb-w2vec_ALL_50.mm',
    )
)

In [27]:
# Look up the vocabulary entry for 'java' (entries expose a `.count`, which is
# read for every word further down). `kv` is not used again in the visible cells.
kv = w2v.wv.vocab['java']

In [29]:
# Sanity check: display the embedding vector stored for the word 'java'.
w2v.wv.get_vector('java')

array([ 0.19520357,  0.04155749, -0.20089565,  0.05111561, -0.30242702,
        0.88110745,  0.42764652, -0.30591288, -0.00456821,  0.11168303,
       -0.10363951, -0.44124514,  0.18431583, -0.07108448,  0.02758358,
       -0.16658421, -0.37262014,  0.34898758,  0.5199006 ,  0.54547346,
        0.02725955,  0.20501535, -0.09207716, -0.068601  ,  0.19639523,
        0.4056639 , -0.5512555 , -0.64817333, -0.18392295,  0.07174532,
       -0.37544206,  0.83449966,  0.4029414 ,  0.1631779 ,  0.5051035 ,
       -0.47664785,  0.06698374,  0.26855966,  0.29887244, -0.22791019,
       -0.02170135,  0.05202591,  0.6791098 ,  0.9394384 ,  0.10043009,
       -0.3822091 , -0.10048459,  0.19485329,  0.18161002, -0.1359897 ],
      dtype=float32)

In [34]:
# Every word in the model vocabulary, in the vocabulary mapping's iteration order.
words = list(w2v.wv.vocab)

In [35]:
# Fetch the embedding of every vocabulary word at once; rows are expected to
# follow the order of `words` (later cells pair them positionally).
vecs = w2v.wv[words]

In [147]:
# Corpus frequency recorded in the vocabulary for each word, aligned with `words`.
counts = [w2v.wv.vocab[word].count for word in words]

In [103]:
# Seed NumPy's global RNG so the random starting layout below is reproducible.
np.random.seed(1029)

# One random 3-D starting coordinate (uniform in [0, 1)) per word vector.
init_vec = np.random.random(size=(vecs.shape[0], 3))

# 3-component UMAP over cosine distance, started from the random layout above.
reducer = umap.UMAP(
    n_components=3,
    n_neighbors=5,
    metric='cosine',
    init=init_vec,
    repulsion_strength=2,
    negative_sample_rate=15,
    random_state=1029,
)

In [104]:
%%time
# Fit the UMAP reducer on the full matrix of word vectors; the cell is timed
# because this is the expensive step of the notebook.
reducer.fit(vecs)

CPU times: user 49.5 s, sys: 550 ms, total: 50.1 s
Wall time: 50 s


UMAP(angular_rp_forest=True,
     init=array([[0.04103986, 0.40976958, 0.58871416],
       [0.89993329, 0.12111492, 0.17019841],
       [0.94016805, 0.59530409, 0.33567926],
       ...,
       [0.83521572, 0.05137494, 0.86799723],
       [0.20083059, 0.96858091, 0.54904675],
       [0.19816467, 0.73511673, 0.22283513]]),
     metric='cosine', n_components=3, n_neighbors=5, negative_sample_rate=15,
     random_state=1029, repulsion_strength=2)

In [105]:
# Take the embedding learned by `reducer.fit(vecs)` directly.
# `transform()` is meant for projecting *new* points into a fitted space; calling
# it on the training data itself is at best a redundant hash-and-return of
# `embedding_`, and on umap-learn versions without that shortcut it re-embeds the
# training points as if they were unseen, which is slower and noisier.
reduced_vecs = reducer.embedding_

In [106]:
# Display the reduced coordinates (one row per word, three columns).
reduced_vecs

array([[15.767052 ,  1.4806361,  4.306887 ],
       [11.228562 , -3.6929073,  5.6114264],
       [11.259015 , -3.704559 ,  5.4633465],
       ...,
       [ 6.966355 , 10.104829 ,  4.9372225],
       [-1.4541839,  6.2937803,  6.9871397],
       [-1.8074714,  6.267576 ,  6.4316573]], dtype=float32)

In [121]:
# Density-based clustering of the reduced coordinates: a neighbourhood radius of
# 0.5 and at least 100 neighbours per core point, run on 6 parallel jobs.
clusterer = DBSCAN(
    eps=0.5,
    min_samples=100,
    n_jobs=6,
)
# Alternative that was tried instead of DBSCAN:
# clusterer = KMeans(n_clusters=20, random_state=1029)

In [122]:
# Cluster the reduced word coordinates; the labels are read from `clusterer.labels_` below.
clusterer.fit(reduced_vecs)

DBSCAN(min_samples=100, n_jobs=6)

In [123]:
from collections import Counter

In [125]:
# Tally how many words received each cluster label, then order the table from
# the smallest cluster to the largest.
label_tally = Counter(clusterer.labels_).most_common()
cluster_count = pd.DataFrame(label_tally, columns=['cluster', 'count'])
cluster_count = cluster_count.sort_values(by='count')
cluster_count.head()

Unnamed: 0,cluster,count
16,13,101
15,15,102
14,5,103
13,12,111
12,7,112


In [127]:
# Display the per-cluster word counts (sorted ascending by count).
cluster_count

Unnamed: 0,cluster,count
16,13,101
15,15,102
14,5,103
13,12,111
12,7,112
11,9,137
10,14,143
9,2,143
8,0,183
7,3,209


In [128]:
# Running total of words when clusters are added from smallest to largest.
cluster_count['count'].cumsum()

16      101
15      203
14      306
13      417
12      529
11      666
10      809
9       952
8      1135
7      1344
6      1596
5      1879
4      2187
3      2599
2      3133
1      3804
0     40092
Name: count, dtype: int64

In [129]:
# Cluster labels of the first ten words; -1 is the label scikit-learn's DBSCAN
# gives to points it treats as noise.
clusterer.labels_[:10]

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [130]:
# The first ten vocabulary words, for comparison with the labels shown above.
words[:10]

['policy',
 'research',
 'working_paper',
 'crisis',
 'capital',
 'control',
 'financial',
 'integration',
 'levy',
 'van']

In [148]:
# Reduced coordinates as a frame with one named column per UMAP component.
vecs_df = pd.DataFrame(reduced_vecs, columns=list('xyz'))

# Per-word metadata, row-aligned with `vecs_df`: the word itself, its cluster
# label and its vocabulary count.
meta_df = pd.DataFrame({
    'word': words,
    'cluster': clusterer.labels_,
    'count': counts,
})

In [149]:
# Centre each coordinate column on zero by subtracting its column mean.
vecs_df = vecs_df.sub(vecs_df.mean(), axis='columns')

In [150]:
# Attach the word / cluster / count columns next to the centred coordinates.
# NOTE: re-running this cell appends the metadata columns a second time.
vecs_df = pd.concat([vecs_df, meta_df], axis='columns')

In [153]:
# Build the subset of words written to the app's static data directory:
# every word whose cluster label is NOT listed in `undersample` is kept, while
# the words in the undersampled clusters (here only label -1) are capped to the
# 1000 with the highest vocabulary count.
undersample = [-1]
is_undersampled = vecs_df['cluster'].isin(undersample)

cand_vecs = vecs_df[~is_undersampled]
sample_vecs = vecs_df[is_undersampled].sort_values('count', ascending=False).head(1000)

cand_vecs = pd.concat([cand_vecs, sample_vecs])

# `index` is documented as a bool: pass False explicitly instead of relying on
# None being falsy, so the row index is never written to the CSV.
cand_vecs.to_csv(
    dir_manager.get_path_from_root('app', 'app_vue2', 'public', 'static', 'data', 'w2v_vecs.csv'),
    index=False,
)

In [143]:
# Display the selection that is exported to CSV.
# NOTE(review): this cell's execution count (143) precedes the cells that add the
# `count` column and build the current `cand_vecs` (148-153), so the rendered
# output is stale; re-run after the export cell.
cand_vecs

Unnamed: 0,x,y,z,word,cluster
66,8.609283,1.049061,-6.849031,econ,5
370,-4.090233,-0.292543,-7.632594,linear,0
371,-4.052517,-0.165393,-7.535110,non_linear,0
510,-3.695083,-0.266976,-7.850393,observation,0
524,-3.859502,-0.417105,-8.079831,standard_deviation,0
...,...,...,...,...,...
10578,8.333376,-0.156833,1.680997,stance,-1
3259,3.600918,-6.073382,-3.463656,cache,-1
40090,-6.589330,1.117482,1.907287,spitefully,-1
14101,-0.421665,2.408006,-5.514859,scribe,-1
