In [1]:
import gensim
import scipy.spatial.distance as dist
import numpy as np
import math
import pandas as pd

In [2]:
from manager import DatasetManager

In [3]:
from datetime import datetime

In [4]:
# Load the pre-trained word vectors stored in binary word2vec format.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/tadas/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [3]:
# Read the word list from disk.
df = pd.read_csv('/home/tadas/words_df.csv')
# Re-label the column named "0" (a string) with the integer label 0 so the
# rest of the notebook can address it as df[0].
df[0] = df.pop("0")
# Drop the 'Unnamed: 0' column (presumably an index column written when the
# CSV was saved -- TODO confirm).
del df['Unnamed: 0']

In [6]:
def datapoints():
    """Lazily yield ``model[word]`` for every word in ``df[0]``, in row order.

    Reads the module-level ``df`` and ``model``. Items come out in the same
    order as the entries of ``df[0]``, so the n-th yielded item corresponds to
    the n-th entry of that column.

    NOTE(review): a word missing from ``model`` presumably makes the lookup
    raise; nothing is filtered here -- confirm the word list is in-vocabulary.
    """
    yield from (model[token] for token in df[0])

In [7]:
def arccosdist(vect1, vect2):
    """Return the angle between two vectors, in degrees.

    ``dist.cosine`` gives the cosine distance ``1 - cos(theta)``; the cosine
    similarity is recovered as ``1 - dist.cosine(...)`` and converted to an
    angle with ``arccos``.

    Parameters
    ----------
    vect1, vect2 : array-like
        1-D numeric vectors of equal length (NumPy arrays, lists or tuples).

    Returns
    -------
    int or float
        ``0`` when the two inputs are element-wise identical, otherwise the
        angle in degrees, within ``[0, 180]``.

    Notes
    -----
    Zero-length (all-zero) vectors are not special-cased; the result is
    whatever ``dist.cosine`` produces for them.
    """
    # Fast path for identical inputs. np.array_equal also works for plain
    # lists/tuples, where ``(vect1 == vect2).all()`` would fail because the
    # comparison yields a bare bool.
    if np.array_equal(vect1, vect2):
        return 0

    similarity = 1 - dist.cosine(vect1, vect2)
    # Floating-point rounding can push the similarity marginally outside
    # [-1, 1] for (anti)parallel vectors, which would make arccos return NaN.
    similarity = np.clip(similarity, -1.0, 1.0)
    return math.degrees(np.arccos(similarity))

In [36]:
# Run the clustering pipeline, printing a timestamp before and after each
# stage so the duration of every step can be read off the output.
#
# - vertex_iter: the generator of word vectors produced by datapoints().
# - centers_num: callable mapping a count x to int(sqrt(x)) (truncated);
#   presumably x is the number of vertices -- TODO confirm in manager.py.
# - distance_funct: arccosdist, which returns an angle in degrees.
# - epsilon: presumably a distance threshold in the same unit (degrees) --
#   TODO confirm in manager.py.
print(datetime.now())
manager = DatasetManager(vertex_iter=datapoints(),
                         centers_num=lambda x: int(math.sqrt(x)),
                         distance_funct=arccosdist,
                         epsilon=77)
manager.get_centers_ready()
print(datetime.now())
# NOTE: `calulate_homologies` (sic) is the method name as called here; its
# return value is kept as `worker` and inspected in later cells.
worker = manager.calulate_homologies()
print(datetime.now())
manager.cluster(report_homologies=True)
print(datetime.now())

2018-01-17 18:23:10.024470
2018-01-17 18:23:10.154529


KeyboardInterrupt: 

In [31]:
# Translate every cluster from row indices into the corresponding words and
# keep only the first ten entries of the homology sequence paired with it.
new_clusters = []
for member_ids, homology in manager.clusters:
    words = [df[0][member] for member in member_ids]
    new_clusters.append((words, homology[:10]))

In [32]:
# Display the (words, first ten homology entries) pairs for inspection.
new_clusters

[(['group'], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['repository'], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['trading',
   'trade',
   'manufacturing',
   'warehouse',
   'factory',
   'mart',
   'emporium',
   'outfit',
   'shop',
   'outlet',
   'foundry',
   'drugstore',
   'bottle',
   'thrift',
   'cache'],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['embankment',
   'ledge',
   'incline',
   'cliff',
   'reef',
   'levee',
   'lakeshore',
   'beach',
   'oceanfront',
   'lakefront',
   'seafront',
   'coast',
   'seaboard',
   'lakeside',
   'riverfront',
   'heap',
   'rank',
   'amass',
   'disperse',
   'dissipate',
   'accumulate',
   'use',
   'hide',
   'uncover',
   'save',
   'mound',
   'pitch'],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['sequence'], [0, 2, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['workshop'], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['firm'], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 (['stash'], [0, 2, 1, 0, 0, 0, 0, 0, 0, 0]),
 (['setup'], [0, 2, 0, 0, 0, 0, 0, 0, 0, 0]),
 (['savings', 'bank'], [

In [7]:
# Find where the word 'repository' occurs in the word column: np.argwhere
# returns the indices at which the comparison is True.
np.argwhere(df[0] == 'repository')

array([[1]])

In [16]:
# Sanity check: np.argwhere yields zero rows when no element matches, so
# len(...) can be used to test for "value not present".
x = np.array([
    [0, 1, 2, 3],
    [5, 6, 0, 0],
])
len(np.argwhere(x == 10))

0

In [4]:
# Look up the word stored at index 68 of the word column.
df[0][68]

'work'

In [19]:
# List every vertex whose arccosdist angle to vertex 154 is at most 70 degrees,
# printing its index together with the word at that index.
for position, candidate in enumerate(manager.vertices):
    within_range = arccosdist(candidate, manager.vertices[154]) <= 70
    if within_range:
        print(position, df[0][position])

11 sweatshop
15 showroom
22 laboratory
43 shop
66 store
78 mill
108 depot
126 manufacturing
140 manufactory
141 warehouse
147 foundry
154 factory


In [33]:
# Print the homology recorded for every edge that contains vertex 154.
for endpoints, edge_homology in worker.edge_homologies.items():
    if 154 not in endpoints:
        continue
    print(endpoints, edge_homology)

frozenset({154, 141}) [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
frozenset({154, 93}) [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [35]:
# Look up the word stored at index 71 of the word column.
df[0][71]

'storehouse'

In [34]:
# For each vertex, print the positions at which its homology sequence is
# non-zero (an empty array means every entry is zero).
for vertex_id, vertex_homology in worker.vertex_homologies.items():
    nonzero_mask = np.array(vertex_homology) != 0
    nonzero_positions = np.argwhere(nonzero_mask).flatten()
    print(vertex_id, nonzero_positions)

0 [1]
1 [1]
2 []
3 []
4 [1]
5 [1]
6 [2]
7 [1 2]
8 [1]
9 [2]
10 []
11 [1]
12 []
13 [3]
14 []
15 [1]
16 [1]
17 [1]
18 [1]
19 [2]
20 [1]
21 [2]
22 [1]
23 [2]
24 [1]
25 [2]
26 []
27 []
28 [1]
29 [1]
30 [1]
31 []
32 [1]
33 [1]
34 [1]
35 []
36 [1 2]
37 []
38 []
39 [1]
40 [1]
41 [1]
42 [1]
43 []
44 []
45 []
46 []
47 [2]
48 [0]
49 []
50 []
51 [1]
52 []
53 [2]
54 [1]
55 [1 2]
56 [1]
57 []
58 [1]
59 []
60 []
61 [1]
62 [1]
63 []
64 []
65 [1]
66 [2]
67 []
68 [1]
69 [1]
70 []
71 [3]
72 [1]
73 [1]
74 [0]
75 [1]
76 [2]
77 []
78 [1]
79 [2]
80 [0]
81 [1]
82 [3]
83 [2]
84 [1]
85 [1]
86 []
87 []
88 [1]
89 []
90 [1]
91 []
92 []
93 [1]
94 [1]
95 [1]
96 []
97 []
98 [1]
99 [1]
100 []
101 [2]
102 []
103 [1]
104 []
105 []
106 [1]
107 [1]
108 [2]
109 [1]
110 [1 2]
111 [1]
112 [1]
113 [1]
114 []
115 []
116 [2]
117 []
118 [2]
119 [2]
120 []
121 [2]
122 []
123 [1]
124 [1]
125 []
126 []
127 [2]
128 []
129 [1 2]
130 [1]
131 []
132 []
133 []
134 [1]
135 [1]
136 [1]
137 [2]
138 [1]
139 [1]
140 [2]
141 []
142 [1]
143 [

In [2]:
# Read the word list from disk.
df = pd.read_csv('/home/tadas/words_df.csv')
# Re-label the column named "0" (a string) with the integer label 0 so the
# rest of the notebook can address it as df[0].
df[0] = df.pop("0")
# Drop the 'Unnamed: 0' column (presumably an index column written when the
# CSV was saved -- TODO confirm).
del df['Unnamed: 0']

In [3]:
# Look up the word stored at index 43 of the word column.
df[0][43]

'shop'