In [1]:
import sys
from time import time

from gensim.models.wrappers import FastText

In [2]:
category_list = [
    'Person', 'Location', 'Event', 'Organization', 'DateTime', 'PersonType', 'Currency', 'Nationality', 'Thing'
]

In [3]:
t = time()
# Load pre trained model
w2v_model = FastText.load_fasttext_format(model_file='models/fasttext/cc.tr.300.bin')

print('Time to load the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to load the model: 4.08 mins


In [9]:
import pandas as pd

t = time()

category_vectors = {}
for category in category_list:
    category_vectors[category] = []
    
# Read labeled words
df = pd.read_excel('data/labeled_words_single_v4.xlsx')

counter = 0
hit_word_count = 0
# Read Excel file and fill the dictionary
for row in df.index:
    word = str(df['Word'][row])
    label = df['Label'][row]
    
    if word in w2v_model.wv:
        hit_word_count += 1
        category_vectors[label].append(w2v_model.wv[word])
    
    counter += 1
    sys.stdout.write('\r'+str(counter))

print('Time to create vector array: {} mins'.format(round((time() - t) / 60, 2)))

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277

93765Time to create vector array: 0.34 mins


In [10]:
# print(category_vectors['Person'][0])
print('Missed ' + str(counter - hit_word_count) + ' from ' + str(counter))

Missed 490 from 93765


In [14]:
import numpy as np

t = time()

centroids = {}

for category in category_list:
    centroids[category] = np.sum(category_vectors[category], axis=0) / len(category_vectors[category])
    
print('Time to calculate centroids: {} mins'.format(round((time() - t) / 60, 2)))

Time to calculate centroids: 0.0 mins


In [16]:
#print(category_vectors['Person'][0])  # -5.33770142e+02  --- -9.28570423e-03
#print(len(category_vectors['Person']))
#print(centroids['Person'])

In [17]:
import pandas as pd

from numpy import dot
from numpy.linalg import norm

t = time()

# features will be stored in this dataframe
output_df = pd.DataFrame()

# Read labeled words
df = pd.read_excel('data/labeled_words_single_v4.xlsx')

counter = 0

# Read Excel file and fill the dictionary
for row in df.index:
    word = str(df['Word'][row])
    label = str(df['Label'][row])

    if word in w2v_model.wv:
        
        output = { 'Word': word, 'Label': label  }
        
        # calculate distance to each centroid
        for category in category_list:
            output[category] = dot(w2v_model.wv[word], centroids[category])/(norm(w2v_model.wv[word])*norm(centroids[category]))
            #output[category] = 1 - spatial.distance.cosine(w2v_model.wv[word],centroids[category])
            
        output_df = output_df.append(output, ignore_index=True)
        
    counter += 1
    #if counter == 1000:
        #break
    sys.stdout.write('\r'+str(counter))

# df["Person"] = pd.to_numeric(df["Person"])
print('Time to calculate centroid distances categories: {} mins'.format(round((time() - t) / 60, 2)))
output_df.head(5)
# print('Missed ' + str(counter - hit_word_count) + ' from ' + str(counter))

93765Time to calculate centroid distances categories: 42.28 mins


Unnamed: 0,Currency,DateTime,Event,Label,Location,Nationality,Organization,Person,PersonType,Thing,Word
0,0.250148,0.122472,0.338534,Person,0.560342,0.338208,0.474824,0.652741,0.333379,0.365368,Corina
1,0.162727,0.191302,0.288349,Person,0.351361,0.222733,0.320877,0.431188,0.316707,0.320074,Casanova
2,0.258475,0.158841,0.253603,Location,0.217465,0.160455,0.288744,0.165113,0.136111,0.167267,İsviçre
3,0.205449,0.221836,0.289384,PersonType,0.288628,0.253223,0.291755,0.256634,0.286041,0.322561,Şansölyesidir
4,0.172876,0.070771,0.125732,PersonType,0.175117,0.220059,0.19262,0.253646,0.467836,0.173848,avukat


In [18]:
t = time()

writer = pd.ExcelWriter('data/centroid_distances_v4.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
output_df.to_excel(writer, sheet_name='Sheet1', index=False)

# Close the Pandas Excel writer and output the Excel file.
writer.save()
print('Time to save: {} mins'.format(round((time() - t) / 60, 4)))

Time to save: 0.5978 mins


In [32]:
output_df.loc[899]

Currency         0.226872
DateTime        0.0928929
Event            0.314503
Label              Person
Location         0.401173
Nationality      0.264712
Organization     0.345497
Person           0.446384
PersonType       0.274834
Thing            0.323062
Word               Kruger
Name: 899, dtype: object