In [1]:
"""
Part 3: 
    Objective: Calculate category centroids and measure cosine similarities of words to each category centroid.
"""
import sys
from time import time

from gensim.models.wrappers import FastText

In [2]:
category_list = [
    'Person', 'Location', 'Event', 'Organization', 'DateTime', 'PersonType', 'Currency', 'Nationality', 'Ethnicity', 'Thing'
]

In [3]:
t = time()
# Load pre trained model
w2v_model = FastText.load_fasttext_format(model_file='models/fasttext/cc.tr.300.bin')

print('Time to load the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to load the model: 4.3 mins


In [6]:
import pandas as pd

t = time()

category_vectors = {}
for category in category_list:
    category_vectors[category] = []
    
# Read labeled words
df = pd.read_excel('data/labeled_words_single_v3.xlsx')

counter = 0
hit_word_count = 0
# Read Excel file and fill the dictionary
for row in df.index:
    word = str(df['Word'][row])
    label = df['Label'][row]
    
    if word in w2v_model.wv:
        hit_word_count += 1
        category_vectors[label].append(w2v_model.wv[word])
    
    counter += 1
    sys.stdout.write('\r'+str(counter))

print('Time to vectorize categories: {} mins'.format(round((time() - t) / 60, 2)))

132188Time to vectorize categories: 0.49 mins


In [7]:
#print(category_vectors['Person'])
print('Missed ' + str(counter - hit_word_count) + ' from ' + str(counter))

Missed 1738 from 132188


In [8]:
import numpy as np

t = time()

centroids = {}
#for category in category_list:
#    centroids[category] = [] 

for category in category_list:
    centroids[category] = np.sum(category_vectors[category], axis=0) / len(category_vectors[category])
    
print('Time to calculate centroids: {} mins'.format(round((time() - t) / 60, 2)))

Time to calculate centroids: 0.01 mins


In [9]:
#print(category_vectors['Person'][0])  # -5.33770142e+02  --- -9.28570423e-03
#print(len(category_vectors['Person']))
#print(centroids['Person'])

In [10]:
import pandas as pd

from numpy import dot
from numpy.linalg import norm

t = time()

# features will be stored in this dataframe
output_df = pd.DataFrame()

# Read labeled words
df = pd.read_excel('data/labeled_words_single_v3.xlsx')

counter = 0


# Read Excel file and fill the dictionary
for row in df.index:
    word = str(df['Word'][row])
    label = str(df['Label'][row])

    if word in w2v_model.wv:
        
        output = { 'Word': word, 'Label': label  }
        
        # calculate distance to each centroid
        for category in category_list:
            output[category] = dot(w2v_model.wv[word], centroids[category])/(norm(w2v_model.wv[word])*norm(centroids[category]))
            #output[category] = 1 - spatial.distance.cosine(w2v_model.wv[word],centroids[category])
            
        
        output_df = output_df.append(output, ignore_index=True)
        
    counter += 1
    #if counter == 1000:
        #break
    sys.stdout.write('\r'+str(counter))

# df["Person"] = pd.to_numeric(df["Person"])
print('Time to calculate features categories: {} mins'.format(round((time() - t) / 60, 2)))
output_df.head(5)
# print('Missed ' + str(counter - hit_word_count) + ' from ' + str(counter))

132188Time to calculate features categories: 86.47 mins


Unnamed: 0,Currency,DateTime,Ethnicity,Event,Label,Location,Nationality,Organization,Person,PersonType,Thing,Word
0,0.273349,0.110858,0.360196,0.347695,Person,0.511219,0.338397,0.473211,0.620924,0.335272,0.376251,Corina
1,0.186871,0.174768,0.238327,0.291291,Person,0.329679,0.221148,0.323822,0.419146,0.315501,0.323541,Casanova
2,0.203083,0.050742,0.076906,0.177083,Location,0.098033,0.140946,0.174271,0.097719,0.122597,0.129025,İsviçre
3,0.220789,0.272169,0.279574,0.322181,PersonType,0.306717,0.248588,0.309686,0.287089,0.291445,0.334109,Şansölyesidir
4,0.208029,0.079137,0.218094,0.160399,PersonType,0.218144,0.227898,0.228069,0.279599,0.466141,0.193471,avukat


In [11]:
t = time()

writer = pd.ExcelWriter('data/centroid_distances_v3.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
output_df.to_excel(writer, sheet_name='Sheet1', index=False)

# Close the Pandas Excel writer and output the Excel file.
writer.save()
print('Time to save: {} mins'.format(round((time() - t) / 60, 4)))

Time to save: 0.925 mins


In [17]:
output_df.loc[183]

Currency             0.259803
DateTime             0.406465
Ethnicity            0.407744
Event                0.445915
Label                Location
Location             0.533138
Nationality          0.414596
Organization         0.478263
Person                0.39975
PersonType           0.402069
Thing                 0.38728
Word            Slovakya'daki
Name: 183, dtype: object

In [18]:
t = time()

df = pd.read_excel('data/featuresv3.xlsx')
features_df = pd.DataFrame()
counter = 0


# Read Excel file and fill the dictionary
for row in df.index:
    word = str(df['Word'][row])
    label = str(df['Label'][row])

    if word in w2v_model.wv:
        
        output = { 'Word': word, 'Label': label, 'Length':df['Length'][row], 'All Capital':df['All Capital'][row], 'Start Capital':df['Start Capital'][row], 'Contains Number':df['Contains Number'][row]  }
        
        features_df = features_df.append(output, ignore_index=True)
        
    counter += 1
    #if counter == 1000:
        #break
    sys.stdout.write('\r'+str(counter))

# df["Person"] = pd.to_numeric(df["Person"])
print('Time to filter out features: {} mins'.format(round((time() - t) / 60, 2)))
features_df.head(5)
# print('Missed ' + str(counter - hit_word_count) + ' from ' + str(counter))

132188Time to filter out features: 60.94 mins


Unnamed: 0,All Capital,Contains Number,Label,Length,Start Capital,Word
0,0.0,0.0,Person,6.0,1.0,Corina
1,0.0,0.0,Person,8.0,1.0,Casanova
2,0.0,0.0,Location,7.0,1.0,İsviçre
3,0.0,0.0,PersonType,13.0,1.0,Şansölyesidir
4,0.0,0.0,PersonType,6.0,0.0,avukat


In [19]:
t = time()

writer = pd.ExcelWriter('data/features_filtered.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
features_df.to_excel(writer, sheet_name='Sheet1', index=False)

# Close the Pandas Excel writer and output the Excel file.
writer.save()
print('Time to save: {} mins'.format(round((time() - t) / 60, 4)))

Time to save: 0.4607 mins
