# K-Means Clustering Evaluation

This Python notebook evaluates the word-to-cluster dictionaries produced by K-Means clustering. It can be used to:

- Find the cluster a word belongs to 
- Find the other words in the cluster for a specific word
- Compare between clusters

In [1]:
import pickle

# Specify the files
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
FILE_DICT_250 = "C:/Users/MyPC/Desktop/FYP/Word Dictionaries/dict_250C.pk"
FILE_CLUS_250 = "C:/Users/MyPC/Desktop/FYP/K-Means Models/full_250C.pk"

FILE_DICT_500 = "C:/Users/MyPC/Desktop/FYP/Word Dictionaries/dict_500C.pk"
FILE_CLUS_500 = "C:/Users/MyPC/Desktop/FYP/K-Means Models/full_500C.pk"


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (or fails), instead of being left open for the lifetime
    of the kernel.

    NOTE: unpickling can execute arbitrary code; only load files you created
    yourself or otherwise trust.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load using pickle
array_dict_cluster_250 = _load_pickle(FILE_DICT_250)
word_centroid_map_250 = _load_pickle(FILE_CLUS_250)

array_dict_cluster_500 = _load_pickle(FILE_DICT_500)
word_centroid_map_500 = _load_pickle(FILE_CLUS_500)

# Number of clusters = highest cluster number + 1
# (assumes cluster numbers are integers starting at 0 -- TODO confirm)
total_clusters_250 = max(word_centroid_map_250.values()) + 1
total_clusters_500 = max(word_centroid_map_500.values()) + 1

# Mean number of words per cluster, rounded to the nearest integer
average_word_250 = round(len(word_centroid_map_250)/total_clusters_250)
average_word_500 = round(len(word_centroid_map_500)/total_clusters_500)

# Display results
# The total is taken from the 250-cluster map only.
print("TOTAL WORDS: %i \n" % (len(word_centroid_map_250)))

print("AVERAGE PER CLUSTER (250): %i" % (average_word_250))
print("AVERAGE PER CLUSTER (500): %i" % (average_word_500))

TOTAL WORDS: 1146604 

AVERAGE PER CLUSTER (250): 4586
AVERAGE PER CLUSTER (500): 2293


In [32]:
# Word whose cluster neighbours we want to inspect
search = "gives"

# 250-cluster model: look up the word's cluster number, then fetch the
# word list stored for that cluster
cluster_num_250 = word_centroid_map_250[search]
words_250 = array_dict_cluster_250[cluster_num_250]['word_list']

# 500-cluster model: same two-step lookup against its own map and dictionary
cluster_num_500 = word_centroid_map_500[search]
words_500 = array_dict_cluster_500[cluster_num_500]['word_list']

# Report the searched word and the size of its cluster in each model
print("SEARCHED WORD: %s \n" % search)
print("TOTAL WORDS (250): %i" % len(words_250))
print("TOTAL WORDS (500): %i \n" % len(words_500))

# Show at most the first 200 words of each cluster to keep the output readable
print("WORDS (250): ", words_250[:200], "\n\n")
print("WORDS (500): ", words_500[:200])

SEARCHED WORD: gives 

TOTAL WORDS (250): 1817
TOTAL WORDS (500): 1539 

WORDS (250):  ['transcended', 'obliges', 'instigates', 'consists', 'jizzes', 'overvalues', 'ownes', 'fantasizes', 'lessens', 'prepares', 'rests', 'sees', 'mistook', 'symbolises', 'narrows', 'reconfigures', 'recaptures', 'handwaves', 'undertakes', 'dumps', 'regroups', 'suceeds', 'vacillates', 'clobbers', 'inhales', 'determines', 'energizes', 'befuddles', 'consideres', 'dies', 'gushes', 'regurgitates', 'terminates', 'poaches', 'retreats', 'faints', 'slaughters', 'rouses', 'defaces', 'overthinks', 'unfreezes', 'reanimates', 'conserves', 'oozes', 'gropes', 'antagonises', 'evades', 'scares', 'trashes', 'skimps', 'explains', 'enters', 'swears', 'mantains', 'saps', 'debases', 'mismanages', 'entertains', 'whithers', 'tidies', 'hijacks', 'infers', 'foists', 'talks', 'corrects', 'recurs', 'camouflages', 'adheres', 'decries', 'dabbles', 'dispatches', 'showcases', 'mashes', 'thrives', 'loves', 'revitalizes', 'raises', 'naviga

In [40]:
import numpy as np

SENTENCE = "go to hell you stupid maggot"

# One counter per cluster of the 500-cluster model, all starting at zero
bag_of_centroids = np.zeros(total_clusters_500, dtype=np.float32)

# Walk through the whitespace-separated words of the sentence
for word in SENTENCE.split():

    # Words that are not in the 500-cluster map are skipped
    if word not in word_centroid_map_500:
        continue

    # Cluster number assigned to this word
    index = word_centroid_map_500[word]

    # Show the word and its cluster number so the mapping can be checked by eye
    print(word, index)

    # Count one more occurrence for that cluster
    bag_of_centroids[index] += 1

bag_of_centroids

go 378
to 378
hell 378
you 119
stupid 122
maggot 373


array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0