# K-Means Clustering Evaluation

This Python notebook evaluates the word-cluster dictionaries produced by K-Means clustering. It can be used to:

- Find the cluster a word belongs to
- Find the other words in the cluster for a specific word
- Compare clusters

In [1]:
import pickle

# Specify the files
# NOTE: these are absolute, machine-specific paths; edit them to match your setup.
FILE_DICT_250 = "C:/Users/MyPC/Desktop/FYP/Word Dictionaries/dict_250C.pk"
FILE_CLUS_250 = "C:/Users/MyPC/Desktop/FYP/K-Means Models/full_250C.pk"

FILE_DICT_500 = "C:/Users/MyPC/Desktop/FYP/Word Dictionaries/dict_500C.pk"
FILE_CLUS_500 = "C:/Users/MyPC/Desktop/FYP/K-Means Models/full_500C.pk"


def _load_pickle(path):
    """Return the object pickled at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (or fails), instead of being left open for the lifetime
    of the kernel.

    WARNING: unpickling can execute arbitrary code -- only load files that
    you produced yourself or otherwise trust.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load using pickle
# array_dict_cluster_*: indexed by cluster number; each entry holds a 'word_list'.
# word_centroid_map_*:  maps a word to its cluster number.
array_dict_cluster_250 = _load_pickle(FILE_DICT_250)
word_centroid_map_250 = _load_pickle(FILE_CLUS_250)

array_dict_cluster_500 = _load_pickle(FILE_DICT_500)
word_centroid_map_500 = _load_pickle(FILE_CLUS_500)

# Number of clusters = highest cluster number + 1
# (assumes cluster numbers start at 0 -- TODO confirm against the clustering step)
total_clusters_250 = max(word_centroid_map_250.values()) + 1
total_clusters_500 = max(word_centroid_map_500.values()) + 1

# Mean number of words per cluster, rounded to the nearest integer
average_word_250 = round(len(word_centroid_map_250)/total_clusters_250)
average_word_500 = round(len(word_centroid_map_500)/total_clusters_500)

# Display results
# The word total is taken from the 250-cluster map; presumably both maps
# cover the same vocabulary -- verify if the models were trained separately.
print("TOTAL WORDS: %i \n" % (len(word_centroid_map_250)))

print("AVERAGE PER CLUSTER (250): %i" % (average_word_250))
print("AVERAGE PER CLUSTER (500): %i" % (average_word_500))

TOTAL WORDS: 1146604 

AVERAGE PER CLUSTER (250): 4586
AVERAGE PER CLUSTER (500): 2293


In [2]:
# Look up the cluster that contains a given word, under both models
search = "scumbag"

# Resolve the word to its cluster number in each model.
# NOTE (original author): Different clusters can have same results
cluster_num_250, cluster_num_500 = (
    word_centroid_map_250[search],
    word_centroid_map_500[search],
)

# Pull the word list stored for each of those cluster numbers
entry_250 = array_dict_cluster_250[cluster_num_250]
words_250 = entry_250['word_list']
entry_500 = array_dict_cluster_500[cluster_num_500]
words_500 = entry_500['word_list']

# Report: the searched word, the size of each cluster, then a preview
# of at most the first 200 words from each cluster
print("SEARCHED WORD: %s \n" % search)

size_250 = len(words_250)
size_500 = len(words_500)
print("TOTAL WORDS (250): %i" % size_250)
print("TOTAL WORDS (500): %i \n" % size_500)

preview_250 = words_250[:200]
preview_500 = words_500[:200]
print("WORDS (250): ", preview_250, "\n\n")
print("WORDS (500): ", preview_500)

SEARCHED WORD: scumbag 

TOTAL WORDS (250): 3385
TOTAL WORDS (500): 1148 

WORDS (250):  ['creepster', 'spasticated', 'poseur', 'barrista', 'lyncher', 'assbag', 'douce', 'asahole', 'bitchboy', 'plebeian', 'indulger', 'nutzo', 'schooler', 'sterotype', 'turbonerd', 'showbusiness', 'astroturfer', 'hosebeast', 'fop', 'traitorous', 'trifflin', 'nooblet', 'shithawk', 'shmoe', 'mongoloid', 'interneter', 'ungratefull', 'joykill', 'edumacated', 'toity', 'struggler', 'ludite', 'coldblooded', 'crossfiter', 'scheister', 'softhearted', 'insufferable', 'salty', 'bandwagonner', 'nigged', 'comie', 'plebian', 'bogan', 'mastermind', 'whingey', 'punker', 'scumhole', 'pigkin', 'slaphead', 'schlubby', 'peson', 'bollocking', 'runescaper', 'dindunuffin', 'lapdog', 'fuckpipe', 'meatbag', 'smarmy', 'douchie', 'whore', 'trekkie', 'cockface', 'milquetoast', 'cuntself', 'kneckbeard', 'fag', 'rapie', 'bratty', 'schitck', 'bumptious', 'telemarketer', 'tarded', 'cornball', 'hardon', 'mewling', 'buttheaded', 'scumlor

In [None]:
# Perform PCA on the word vectors before clustering with K-Means
# Reason: The computation was too heavy and, together with memory constraints, crashed the workstation
# Idea: Reduce the 300-dimensional vectors to N dimensions with PCA, then perform clustering
# NOTE: Find the total variation (explained variance) retained by the N dimensions

