# K-Means Clustering Evaluation

This Python notebook is used for evaluation of a dictionary that is produced by:

- Find the cluster a word belongs to 
- Find the other words in the cluster for a specific word
- Compare between clusters

In [1]:
import pickle

# Specify the files
FILE_DICT_250 = "C:/Users/MyPC/Desktop/Vegito/Word Dictionaries/dict_250C.pk"
FILE_CLUS_250 = "C:/Users/MyPC/Desktop/Vegito/K-Means Models/full_250C.pk"

FILE_DICT_500 = "C:/Users/MyPC/Desktop/Vegito/Word Dictionaries/dict_500C.pk"
FILE_CLUS_500 = "C:/Users/MyPC/Desktop/Vegito/K-Means Models/full_500C.pk"

# Load using pickle
array_dict_cluster_250 = pickle.load(open(FILE_DICT_250, "rb"))
word_centroid_map_250 =  pickle.load(open(FILE_CLUS_250,"rb"))

array_dict_cluster_500 = pickle.load(open(FILE_DICT_500, "rb"))
word_centroid_map_500 =  pickle.load(open(FILE_CLUS_500,"rb"))

total_clusters_250 = max(word_centroid_map_250.values()) + 1
total_clusters_500 = max(word_centroid_map_500.values()) + 1

average_word_250 = round(len(word_centroid_map_250)/total_clusters_250)
average_word_500 = round(len(word_centroid_map_500)/total_clusters_500)

# Display results
print("TOTAL WORDS: %i \n" % (len(word_centroid_map_250)))

print("AVERAGE PER CLUSTER (250): %i" % (average_word_250))
print("AVERAGE PER CLUSTER (500): %i" % (average_word_500))

TOTAL WORDS: 1146604 

AVERAGE PER CLUSTER (250): 4586
AVERAGE PER CLUSTER (500): 2293


In [2]:
# Find the cluster of words, based on a given word
search = "scumbag"

# Get the key, or cluster number
# NOTE: Different clusters can have same results
cluster_num_250 = word_centroid_map_250[search]
cluster_num_500 = word_centroid_map_500[search]

# Return the array based on the cluster number
words_250 = array_dict_cluster_250[cluster_num_250]['word_list']
words_500 = array_dict_cluster_500[cluster_num_500]['word_list']

# Display results
print("SEARCHED WORD: %s \n" % (search))

print("TOTAL WORDS (250): %i" % (len(words_250)))
print("TOTAL WORDS (500): %i \n" % (len(words_500)))

print("WORDS (250): ", words_250[:200], "\n\n")
print("WORDS (500): ", words_500[:200])

SEARCHED WORD: scumbag 

TOTAL WORDS (250): 3385
TOTAL WORDS (500): 1148 

WORDS (250):  ['creepster', 'spasticated', 'poseur', 'barrista', 'lyncher', 'assbag', 'douce', 'asahole', 'bitchboy', 'plebeian', 'indulger', 'nutzo', 'schooler', 'sterotype', 'turbonerd', 'showbusiness', 'astroturfer', 'hosebeast', 'fop', 'traitorous', 'trifflin', 'nooblet', 'shithawk', 'shmoe', 'mongoloid', 'interneter', 'ungratefull', 'joykill', 'edumacated', 'toity', 'struggler', 'ludite', 'coldblooded', 'crossfiter', 'scheister', 'softhearted', 'insufferable', 'salty', 'bandwagonner', 'nigged', 'comie', 'plebian', 'bogan', 'mastermind', 'whingey', 'punker', 'scumhole', 'pigkin', 'slaphead', 'schlubby', 'peson', 'bollocking', 'runescaper', 'dindunuffin', 'lapdog', 'fuckpipe', 'meatbag', 'smarmy', 'douchie', 'whore', 'trekkie', 'cockface', 'milquetoast', 'cuntself', 'kneckbeard', 'fag', 'rapie', 'bratty', 'schitck', 'bumptious', 'telemarketer', 'tarded', 'cornball', 'hardon', 'mewling', 'buttheaded', 'scumlor

In [3]:
# Perform two types of clustering: K-Means and Mini Batch K Means

from gensim.models import Word2Vec as w2v
from sklearn.cluster import KMeans, MiniBatchKMeans

import numpy as np
import time



In [4]:
# Load the Word2Vec model
print("LOADING WORD2VEC MODEL \n\n")
FILE = "C:/Users/MyPC/Desktop/Vegito/W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)

LOADING WORD2VEC MODEL 




In [6]:
WORDS = 100000

# Get the word vectors and words
print("GETTING WORD VECTORS AND WORDS")
word_vectors = model.syn0[:WORDS]
words = model.index2word[:WORDS]

# Perform Minibatch K-Means clustering
# Use 250 Clusters
CLUSTERS = 2000
k_means = MiniBatchKMeans(n_clusters = CLUSTERS)

# Fit the model, get the centroid number and calculate time
print("TRAINING K-MEANS WITH %i CLUSTERS \n\n" % (CLUSTERS))
start = time.time()
idx = k_means.fit_predict(word_vectors)
end = time.time()

print('TIME TAKEN', end-start)

# Store it in a dictionary
print('STORING IN DICTIONARY')
word_centroid_map = dict(zip(words,idx))

GETTING WORD VECTORS AND WORDS
TRAINING K-MEANS WITH 2000 CLUSTERS 




  init_size=init_size)


TIME TAKEN 284.44727635383606
STORING IN DICTIONARY


In [17]:
# Test it out
word = 'genre'

# Get cluster number
cluster = word_centroid_map[word]

# Append for words in same cluster
word_list = [ word for word, cluster_num in word_centroid_map.items() if cluster == cluster_num ]

print('CLUSTER NUMBER: %i' % (cluster))
print('NUMBER OF WORDS: %i \n' % (len(word_list)))
print('WORDS: ' ,word_list)

CLUSTER NUMBER: 522
NUMBER OF WORDS: 112 

WORDS:  ['acapellas', 'grime', 'beatport', 'brostep', 'trappy', 'screamo', 'melodic', 'soundscapes', 'britpop', 'neurofunk', 'electronica', 'breakcore', 'hop', 'oldies', 'nasheeds', 'bap', 'indie', 'idm', 'groovy', 'rnb', 'electronic', 'trance', 'bhangra', 'genres', 'punk', 'psybient', 'dancey', 'jangle', 'bigroom', 'djs', 'subgenre', 'bossa', 'ska', 'djent', 'rockabilly', 'bachata', 'genre', 'eclectic', 'prog', 'psychadelic', 'lofi', 'remixes', 'bluegrass', 'danceable', 'ragga', 'synthpop', 'gabber', 'edm', 'complextro', 'avant', 'psytrance', 'chiptune', 'dubstep', 'funk', 'synthwave', 'nightcore', 'chillwave', 'techno', 'shoegaze', 'synthy', 'hip', 'thrash', 'dnb', 'subgenres', 'mashups', 'grunge', 'remixing', 'moombahton', 'deathcore', 'monstercat', 'jpop', 'chillstep', 'reggae', 'music', 'motown', 'hardstyle', 'dancehall', 'futurebeats', 'riddim', 'mathcore', 'grindcore', 'folky', 'hiphop', 'autotuned', 'psychedelia', 'instrumental', 'regg