# Exploring representative tuples by clustering the embedding space

In [118]:
import warnings
warnings.filterwarnings('ignore')

import time
from sklearn.cluster import KMeans, Birch
from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import numpy as np
import h5py

## Perform clustering and return the cluster centers

In [72]:
# Paths of the saved embedding models that are handed to getClosestWordEmbedding
# further down in this notebook.
# NOTE(review): the file names suggest one Word2Vec and one FastText model trained
# on an Amazon dataset, resolved relative to the working directory -- confirm.
word2VecModelPath = 'amazonModelWord2Vec.w2v'
fastTextModelPath = 'amazonModelFastText.w2v'

Clustering with KMeans

In [117]:
def getClusterCentersWithKMeans(model, numberOfClusters):
    """Cluster the model's word vectors with KMeans and return the centroids.

    Parameters
    ----------
    model
        Object holding the embeddings: either a model that exposes them
        through ``model.wv`` or the keyed-vectors object itself.
    numberOfClusters : int
        Number of clusters (and therefore centroids) to compute.

    Returns
    -------
    The fitted estimator's ``cluster_centers_`` (one row per cluster, one
    column per embedding dimension).
    """
    # Accept a full model (vectors live under ``.wv``) as well as bare keyed vectors.
    keyedVectors = getattr(model, "wv", model)

    # ``syn0`` is the legacy gensim name of the embedding matrix; newer
    # releases only expose ``vectors``. Prefer the new name, fall back to the old.
    word_vectors = getattr(keyedVectors, "vectors", None)
    if word_vectors is None:
        word_vectors = keyedVectors.syn0
    n_words, vec_size = word_vectors.shape[0], word_vectors.shape[1]
    print("Number of words = {0}, vector size = {1}".format(n_words, vec_size))

    # Cluster using KMeans
    start = time.time()
    print("Clustering ... ", end="", flush=True)
    try:
        kmeans = KMeans(n_clusters=numberOfClusters, n_jobs=-1, random_state=0)
    except TypeError:
        # scikit-learn >= 1.0 dropped the ``n_jobs`` argument of KMeans.
        kmeans = KMeans(n_clusters=numberOfClusters, random_state=0)
    # Only the centroids are needed, so fit() suffices (labels were never used).
    kmeans.fit(word_vectors)
    print("Finished clustering in {:.2f} sec.".format(time.time() - start), flush=True)

    # Return cluster centers
    return kmeans.cluster_centers_

Birch Clustering

In [119]:
def getClusterCentersWithBirch(model, numberOfClusters):
    """Cluster the model's word vectors with Birch and return the centroids.

    Parameters
    ----------
    model
        Object holding the embeddings: either a model that exposes them
        through ``model.wv`` or the keyed-vectors object itself.
    numberOfClusters : int
        Number of final clusters requested from Birch.

    Returns
    -------
    numpy.ndarray
        One row per cluster label that Birch actually assigned (ordered by
        label), each row being the mean of the word vectors in that cluster.
    """
    # Accept a full model (vectors live under ``.wv``) as well as bare keyed vectors.
    keyedVectors = getattr(model, "wv", model)

    # ``syn0`` is the legacy gensim name of the embedding matrix; newer
    # releases only expose ``vectors``. Prefer the new name, fall back to the old.
    word_vectors = getattr(keyedVectors, "vectors", None)
    if word_vectors is None:
        word_vectors = keyedVectors.syn0
    word_vectors = np.asarray(word_vectors)
    n_words, vec_size = word_vectors.shape[0], word_vectors.shape[1]
    print("Number of words = {0}, vector size = {1}".format(n_words, vec_size))

    # Cluster using Birch
    start = time.time()
    print("Clustering ... ", end="", flush=True)
    birch = Birch(n_clusters=numberOfClusters)
    labels = np.asarray(birch.fit_predict(word_vectors))
    print("Finished clustering in {:.2f} sec.".format(time.time() - start), flush=True)

    # Birch has no ``cluster_centers_`` attribute (it only exposes the
    # sub-cluster centroids of its CF tree), so the centre of each final
    # cluster is computed here as the mean of the vectors assigned to it.
    return np.array([
        word_vectors[labels == label].mean(axis=0)
        for label in np.unique(labels)
    ])

## Get the closest vector to each of the cluster centers
The number of clusters is passed as an argument, and for each cluster center we return the ten nearest entries of the embedding space. Increasing the number of clusters acts like a drill-down into the data.

The greater the number of cluster centers, the more detailed the returned results will be.

The number of clusters is 3 by default. This can be overridden if needed.

In [123]:
def getClosestWordEmbedding(modelPath, numberOfClusters = 3, topn = 10):
    """Load a saved model, cluster its embeddings and list the entries nearest each centroid.

    Parameters
    ----------
    modelPath : str
        Path handed to ``KeyedVectors.load``.
    numberOfClusters : int, optional
        Number of KMeans clusters to compute (default 3).
    topn : int, optional
        How many nearest entries to return per cluster center (default 10,
        which is also the library default of ``similar_by_vector``).

    Returns
    -------
    list
        One element per cluster center; each element is whatever
        ``similar_by_vector`` returns for that center (in the recorded
        notebook output: a list of ``(entry, similarity)`` pairs).
    """
    # Load the model
    start = time.time()
    model = KeyedVectors.load(modelPath)
    print("Finished loading model in {:.2f} sec.".format(time.time() - start), flush=True)

    clusterCenters = getClusterCentersWithKMeans(model, numberOfClusters)

    # Similarity queries belong to the keyed vectors; calling them on the
    # model object itself is deprecated in gensim 3.x and removed later.
    # ``load`` may also hand back bare keyed vectors, hence the fallback.
    keyedVectors = getattr(model, "wv", model)

    # Collect the nearest entries for each of the cluster centers
    return [
        keyedVectors.similar_by_vector(clusterCenter, topn=topn)
        for clusterCenter in clusterCenters
    ]

In [124]:
# Model at word2VecModelPath, with the default number of clusters (3).
getClosestWordEmbedding(word2VecModelPath)

Finished loading model in 0.14 sec.
Number of words = 18473, vector size = 100
Clustering ... Finished clustering in 3.60 sec.


[[('Pokémon Adventures: Diamond and Pearl/Platinum, Vol. 9',
   0.7363650798797607),
  ('July 09, 2000', 0.7359441518783569),
  ('Gorilla Adventure', 0.7359086275100708),
  ('June 04, 2001', 0.7354663610458374),
  ('Lion Adventure', 0.7346001863479614),
  ('Milda Harris', 0.7345806956291199),
  ('The Adventures of Danny Meadow Mouse', 0.7342305183410645),
  ('June 18, 2013', 0.7342035174369812),
  ('The Adventures of Buster Bear', 0.7334026098251343),
  ("Arthur's Computer Disaster: An Arthur Adventure", 0.7328624129295349)],
 [('April 30, 2008', 0.7729446887969971),
  ("Piratica: Being a Daring Tale of a Singular Girl's Adventure Upon the High Seas",
   0.7682402729988098),
  ('September 27, 2004', 0.7669798135757446),
  ('August 21, 2017', 0.7668962478637695),
  ('March 15, 2012', 0.7657756209373474),
  ('October 08, 2017', 0.7656553387641907),
  ('July 01, 1985', 0.765381932258606),
  ('Sybex Inc', 0.7651218175888062),
  ('November 09, 2010', 0.7644745707511902),
  ('April 19, 2003'

In [128]:
# Model at fastTextModelPath, drilling down to 5 clusters.
getClosestWordEmbedding(fastTextModelPath, 5)

Finished loading model in 0.36 sec.
Number of words = 18405, vector size = 100
Clustering ... Finished clustering in 3.61 sec.


[[('Ben Caldwell', 0.798562228679657),
  ('Carrie Hope Fletcher', 0.7798928022384644),
  ('Darryl Bailey', 0.779863715171814),
  ('Karen Chance', 0.7791794538497925),
  ('Apsley Cherry-Garrard', 0.7774773836135864),
  ('Dan Parry', 0.777237057685852),
  ('Forever Road (Peri Jean Mace Ghost Thriller)', 0.7764514088630676),
  ('Sophie Chen Keller', 0.7706818580627441),
  ('Alan Lathwell', 0.7691977024078369),
  ('Laurie Gwen Shapiro', 0.7670925855636597)],
 [('May 05, 2016', 0.9181225299835205),
  ('May 5, 2016', 0.9158812761306763),
  ('May 7, 2013', 0.9122775793075562),
  ('May 25, 2016', 0.9120886921882629),
  ('May 6, 2016', 0.9098081588745117),
  ('May 7, 2016', 0.9093211889266968),
  ('May 14, 2013', 0.9086767435073853),
  ('May 5, 2015', 0.9072896242141724),
  ('May 05, 2015', 0.9059444069862366),
  ('May 5, 2014', 0.9057948589324951)],
 [('The Coming of Conan the Cimmerian: The Original Adventures of the Greatest Sword and Sorcery Hero of All Time!',
   0.9045103192329407),
  ('T