# Exploring representative tuples by clustering the embedding space

In [1]:
import warnings
warnings.filterwarnings('ignore')

import time
from sklearn.cluster import KMeans, Birch
from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd

import numpy as np
import h5py

## Perform clustering and return the cluster centers

In [2]:
# Saved embedding model files, given as paths relative to the working directory.
word2VecModelPath = "amazonModelWord2Vec.w2v"
fastTextModelPath = "amazonModelFastText.w2v"

Clustering with KMeans

In [3]:
def getClusterCentersWithKMeans(model, numberOfClusters):
    """Cluster the model's word vectors with KMeans and return the centroids.

    Parameters
    ----------
    model : object
        Either a loaded model that exposes its keyed vectors as ``model.wv``
        or a bare keyed-vectors object. The embedding matrix is read from the
        ``vectors`` attribute, falling back to the legacy ``syn0`` name.
    numberOfClusters : int
        Number of clusters to fit.

    Returns
    -------
    numpy.ndarray
        The fitted ``cluster_centers_`` array (one row per cluster).
    """
    # Accept both a full model (vectors live under .wv) and bare keyed vectors.
    keyed_vectors = getattr(model, "wv", model)

    # `syn0` is the deprecated alias of `vectors`; prefer the current name but
    # keep working with older gensim releases that only provide `syn0`.
    word_vectors = getattr(keyed_vectors, "vectors", None)
    if word_vectors is None:
        word_vectors = keyed_vectors.syn0

    n_words, vec_size = word_vectors.shape
    print("Number of words = {0}, vector size = {1}".format(n_words, vec_size))

    # Cluster using KMeans
    start = time.time()
    print("Clustering ... ", end="", flush=True)
    # `n_jobs` is intentionally not passed: scikit-learn deprecated it in 0.23
    # and removed it in 1.0, where passing it raises TypeError.
    # random_state is fixed so repeated runs give the same centers.
    kmeans = KMeans(n_clusters=numberOfClusters, random_state=0)
    # Only the centers are needed, so fit() replaces fit_predict() and the
    # per-word labels are not materialised.
    kmeans.fit(word_vectors)
    print("Finished clustering in {:.2f} sec.".format(time.time() - start), flush=True)

    # Return cluster centers
    return kmeans.cluster_centers_

## Get the closest vector to each of the cluster centers
We'll pass the number of cluster centers as an argument. This can be thought of as the equivalent of a drill-down: the greater the number of cluster centers, the more detailed the returned results will be.

The number of clusters is 3 by default. This can be overridden if needed.

In [4]:
def getClosestWordEmbedding(modelPath, numberOfClusters = 3):
    """Load a saved model, cluster its embeddings and list the words nearest each center.

    Parameters
    ----------
    modelPath : str
        Path of the saved model, passed to ``KeyedVectors.load``.
    numberOfClusters : int, optional
        Number of KMeans clusters to fit (default 3).

    Returns
    -------
    list
        One entry per cluster center, in the order the centers are returned by
        ``getClusterCentersWithKMeans``. Each entry is the result of
        ``similar_by_vector`` for that center.
    """
    # Load the model
    # NOTE(review): gensim's loader unpickles the file, so only point this at
    # model files from a trusted source.
    start = time.time()
    model = KeyedVectors.load(modelPath)
    print("Finished loading model in {:.2f} sec.".format(time.time() - start), flush=True)

    clusterCenters = getClusterCentersWithKMeans(model, numberOfClusters)

    # Query through the keyed vectors: the model-level similar_by_vector is a
    # deprecated delegate, and a bare keyed-vectors object has no `.wv`.
    keyed_vectors = getattr(model, "wv", model)

    # Get the closest words for each of the cluster centers
    return [keyed_vectors.similar_by_vector(clusterCenter) for clusterCenter in clusterCenters]

In [5]:
# Model at word2VecModelPath, using the default number of clusters (3).
getClosestWordEmbedding(word2VecModelPath)

Finished loading model in 0.20 sec.
Number of words = 18473, vector size = 100
Clustering ... Finished clustering in 3.36 sec.


[[('Pokémon Adventures: Diamond and Pearl/Platinum, Vol. 9',
   0.7363650798797607),
  ('July 09, 2000', 0.7359441518783569),
  ('Gorilla Adventure', 0.7359086275100708),
  ('June 04, 2001', 0.7354663610458374),
  ('Lion Adventure', 0.7346001863479614),
  ('Milda Harris', 0.7345806956291199),
  ('The Adventures of Danny Meadow Mouse', 0.7342305183410645),
  ('June 18, 2013', 0.7342035174369812),
  ('The Adventures of Buster Bear', 0.7334026098251343),
  ("Arthur's Computer Disaster: An Arthur Adventure", 0.7328624129295349)],
 [('April 30, 2008', 0.7729446887969971),
  ("Piratica: Being a Daring Tale of a Singular Girl's Adventure Upon the High Seas",
   0.7682402729988098),
  ('September 27, 2004', 0.7669798135757446),
  ('August 21, 2017', 0.7668962478637695),
  ('March 15, 2012', 0.7657756209373474),
  ('October 08, 2017', 0.7656553387641907),
  ('July 01, 1985', 0.765381932258606),
  ('Sybex Inc', 0.7651218175888062),
  ('November 09, 2010', 0.7644745707511902),
  ('April 19, 2003'

In [6]:
# Model at fastTextModelPath, drilling down to 100 clusters.
getClosestWordEmbedding(fastTextModelPath, 100)

Finished loading model in 0.95 sec.
Number of words = 18405, vector size = 100
Clustering ... Finished clustering in 9.23 sec.


[[('February 21, 1984', 0.987946629524231),
  ('February 02, 1999', 0.9831345081329346),
  ('February 02, 1995', 0.9826117753982544),
  ('February 3, 1997', 0.982056200504303),
  ('February 04, 1992', 0.9809597730636597),
  ('February 18, 1998', 0.9805912971496582),
  ('February 2, 1998', 0.9801206588745117),
  ('February 18, 1980', 0.9789025187492371),
  ('February 06, 1986', 0.9783583879470825),
  ('February 18, 1995', 0.9781572818756104)],
 [('The Bear and the Serpent (Echoes of the Fall)', 0.9501677751541138),
  ('On the Edge of the Dark Sea of Darkness (The Wingfeather Saga)',
   0.9456765651702881),
  ('The Hobbit and the Lord of the Rings (the Hobbit / the Fellowship of the Ring / the Two Towers / the',
   0.9452135562896729),
  ('Inherit the Wind: The Powerful Drama of the Greatest Courtroom Clash of the Century',
   0.941615641117096),
  ('Dark Heresy: The Church of the Damned: Roleplaying in the Grim Darkness of the 41st Millennium',
   0.9405147433280945),
  ('The Return of 