# K-Medoids Clustering on Amazon Food Reviews

# OBJECTIVE: To perform K-Medoids clustering on Amazon Food reviews data and observe the different clusters obtained

In [1]:

#=====================Importing the required libraries=========================#

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer


In [2]:
# Importing the required sqlite version of the Amazon Food Reviews dataset

# Establishing a connection with the database
con = sqlite3.connect('database.sqlite')

# Storing the data in a dataframe acquired from the connection

reviews_data = pd.read_sql_query("""SELECT * FROM Reviews""", con)

In [3]:
# Checking the first few rows of the data to ensure we have the right data

reviews_data.head(10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [4]:
# Checking the last few rows of the data to ensure we have the right data

reviews_data.tail(10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
568444,568445,B001EO7N10,A2SD7TY3IOX69B,"BayBay ""BayBay Knows Best""",3,3,5,1245369600,Best Value for Chinese 5 Spice,"As a foodie, I use a lot of Chinese 5 Spice po..."
568445,568446,B001EO7N10,A2E5C8TTAED4CQ,S. Linkletter,2,2,5,1268006400,Five Spice Powder,"You can make this mix yourself, but the Star A..."
568446,568447,B001EO7N10,A2P9W8T7NTLG2Z,Andy,0,0,2,1328918400,Mixed wrong,I had ordered some of these a few months back ...
568447,568448,B001EO7N10,APWCOAVILK94B,"Real Named Person ""wowzee""",0,0,5,1322524800,"If its all natural, this is like panacea of Sp...","Hoping there is no MSG in this, this tastes ex..."
568448,568449,B001EO7N10,A1F6BHEYB7R6R7,James Braley,0,0,5,1308096000,Very large ground spice jars.,My only complaint is that there's so much of i...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...
568453,568454,B001LR2CU2,A3LGQPJCZVL9UC,srfell17,0,0,5,1338422400,Great Honey,"I am very satisfied ,product is as advertised,..."


In [5]:
y = reviews_data[['Score']]

In [6]:
'''
For further processing and obtaining the number of clusters required we shall require only the text data, therefore we 
will extract only the text variable from the dataset.
'''

text_data = reviews_data[['Text']]

In [7]:
# Taking a look at the first few rows of the data to ensure that we have the right data

text_data.head(10)

# We can say that we have the right data

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...
5,I got a wild hair for taffy and ordered this f...
6,This saltwater taffy had great flavors and was...
7,This taffy is so good. It is very soft and ch...
8,Right now I'm mostly just sprouting this so my...
9,This is a very healthy dog food. Good for thei...


In [8]:
# Let us look at the shape of the data

text_data.shape

# The data has 568454 rows and 1 column

(568454, 1)

In [9]:
# Extracting the vaues of text_data in X1 to avoid unhashable slice error

X1 = text_data.iloc[:,:].values

In [10]:
# As the time complexity for hierarchical clustering is large, we will extract only 4000 reviews and perform clustering

import random

n = 568454
m = 4000
p = m/n

sample_reviews = [];

for i in range(0,n):
    if random.random() <= p:
        sample_reviews.append(X1[i,:])

In [11]:
text_reviews = pd.DataFrame(sample_reviews)

In [12]:
# Assigning the name to the columns

text_reviews.columns = ['Reviews']

In [13]:
# Checking the first few rows of the dataframe

text_reviews.head()

Unnamed: 0,Reviews
0,This product serves me well as a source of ele...
1,This cat food was recommended by my vet becaus...
2,If you're looking for an energy boost without ...
3,"""These are delicious! The chocolate is excelle..."
4,After looking at the pictures someone put on h...


In [14]:
# Now we will perform the data cleaning and transformation 

# We can see that there are unknown elements like html tags in the data. We need to remove those

# find sentences containing HTML tags

import re

i=0;
for sent in text_reviews['Reviews'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1; 
    
# We can see that there is data consisting of html tags, we need to remove these tags

0
This product serves me well as a source of electrolytes during and after a long run or bike ride.<br />I have tried all of the flavors but really do like the grapefruit flavor... no after-taste and I actually like the slight carbonation.<br />I use other Hammer products and really like their whole product line.


In [15]:
'''
We will perform the data cleaning steps on the text data.For that we will perform word stemmatization and 
cleaning html and punctuation marks.
'''

#import re
'''import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer'''

nltk.download('stopwords')

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kulkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Applying the stopwords removal, html removal, punctuation marks removal on the text_data dataframe

#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.
i=0
str1=' '
final_string=[]
#all_positive_words=[] # store words from +ve reviews here
#all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in text_reviews['Reviews'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    #if (final['Score'].values)[i] == 'positive': 
                    #    all_positive_words.append(s) #list of all words used to describe positive reviews
                    #if(final['Score'].values)[i] == 'negative':
                    #    all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [17]:
# Assigning the 'final_string' array to the 'text_data' dataframe

text_reviews['Cleaned_Text'] = final_string

In [18]:
# Checking whether the cleaned text has been assigned to the text_data dataframe

text_reviews.head()

# We can see that the data has been assigned to the dataframe

Unnamed: 0,Reviews,Cleaned_Text
0,This product serves me well as a source of ele...,b'product serv well sourc electrolyt long run ...
1,This cat food was recommended by my vet becaus...,b'cat food recommend vet year old cleo cat tro...
2,If you're looking for an energy boost without ...,b'your look energi boost without artifici swee...
3,"""These are delicious! The chocolate is excelle...",b'delici chocol excel espresso bean perfect ro...
4,After looking at the pictures someone put on h...,b'look pictur someon put show crush box write ...


In [19]:
# Checking the last few rows of the dataframe

text_reviews.tail()

# We can say that we have the right data after cleaning the text

Unnamed: 0,Reviews,Cleaned_Text
3982,"Laura Scudder's Organic Smooth Peanut Butter, ...",b'laura scudder organ smooth peanut butter buy...
3983,My cats haven't been this excited to eat since...,b'cat havent excit eat sinc time fed day fresh...
3984,I'd never heard of this brand but they were ve...,b'never heard brand cheap amazon extrem surpri...
3985,"Everyone has their own preferences, I know -- ...",b'everyon prefer know actual like flavor wife ...
3986,The salt liocorice was very good. It came nic...,b'salt liocoric good came nice moist even thre...


In [20]:
# Let us once check the dimensions of the dataframe just to be sure

text_reviews.shape

(3987, 2)

In [21]:
# We will only take the cleaned version of the text and make a new dataframe

final_text = text_reviews[['Cleaned_Text']].copy()

In [22]:
# Checking the first few rows of the data

final_text.head()

Unnamed: 0,Cleaned_Text
0,b'product serv well sourc electrolyt long run ...
1,b'cat food recommend vet year old cleo cat tro...
2,b'your look energi boost without artifici swee...
3,b'delici chocol excel espresso bean perfect ro...
4,b'look pictur someon put show crush box write ...


In [23]:
final_text.shape

(3987, 1)

# Creating different models of text representation and applying hierarchical clustering

# Creating a BoW representation of the reviews data

In [25]:
#BoW
count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(final_text['Cleaned_Text'].values).todense()

In [26]:
# final_counts is in the form of matrix, we will convert it to an array

#A = np.squeeze(np.asarray(M))

final_counts_array = np.squeeze(np.asarray(final_counts))


In [27]:
# Checking the type of final_counts_array to be sure

print(type(final_counts_array))

# We can see that it is a n-dimensional numpy array.

<class 'numpy.ndarray'>


In [28]:
import pyclust

In [29]:
from pyclust import KMedoids

In [32]:
clusters =KMedoids(n_clusters=2)
clusters.fit(D)


# Applying K-Medoids Clustering to BoW representation of reviews data


# Although the data has been divided into 5 clusters, for our covenience we have divided the data into positive and negative classes. We will use k-medoids to divide the data into 2 clusters and observe whether right clusters have been created.

# Considering 2 as the number of clusters

In [61]:
# import pyclust library for K-Medoids clustering
import pyclust
from pyclust import KMedoids

# Creating 2 as the number of clusters 
clustersbow_2 = KMedoids(n_clusters=2)

# Fitting the number of clusters to our bow representation of the text data
clustersbow_2.fit(final_counts_array)

In [63]:
# Extracting the cluster labels of the created model

clustersbow_2.labels_

# Assigning the cluster labels to our data

text_reviews['Clustersbow2'] = clustersbow_2.labels_

# Reading the top 10 reviews

In [64]:
# We will take a look at the first few rows of the data frame

text_reviews.head(10)

Unnamed: 0,Reviews,Cleaned_Text,Clustersbow2
0,This product serves me well as a source of ele...,b'product serv well sourc electrolyt long run ...,0
1,This cat food was recommended by my vet becaus...,b'cat food recommend vet year old cleo cat tro...,0
2,If you're looking for an energy boost without ...,b'your look energi boost without artifici swee...,0
3,"""These are delicious! The chocolate is excelle...",b'delici chocol excel espresso bean perfect ro...,0
4,After looking at the pictures someone put on h...,b'look pictur someon put show crush box write ...,0
5,If it were possible to give this product zero ...,b'possibl give product zero star would done hu...,0
6,To be fair only one of my twins got gas from t...,b'fair one twin got gas horribl night scream g...,0
7,This is my favorite hot sauce. I buy it locall...,b'favorit hot sauc buy local love flavor hope ...,0
8,My cats love this food. For the money it is a ...,b'cat love food money great product get everi ...,0
9,For christmas I ordered some coupons from my d...,b'christma order coupon daughter school ohama ...,0


In [65]:
# Let us check the unique values of the clustersbow2 column

text_reviews['Clustersbow2'].unique()

array([0, 1], dtype=int64)

In [66]:
# Reading a few rows of the cluster label 0

text_reviews.loc[text_reviews['Clustersbow2'] == 0]

Unnamed: 0,Reviews,Cleaned_Text,Clustersbow2
0,This product serves me well as a source of ele...,b'product serv well sourc electrolyt long run ...,0
1,This cat food was recommended by my vet becaus...,b'cat food recommend vet year old cleo cat tro...,0
2,If you're looking for an energy boost without ...,b'your look energi boost without artifici swee...,0
3,"""These are delicious! The chocolate is excelle...",b'delici chocol excel espresso bean perfect ro...,0
4,After looking at the pictures someone put on h...,b'look pictur someon put show crush box write ...,0
5,If it were possible to give this product zero ...,b'possibl give product zero star would done hu...,0
6,To be fair only one of my twins got gas from t...,b'fair one twin got gas horribl night scream g...,0
7,This is my favorite hot sauce. I buy it locall...,b'favorit hot sauc buy local love flavor hope ...,0
8,My cats love this food. For the money it is a ...,b'cat love food money great product get everi ...,0
9,For christmas I ordered some coupons from my d...,b'christma order coupon daughter school ohama ...,0


# Conslusion from cluster 0

# Reading the reviews with cluster label 1

In [67]:
text_reviews.loc[text_reviews['Clustersbow2'] == 1]

Unnamed: 0,Reviews,Cleaned_Text,Clustersbow2
938,Diamond Almonds<br />Almonds are a good source...,b'diamond almond almond good sourc magnesium o...,1


# Conclusion from cluster label 1

In [68]:
# Let us look at how many unique values have been assigned to each cluster

text_reviews['Clustersbow2'].value_counts()

# We can see that the number of clusters into which the data has been divided is quite unbalanced, we can try increasing the number of clusters

0    3986
1       1
Name: Clustersbow2, dtype: int64

# Creating a TF-IDF representation of the reviews data

In [51]:
tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(final_text['Cleaned_Text'].values).todense()

In [30]:
# Checking the data-type of final_tf_idf

print(type(final_tf_idf))

# it is a matrix

<class 'numpy.matrixlib.defmatrix.matrix'>


In [31]:
# Converting the matrix to an array

final_tfidf_array = np.squeeze(np.asarray(final_tf_idf))


In [32]:
# Checking the datatype of the array

print(type(final_tfidf_array))

<class 'numpy.ndarray'>


# Applying K-Medoids to TF-IDF representation of reviews data

# Considering 2 clusters

In [60]:
# First we will consider only 2 clusters
# Creating 2 as the number of clusters 
clusterstfidf_2 = KMedoids(n_clusters=2)

# Fitting the number of clusters to our bow representation of the text data
clusterstfidf_2.fit(final_tfidf_array)

In [69]:
# Extracting the cluster labels of the created model

clusterstfidf_2.labels_

# Assigning the cluster labels to our data

text_reviews['Clusterstfidf2'] = clusterstfidf_2.labels_

In [70]:
# Reading a few rows of the cluster label 0

text_reviews.loc[text_reviews['Clusterstfidf2'] == 0]

Unnamed: 0,Reviews,Cleaned_Text,Clustersbow2,Clusterstfidf2
0,This product serves me well as a source of ele...,b'product serv well sourc electrolyt long run ...,0,0
1,This cat food was recommended by my vet becaus...,b'cat food recommend vet year old cleo cat tro...,0,0
2,If you're looking for an energy boost without ...,b'your look energi boost without artifici swee...,0,0
3,"""These are delicious! The chocolate is excelle...",b'delici chocol excel espresso bean perfect ro...,0,0
4,After looking at the pictures someone put on h...,b'look pictur someon put show crush box write ...,0,0
5,If it were possible to give this product zero ...,b'possibl give product zero star would done hu...,0,0
6,To be fair only one of my twins got gas from t...,b'fair one twin got gas horribl night scream g...,0,0
7,This is my favorite hot sauce. I buy it locall...,b'favorit hot sauc buy local love flavor hope ...,0,0
8,My cats love this food. For the money it is a ...,b'cat love food money great product get everi ...,0,0
9,For christmas I ordered some coupons from my d...,b'christma order coupon daughter school ohama ...,0,0


In [71]:
# Reading a few rows of the cluster label 1

text_reviews.loc[text_reviews['Clusterstfidf2'] == 1]

Unnamed: 0,Reviews,Cleaned_Text,Clustersbow2,Clusterstfidf2
33,A dark coffee that has no bitterness and is sm...,b'dark coffe bitter smooth silk alway use qual...,0,1
111,We just recently joined the Keurig Craze. We'...,b'recent join keurig craze weve tri ton differ...,0,1
276,I didn't get to each much of the bar I receive...,b'didnt get much bar receiv husband purloin sa...,0,1
338,"I like Cerelac product, they tast good and hav...",b'like cerelac product tast good lot varieti d...,0,1
377,"It's cheap, it tastes fine (not really flavour...",b'cheap tast fine realli flavour bad help lose...,0,1
403,"I have been looking all over, Amazon included,...",b'look amazon includ nice tea passion fruit ta...,0,1
420,This is the best orange spice tea I have ever ...,b'best orang spice tea ever lot flavor',0,1
468,"Compared to other bagged teas, this is delicio...",b'compar bag tea delici high aromat blend chai...,0,1
476,Lipton To Go Iced Tea crystals are a convenien...,b'lipton ice tea crystal conveni way enjoy ice...,0,1
549,Nigella in Arabic are the Seed of Blessing. T...,b'nigella arab seed bless flavor deep savori l...,0,1


In [72]:
# Let us look at how many unique values have been assigned to each cluster

text_reviews['Clusterstfidf2'].value_counts()

# We can see that the number of clusters into which the data has been divided is quite unbalanced, we can try increasing the number of clusters

0    3937
1      50
Name: Clusterstfidf2, dtype: int64

# Creating AvgW2Vec representation of the reviews data

In [33]:
# Importing the required models for the project

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle



In [34]:
X2 = text_data.iloc[:,:].values

In [35]:
n = 568454
m = 4000
p = m/n

text_data_sample = [];

for i in range(0,n):
    if random.random() <= p:
        text_data_sample.append(X2[i,:])

In [36]:
text_sample = pd.DataFrame(text_data_sample)

In [37]:
text_sample.head()

Unnamed: 0,0
0,I don't know if it's the cactus or the tequila...
1,I order these olives a lot. They are un-like a...
2,This is the first time I've really been misled...
3,They are good but need more cheese and wish th...
4,Eat at your own risk. Once I would open a bag...


In [39]:
text_sample.columns = ['Text']

In [40]:
text_sample.head(5)

Unnamed: 0,Text
0,I don't know if it's the cactus or the tequila...
1,I order these olives a lot. They are un-like a...
2,This is the first time I've really been misled...
3,They are good but need more cheese and wish th...
4,Eat at your own risk. Once I would open a bag...


In [41]:
import gensim
i=0
list_of_sent=[]
for sent in text_sample['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):    
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue 
    list_of_sent.append(filtered_sentence)
    

In [42]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=3)

In [43]:
# average Word2Vec
# compute average word2vec for each review.
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

3986
50


In [44]:
print(type(sent_vectors))

<class 'list'>


In [47]:
sen_vectors_array= np.asarray(sent_vectors)

# Applying K-Medoids Clustering on the Avg W2Vec representation of the data

# Considering 2 clusters

In [59]:
import pyclust
from pyclust import KMedoids

# Creating 2 as the number of clusters 
clustersavgw2vec_2 = KMedoids(n_clusters=2)

# Fitting the number of clusters to our bow representation of the text data
clustersavgw2vec_2.fit(sen_vectors_array)

In [62]:
text_sample['Clusters_AvgW2Vec'] = clustersavgw2vec_2.labels_

# Creating Tfidf weighted W2Vec representation of the text reviews

In [52]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tf_idf = final_tf_idf[row, tfidf_feat.index(word)]
            #tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
        except:
            pass
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1



In [53]:
# Checking for any 'NaN' values in the data

np.isnan(tfidf_sent_vectors).any()

# We can see that there are na values in the data

True

In [54]:
# We will replace all the 'Nan' values w/ mean of the respective columns


from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
data_tfidf = imp.fit_transform(tfidf_sent_vectors) # Assigning the imputed matrix w/o Nan values to data variable

In [55]:
# Checking the type of 'data_tfidf' 

type(data_tfidf)

# It is an array

numpy.ndarray

In [69]:
# Checking if our imputation has succeded

np.isnan(data_tfidf).any()

# We can see that there are no na values any more.

False

# Applying K-Medoids Clustering on the Tf-Idf weighted W2Vec representation of the data

# Considering 2 clusters

In [58]:
import pyclust
from pyclust import KMedoids

# Creating 2 as the number of clusters 
clusterstfidfw2vec_2 = KMedoids(n_clusters=2)

# Fitting the number of clusters to our bow representation of the text data
clusterstfidfw2vec_2.fit(data_tfidf)

ValueError: attempt to get argmin of an empty sequence