# Part 1: Load Data

In [2]:
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
df = pd.read_csv('data.tsv', sep='\t', on_bad_lines = 'skip')

In [4]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,3653882,R3O9SGZBVQBV76,B00FALQ1ZC,937001370,"Invicta Women's 15150 ""Angel"" 18k Yellow Gold ...",Watches,5,0,0,N,Y,Five Stars,Absolutely love this watch! Get compliments al...,2015-08-31
1,US,14661224,RKH8BNC3L5DLF,B00D3RGO20,484010722,Kenneth Cole New York Women's KC4944 Automatic...,Watches,5,0,0,N,Y,I love thiswatch it keeps time wonderfully,I love this watch it keeps time wonderfully.,2015-08-31
2,US,27324930,R2HLE8WKZSU3NL,B00DKYC7TK,361166390,Ritche 22mm Black Stainless Steel Bracelet Wat...,Watches,2,1,1,N,Y,Two Stars,Scratches,2015-08-31
3,US,7211452,R31U3UH5AZ42LL,B000EQS1JW,958035625,Citizen Men's BM8180-03E Eco-Drive Stainless S...,Watches,5,0,0,N,Y,Five Stars,"It works well on me. However, I found cheaper ...",2015-08-31
4,US,12733322,R2SV659OUJ945Y,B00A6GFD7S,765328221,Orient ER27009B Men's Symphony Automatic Stain...,Watches,4,0,0,N,Y,"Beautiful face, but cheap sounding links",Beautiful watch face. The band looks nice all...,2015-08-31


In [5]:
df.dropna(subset=['review_body'],inplace=True)

In [6]:
df.reset_index(inplace=True, drop=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960045 entries, 0 to 960044
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        960045 non-null  object
 1   customer_id        960045 non-null  int64 
 2   review_id          960045 non-null  object
 3   product_id         960045 non-null  object
 4   product_parent     960045 non-null  int64 
 5   product_title      960043 non-null  object
 6   product_category   960045 non-null  object
 7   star_rating        960045 non-null  int64 
 8   helpful_votes      960045 non-null  int64 
 9   total_votes        960045 non-null  int64 
 10  vine               960045 non-null  object
 11  verified_purchase  960045 non-null  object
 12  review_headline    960032 non-null  object
 13  review_body        960045 non-null  object
 14  review_date        960041 non-null  object
dtypes: int64(5), object(10)
memory usage: 109.9+ MB


In [8]:
df.nunique()

marketplace               1
customer_id          719420
review_id            960045
product_id           149650
product_parent       141810
product_title        143429
product_category          1
star_rating               5
helpful_votes           284
total_votes             309
vine                      2
verified_purchase         2
review_headline      456674
review_body          873426
review_date            4173
dtype: int64

In [9]:
# use the first 10000 data for training
data = df.loc[:9999, 'review_body'].tolist()

# Part 2: Tokenizing and Stemming

In [10]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append("'s")
stopwords.append("'m")
stopwords.append("br")
stopwords.append("watch")

print ("We use " + str(len(stopwords)) + " stop-words from nltk library.")
print (stopwords[:10])

We use 183 stop-words from nltk library.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [11]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

def tokenization_and_stemming(text):
    tokens = []
    for word in nltk.word_tokenize(text):
        if word.lower() not in stopwords:
            tokens.append(word.lower())

    filtered_tokens = []

    for token in tokens:
        if token.isalpha():
            filtered_tokens.append(token)

    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [12]:
data[0]

'Absolutely love this watch! Get compliments almost every time I wear it. Dainty.'

In [13]:
tokenization_and_stemming(data[0])

['absolut',
 'love',
 'get',
 'compliment',
 'almost',
 'everi',
 'time',
 'wear',
 'dainti']

# Part 3: TF-IDF

TF: Term Frequency

IDF: Inverse Document Frequency

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_model = TfidfVectorizer(max_df = 0.99, max_features=1000,
                                 min_df = 0.01, stop_words='english',
                                 use_idf = True, tokenizer = tokenization_and_stemming, ngram_range=(1,1))

tfidf_matrix = tfidf_model.fit_transform(data)

print ("In total, there are " + str(tfidf_matrix.shape[0]) + \
      " reviews and " + str(tfidf_matrix.shape[1]) + " terms.")



In total, there are 10000 reviews and 224 terms.


In [15]:
tfidf_matrix

<10000x224 sparse matrix of type '<class 'numpy.float64'>'
	with 67788 stored elements in Compressed Sparse Row format>

In [16]:
tfidf_matrix.toarray()

array([[0.       , 0.5284014, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [17]:
data[0]

'Absolutely love this watch! Get compliments almost every time I wear it. Dainty.'

In [18]:
tfidf_matrix.toarray()[0]

array([0.        , 0.5284014 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.46828704, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.47140578, 0.        , 0.        , 0.        , 0.     

In [19]:
tfidf_matrix.todense()

matrix([[0.       , 0.5284014, 0.       , ..., 0.       , 0.       ,
         0.       ],
        [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
         0.       ],
        [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
         0.       ],
        ...,
        [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
         0.       ],
        [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
         0.       ],
        [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
         0.       ]])

In [20]:
tf_selected_words = tfidf_model.get_feature_names_out()

In [21]:
tf_selected_words

array(['abl', 'absolut', 'accur', 'actual', 'adjust', 'alarm', 'alreadi',
       'alway', 'amaz', 'amazon', 'anoth', 'appear', 'arriv', 'attract',
       'automat', 'awesom', 'bad', 'band', 'batteri', 'beauti', 'best',
       'better', 'big', 'bit', 'black', 'blue', 'bought', 'box',
       'bracelet', 'brand', 'broke', 'button', 'buy', 'ca', 'came',
       'case', 'casio', 'chang', 'cheap', 'clasp', 'classi', 'clear',
       'clock', 'color', 'come', 'comfort', 'compliment', 'cool', 'cost',
       'coupl', 'crystal', 'cute', 'dark', 'date', 'daughter', 'day',
       'deal', 'definit', 'design', 'dial', 'differ', 'difficult',
       'digit', 'disappoint', 'display', 'durabl', 'easi', 'easili',
       'eleg', 'end', 'everi', 'everyday', 'everyth', 'exact', 'excel',
       'expect', 'expens', 'face', 'far', 'fast', 'featur', 'feel',
       'fell', 'figur', 'fine', 'fit', 'function', 'gave', 'gift',
       'glass', 'goe', 'gold', 'good', 'got', 'great', 'hand', 'happi',
       'hard', 'hea

# Part 4: K-means clustering

In [22]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters = num_clusters)
km.fit(tfidf_matrix)

clusters = km.labels_.tolist()



## 4.1. Analyze K-means Result

In [23]:
product = { 'review': df[:10000].review_body, 'cluster': clusters}
frame = pd.DataFrame(product, columns = ['review', 'cluster'])

In [24]:
frame.head(10)

Unnamed: 0,review,cluster
0,Absolutely love this watch! Get compliments al...,1
1,I love this watch it keeps time wonderfully.,2
2,Scratches,1
3,"It works well on me. However, I found cheaper ...",1
4,Beautiful watch face. The band looks nice all...,1
5,"i love this watch for my purpose, about the pe...",1
6,"for my wife and she loved it, looks great and ...",4
7,I was about to buy this thinking it was a Swis...,1
8,Watch is perfect. Rugged with the metal &#34;B...,4
9,Great quality and build.<br />The motors are r...,1


In [25]:
print ("Number of reviews included in each cluster:")
frame['cluster'].value_counts().to_frame()

Number of reviews included in each cluster:


Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
1,6926
4,1087
2,784
0,648
3,555


In [26]:
km.cluster_centers_

array([[0.        , 0.        , 0.00167136, ..., 0.00316623, 0.00438366,
        0.00162609],
       [0.00547366, 0.00573539, 0.00646151, ..., 0.00927103, 0.02484534,
        0.01655824],
       [0.00099507, 0.02137184, 0.        , ..., 0.00079672, 0.00572715,
        0.00529424],
       [0.        , 0.        , 0.00203509, ..., 0.00226703, 0.00753794,
        0.00093622],
       [0.00142724, 0.00391687, 0.00530245, ..., 0.00555955, 0.00885511,
        0.01276229]])

In [27]:
km.cluster_centers_.shape

(5, 224)

In [28]:
print ("<Document clustering result by K-means>")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

Cluster_keywords_summary = {}
for i in range(num_clusters):
    print ("Cluster " + str(i) + " words:", end='')
    Cluster_keywords_summary[i] = []
    for ind in order_centroids[i, :6]:
        Cluster_keywords_summary[i].append(tf_selected_words[ind])
        print (tf_selected_words[ind] + ",", end='')
    print ()

    cluster_reviews = frame[frame.cluster==i].review.tolist()

    print ("Cluster " + str(i) + " reviews (" + str(len(cluster_reviews)) + " reviews): ")
    print (", ".join(cluster_reviews[:3]))
    print ()

<Document clustering result by K-means>
Cluster 0 words:good,product,look,qualiti,price,recommend,
Cluster 0 reviews (648 reviews): 
very good, Beyond my expectation..excellent product..good quality, well built, nicely done..:), It's a good value, and a good functional watch strap.  It's super wide though, and takes more space on the wrist than I'd like.

Cluster 1 words:look,like,time,band,work,beauti,
Cluster 1 reviews (6926 reviews): 
Absolutely love this watch! Get compliments almost every time I wear it. Dainty., Scratches, It works well on me. However, I found cheaper prices in other places after making the purchase

Cluster 2 words:love,gift,husband,beauti,wife,bought,
Cluster 2 reviews (784 reviews): 
I love this watch it keeps time wonderfully., Love this watch, I just received it yesterday it looks really nice on my  wrist, my friends and family love it., Grand Kids loved this

Cluster 3 words:nice,look,price,love,realli,like,
Cluster 3 reviews (555 reviews): 
Nice watch, on 

# Part 5: Topic Modeling by LDA

In [29]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 5)

In [30]:
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape)
print(lda_output)

(10000, 5)
[[0.76156889 0.0600195  0.05931358 0.05955433 0.05954369]
 [0.65968717 0.08728972 0.0835316  0.08528192 0.08420958]
 [0.10000132 0.59999514 0.1000015  0.10000142 0.10000061]
 ...
 [0.04342913 0.04472167 0.04344157 0.04410923 0.8242984 ]
 [0.35940305 0.05793742 0.05584966 0.06336997 0.46343991]
 [0.07401228 0.07579229 0.45272946 0.07472089 0.32274508]]


In [31]:
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

(5, 224)
[[  0.20242032  57.84847137   0.20098646 ...  38.3858435    0.2071309
    9.14859186]
 [ 29.69121018   0.4302632   34.77051711 ...  28.78922941  28.82172187
  119.00755712]
 [  0.201325     0.20119154   1.03873709 ...   0.20269124   0.2011039
    0.20148461]
 [  2.17965829   0.20190852   9.53123283 ...   0.2017893    0.20112596
    0.2823354 ]
 [  8.96748905   3.05464412   8.18720543 ...   7.60940853 164.78752914
    6.63901089]]


In [32]:
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

doc_names = ["Doc" + str(i) for i in range(len(data))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,topic
Doc0,0.76,0.06,0.06,0.06,0.06,0
Doc1,0.66,0.09,0.08,0.09,0.08,0
Doc2,0.1,0.6,0.1,0.1,0.1,1
Doc3,0.06,0.74,0.07,0.07,0.06,1
Doc4,0.04,0.43,0.04,0.04,0.45,4
Doc5,0.61,0.07,0.07,0.16,0.07,0
Doc6,0.35,0.06,0.06,0.45,0.06,3
Doc7,0.06,0.75,0.06,0.06,0.06,1
Doc8,0.13,0.05,0.05,0.05,0.73,4
Doc9,0.06,0.75,0.06,0.07,0.06,1


In [33]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
1,3447
4,2501
0,1552
2,1264
3,1236


In [34]:
print(lda.components_)
df_topic_words = pd.DataFrame(lda.components_)

df_topic_words.columns = tfidf_model.get_feature_names_out()
df_topic_words.index = topic_names

df_topic_words.head()

[[  0.20242032  57.84847137   0.20098646 ...  38.3858435    0.2071309
    9.14859186]
 [ 29.69121018   0.4302632   34.77051711 ...  28.78922941  28.82172187
  119.00755712]
 [  0.201325     0.20119154   1.03873709 ...   0.20269124   0.2011039
    0.20148461]
 [  2.17965829   0.20190852   9.53123283 ...   0.2017893    0.20112596
    0.2823354 ]
 [  8.96748905   3.05464412   8.18720543 ...   7.60940853 164.78752914
    6.63901089]]


Unnamed: 0,abl,absolut,accur,actual,adjust,alarm,alreadi,alway,amaz,amazon,...,week,weight,white,wife,wish,work,worn,worth,wrist,year
Topic0,0.20242,57.848471,0.200986,3.503585,0.200913,0.200736,14.324797,0.202461,0.201215,7.115829,...,8.299288,0.202207,0.20103,61.34463,0.201482,2.332036,7.793436,38.385844,0.207131,9.148592
Topic1,29.69121,0.430263,34.770517,19.733022,24.479337,41.261404,26.520324,23.545364,0.623094,48.254492,...,92.206772,4.78348,21.765971,0.253633,37.433861,235.234705,22.752155,28.789229,28.821722,119.007557
Topic2,0.201325,0.201192,1.038737,0.201863,0.200592,0.200255,0.200664,0.20133,0.201159,1.571809,...,0.202068,0.387324,0.200634,0.20121,0.20178,18.683651,0.201815,0.202691,0.201104,0.201485
Topic3,2.179658,0.201909,9.531233,2.61642,0.201114,0.332921,0.20117,0.201833,85.938494,2.563182,...,0.201795,0.200793,2.952472,0.200771,0.202916,53.81369,0.202233,0.201789,0.201126,0.282335
Topic4,8.967489,3.054644,8.187205,22.113884,41.798503,0.237408,0.202032,28.430748,0.201778,9.2879,...,3.123017,45.597333,13.723333,8.461869,12.430889,45.92296,8.750177,7.609409,164.787529,6.639011


In [35]:
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names_out())
    topic_words = []
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))
    return topic_words

topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda, n_words=15)

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,love,gift,thank,compliment,husband,bought,got,wear,wife,absolut,pretti,lot,son,nice,daughter
Topic 1,nice,work,time,use,batteri,look,day,year,like,band,set,watch,week,light,wear
Topic 2,good,excel,product,qualiti,awesom,like,recommend,price,fast,ok,nice,look,eleg,item,ship
Topic 3,great,beauti,expect,look,price,amaz,love,cute,exact,arriv,product,pictur,work,classi,stylish
Topic 4,band,perfect,look,wrist,fit,small,big,size,nice,like,easi,face,cheap,strap,cool
