## Part 1: Load Data

In [1]:
import numpy as np
import pandas as pd
import nltk
# import gensim
import gzip
import json

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/zheng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load data into dataframe

def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Location of the gzip file; passed straight to ``gzip.open``.
            Every line of the file is expected to be one JSON document.

    Yields:
        The result of ``json.loads`` for each line, in file order.

    The file is opened inside a ``with`` block so the handle is released as
    soon as the generator is exhausted or closed, or when ``json.loads``
    raises on a malformed line, instead of staying open for the life of the
    kernel.
    """
    with gzip.open(path, 'rb') as gz_file:
        for line in gz_file:
            yield json.loads(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read; that number
    becomes the row label and each record's keys become the columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

# NOTE(review): absolute, machine-specific path — adjust it to wherever the dataset lives before re-running.
DATA_PATH = '/Users/zheng/Desktop/dataset/Software.json.gz'
df = getDF(DATA_PATH)

In [3]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"03 11, 2014",A240ORQ2LF9LUI,77613252,{'Format:': ' Loose Leaf'},Michelle W,The materials arrived early and were in excell...,Material Great,1394496000,,
1,4.0,True,"02 23, 2014",A1YCCU0YRLS0FE,77613252,{'Format:': ' Loose Leaf'},Rosalind White Ames,I am really enjoying this book with the worksh...,Health,1393113600,,
2,1.0,True,"02 17, 2014",A1BJHRQDYVAY2J,77613252,{'Format:': ' Loose Leaf'},Allan R. Baker,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",ARE YOU KIDING ME?,1392595200,7.0,
3,3.0,True,"02 17, 2014",APRDVZ6QBIQXT,77613252,{'Format:': ' Loose Leaf'},Lucy,This book was missing pages!!! Important pages...,missing pages!!,1392595200,3.0,
4,5.0,False,"10 14, 2013",A2JZTTBSLS1QXV,77775473,,Albert V.,I have used LearnSmart and can officially say ...,Best study product out there!,1381708800,,


In [4]:
# check missing value
df.isnull().sum()

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             225035
reviewerName          24
reviewText            66
summary               56
unixReviewTime         0
vote              331583
image             457928
dtype: int64

In [5]:
# drop the rows that have no review text (rebinding rather than mutating in place)
df = df.dropna(subset=['reviewText'])

In [6]:
# renumber rows 0..n-1 so labels are contiguous again after rows were dropped
df = df.reset_index(drop=True)

In [7]:
# the first 20,000 reviews (by position) are the corpus used in every later step
data = df['reviewText'].iloc[:20000].tolist()

## Part 2: Tokenizing and Stemming

In [8]:
# start from nltk's English stop-word list ...
stopwords = nltk.corpus.stopwords.words('english')
# ... and add tokenizer fragments ("'s", "'m", "n") plus the word "software"
stopwords.extend(["'s", "'m", "n", "software"])

print ("We use " + str(len(stopwords)) + " stop-words from nltk library.")
print (stopwords[:10])

We use 183 stop-words from nltk library.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [9]:
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer 

stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    """Convert raw text into a list of stemmed word tokens.

    Each token from ``nltk.word_tokenize(text)`` is lower-cased and kept only
    when both conditions hold:

    * it is not in the module-level ``stopwords`` list, and
    * ``str.isalpha()`` is true, i.e. it consists solely of letters, so
      numbers, punctuation and mixed tokens such as "'ve" are dropped.

    Surviving tokens are stemmed with the module-level ``stemmer`` and
    returned in their original order.
    """
    lowered_tokens = (word.lower() for word in nltk.word_tokenize(text))
    return [
        stemmer.stem(token)
        for token in lowered_tokens
        if token not in stopwords and token.isalpha()
    ]

In [10]:
tokenization_and_stemming(data[0])

['materi',
 'arriv',
 'earli',
 'excel',
 'condit',
 'howev',
 'money',
 'spent',
 'realli',
 'come',
 'binder',
 'loos',
 'leaf']

In [11]:
data[0]

"The materials arrived early and were in excellent condition.  However for the money spent they really should've come with a binder and not just loose leaf."

## Part 3: TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# define vectorizer parameters
# TfidfVectorizer will help us to create tf-idf matrix
# max_df : maximum document frequency for the given word
# min_df : minimum document frequency for the given word
# max_features: maximum number of words
# use_idf: if not true, we only calculate tf
# stop_words : built-in stop words
# tokenizer: how to tokenize the document
# ngram_range: (min_value, max_value), eg. (1, 3) means the result will include 1-gram, 2-gram, 3-gram
tfidf_model = TfidfVectorizer(
    tokenizer=tokenization_and_stemming,
    stop_words='english',
    ngram_range=(1, 1),
    use_idf=True,
    min_df=0.01,
    max_df=0.99,
    max_features=1000,
)

# learn the vocabulary from the reviews and build the review-by-term TF-IDF matrix
tfidf_matrix = tfidf_model.fit_transform(data)

n_reviews, n_terms = tfidf_matrix.shape
print ("In total, there are " + str(n_reviews) + " reviews and " + str(n_terms) + " terms.")



In total, there are 20000 reviews and 823 terms.


In [13]:
# save the words identified by TF-IDF
tf_selected_words = tfidf_model.get_feature_names_out()

## Part 4: K-means clustering

In [14]:
# k-means clustering
from sklearn.cluster import KMeans

# number of clusters to partition the reviews into
num_clusters = 4

# K-means starts from randomly chosen centroids. Pin the seed, and the number
# of restarts (whose default differs between scikit-learn releases), so that
# cluster ids, sizes and keywords are reproducible under Restart & Run All.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
km.fit(tfidf_matrix)

# cluster id of every row of tfidf_matrix, as a plain Python list
clusters = km.labels_.tolist()

In [15]:
# Pair each clustered review with its K-means cluster id. `data` is the exact
# list that was vectorized, so reviews and labels line up row for row without
# repeating the 20000 slice on `df`.
product = {'review': data, 'cluster': clusters}
frame = pd.DataFrame(product, columns=['review', 'cluster'])

In [16]:
frame.head(10)

Unnamed: 0,review,cluster
0,The materials arrived early and were in excell...,0
1,I am really enjoying this book with the worksh...,0
2,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",0
3,This book was missing pages!!! Important pages...,0
4,I have used LearnSmart and can officially say ...,0
5,"Strong backgroung, good read, quite up to date...",0
6,If you live on Mars and never heard of the int...,0
7,i got this book on amazon and it ended up savi...,0
8,I was very happy with this purchase because th...,0
9,Recieved in a timely manner- book in great con...,0


In [17]:
print ("Number of reviews included in each cluster:")
frame['cluster'].value_counts().to_frame()

Number of reviews included in each cluster:


Unnamed: 0,cluster
0,12156
1,4029
2,2850
3,965


In [18]:
print ("<Document clustering result by K-means>")

# Each row of km.cluster_centers_ has one weight per vocabulary term. argsort is
# ascending, so reversing the columns lists term indices from heaviest to lightest.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# cluster id -> its 15 highest-weighted terms (change the slice for more or fewer)
Cluster_keywords_summary = {
    i: [tf_selected_words[ind] for ind in order_centroids[i, :15]]
    for i in range(num_clusters)
}

for i, keywords in Cluster_keywords_summary.items():
    print ("Cluster " + str(i) + " words:" + "".join(word + "," for word in keywords))

<Document clustering result by K-means>
Cluster 0 words:use,product,program,work,great,good,version,like,time,year,easi,learn,need,money,tri,
Cluster 1 words:window,instal,xp,comput,work,run,norton,problem,program,use,os,product,version,upgrad,new,
Cluster 2 words:game,play,love,fun,kid,old,great,like,enjoy,nanci,year,realli,son,daughter,learn,
Cluster 3 words:offic,microsoft,ms,use,word,version,mac,product,work,suit,need,price,document,excel,instal,


## Part 5: Topic Modeling - Latent Dirichlet Allocation

In [19]:
# use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
# LDA fitting is randomly initialised; fix the seed so topic numbering and the
# keyword tables are reproducible across kernel restarts.
# NOTE(review): LDA is usually fitted on raw term counts; here it is fitted on the TF-IDF matrix — confirm this is intended.
lda = LatentDirichletAllocation(n_components=4, random_state=42)

In [20]:
# document topic matrix for tfidf_matrix_lda
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape)
print(lda_output)

(20000, 4)
[[0.07331403 0.06696059 0.79033366 0.06939171]
 [0.08337635 0.08181271 0.0872994  0.74751153]
 [0.32904769 0.07440357 0.52510464 0.0714441 ]
 ...
 [0.05156374 0.84868997 0.04991719 0.0498291 ]
 [0.03660801 0.89281158 0.03558806 0.03499235]
 [0.06137191 0.8130949  0.06461468 0.06091851]]


In [21]:
# topics and words matrix
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

(4, 823)
[[ 13.41443756  49.12097535  13.81666443 ...   2.22903622 186.65130569
   12.96054006]
 [  9.80149328  63.225375    25.44650645 ... 403.66490176  88.37709587
   26.94542099]
 [ 37.94713146  76.84602521  16.88041174 ...  18.51979822  74.43364108
   15.69842163]
 [  7.37752561  30.52708713  25.9790145  ...   0.78124723 159.55457001
    8.67242563]]


In [22]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(data))]

# topic weights rounded to 2 decimals, for display only
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# Dominant topic per document, taken from the UNROUNDED weights. After rounding,
# two topics can tie (e.g. 0.331 and 0.334 both become 0.33) and argmax would
# then return the lower topic index rather than the larger weight.
topic = np.argmax(lda_output, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,topic
Doc0,0.07,0.07,0.79,0.07,2
Doc1,0.08,0.08,0.09,0.75,3
Doc2,0.33,0.07,0.53,0.07,2
Doc3,0.08,0.07,0.78,0.07,2
Doc4,0.19,0.05,0.7,0.05,2
Doc5,0.07,0.07,0.8,0.07,2
Doc6,0.08,0.44,0.39,0.08,1
Doc7,0.33,0.06,0.54,0.07,2
Doc8,0.73,0.09,0.09,0.09,0
Doc9,0.08,0.07,0.77,0.08,2


In [23]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
1,6118
2,6001
0,4145
3,3736


In [24]:
# raw topic-word weights: one row per topic, one column per vocabulary term
print(lda.components_)

# the same matrix as a labelled frame (rows: topics, columns: TF-IDF terms)
df_topic_words = pd.DataFrame(
    lda.components_,
    index=topic_names,
    columns=tfidf_model.get_feature_names_out(),
)

df_topic_words.head()

[[ 13.41443756  49.12097535  13.81666443 ...   2.22903622 186.65130569
   12.96054006]
 [  9.80149328  63.225375    25.44650645 ... 403.66490176  88.37709587
   26.94542099]
 [ 37.94713146  76.84602521  16.88041174 ...  18.51979822  74.43364108
   15.69842163]
 [  7.37752561  30.52708713  25.9790145  ...   0.78124723 159.55457001
    8.67242563]]


Unnamed: 0,abil,abl,absolut,accept,access,account,act,activ,actual,ad,...,wors,worst,worth,write,written,wrong,x,xp,year,yes
Topic0,13.414438,49.120975,13.816664,18.391069,18.913425,113.014818,11.006753,78.394654,25.472139,26.023344,...,18.387131,26.059047,41.971626,12.911204,8.837372,30.297878,0.279184,2.229036,186.651306,12.96054
Topic1,9.801493,63.225375,25.446506,11.679278,53.933266,11.016197,65.288977,35.453149,46.191911,33.402185,...,24.641164,29.32718,47.002538,22.555755,15.74039,33.956513,69.966436,403.664902,88.377096,26.945421
Topic2,37.947131,76.846025,16.880412,5.721158,40.775366,3.19564,1.051999,4.76969,50.301633,33.772845,...,4.180134,4.160612,64.928701,75.79693,27.843203,17.068288,15.174214,18.519798,74.433641,15.698422
Topic3,7.377526,30.527087,25.979015,0.602154,0.477439,0.253825,3.921913,42.247571,33.354503,5.793535,...,0.908189,3.720994,27.706233,8.080571,2.268055,10.937177,0.256767,0.781247,159.55457,8.672426


In [26]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Despite the name, nothing is printed; the keywords are returned.

    Args:
        tfidf_model: Object whose ``get_feature_names_out()`` gives the vocabulary.
        lda_model: Object whose ``components_`` has one row of term weights per topic.
        n_words: How many terms to keep per topic.

    Returns:
        A list with one array per topic, each holding that topic's terms
        ordered from highest to lowest weight.
    """
    vocabulary = np.array(tfidf_model.get_feature_names_out())
    # argsort is ascending, so reverse it before taking the first n_words indices
    return [
        vocabulary[weights.argsort()[::-1][:n_words]]
        for weights in lda_model.components_
    ]

topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda, n_words=15)

# one row per topic, one column per keyword rank (Word 0 has the highest weight)
df_topic_words = pd.DataFrame(topic_keywords)
n_topics, n_keywords = df_topic_words.shape
df_topic_words.columns = [f'Word {rank}' for rank in range(n_keywords)]
df_topic_words.index = [f'Topic {number}' for number in range(n_topics)]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,product,quicken,use,money,year,tax,support,map,version,work,intuit,turbotax,program,return,time
Topic 1,window,instal,work,xp,comput,run,problem,use,program,norton,product,version,os,upgrad,drive
Topic 2,use,program,offic,great,product,good,easi,word,work,learn,need,price,version,like,excel
Topic 3,game,play,love,fun,kid,old,great,learn,enjoy,son,year,like,daughter,realli,time
