## Part 1: Load Data

In [3]:
import numpy as np
import pandas as pd
import nltk
# import gensim
import gzip
import json

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/zheng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# load data into dataframe

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('/Users/zheng/Desktop/dataset/Software.json.gz')

In [5]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"03 11, 2014",A240ORQ2LF9LUI,77613252,{'Format:': ' Loose Leaf'},Michelle W,The materials arrived early and were in excell...,Material Great,1394496000,,
1,4.0,True,"02 23, 2014",A1YCCU0YRLS0FE,77613252,{'Format:': ' Loose Leaf'},Rosalind White Ames,I am really enjoying this book with the worksh...,Health,1393113600,,
2,1.0,True,"02 17, 2014",A1BJHRQDYVAY2J,77613252,{'Format:': ' Loose Leaf'},Allan R. Baker,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",ARE YOU KIDING ME?,1392595200,7.0,
3,3.0,True,"02 17, 2014",APRDVZ6QBIQXT,77613252,{'Format:': ' Loose Leaf'},Lucy,This book was missing pages!!! Important pages...,missing pages!!,1392595200,3.0,
4,5.0,False,"10 14, 2013",A2JZTTBSLS1QXV,77775473,,Albert V.,I have used LearnSmart and can officially say ...,Best study product out there!,1381708800,,


In [6]:
# check missing value
df.isnull().sum()

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             225035
reviewerName          24
reviewText            66
summary               56
unixReviewTime         0
vote              331583
image             457928
dtype: int64

In [8]:
# remove missing value
df.dropna(subset=['reviewText'],inplace=True)

In [9]:
df.reset_index(inplace=True, drop=True)

In [11]:
# use the first 20000 data as our training data
data = df.loc[:19999, 'reviewText'].tolist()

In [12]:
data

["The materials arrived early and were in excellent condition.  However for the money spent they really should've come with a binder and not just loose leaf.",
 'I am really enjoying this book with the worksheets that make you review your goals, what to do when you do not make it, it reminds me  of my human sexuality classwork.',
 'IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR MONEY ON THIS SO CALLED BOOK! $140.00 FOR A "BOOK" THAT ISIN\'T EVEN BOUND LOOSE LEAFS, THAT I HAD TO PROVIDE MY OWN BINDER FOR. TURNS OUT YOU CAN BUY ACCESS TO THE BOOK AT MCGRAW HILL CONNECT CORE FOR $70.00\n\nTHIS BOOK IS A COMPLETE WASTE OF MONEY!',
 "This book was missing pages!!! Important pages. I couldn't answer some test questions because of it!! I have never had this happen before.",
 'I have used LearnSmart and can officially say that this is an amazing study tool that quickly and simply adapts to your style of learning. You can access it at anytime and it is on the go! Once you start using LearnSmart 

## Part 2: Tokenizing and Stemming

In [14]:
# use nltk's English stopwords.
stopwords = nltk.corpus.stopwords.words('english') #stopwords.append("n't")
stopwords.append("'s")
stopwords.append("'m")
stopwords.append("n") 
stopwords.append("software")

print ("We use " + str(len(stopwords)) + " stop-words from nltk library.")
print (stopwords[:10])

We use 183 stop-words from nltk library.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [15]:
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer 

stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    tokens = []
    # exclude stop words and tokenize the document, generate a list of string 
    for word in nltk.word_tokenize(text):
        if word.lower() not in stopwords:
            tokens.append(word.lower())

    filtered_tokens = []
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if token.isalpha():
            filtered_tokens.append(token)
            
    # stemming
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [16]:
tokenization_and_stemming(data[0])

['materi',
 'arriv',
 'earli',
 'excel',
 'condit',
 'howev',
 'money',
 'spent',
 'realli',
 'come',
 'binder',
 'loos',
 'leaf']

In [17]:
data[0]

"The materials arrived early and were in excellent condition.  However for the money spent they really should've come with a binder and not just loose leaf."

## Part 3: TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# define vectorizer parameters
# TfidfVectorizer will help us to create tf-idf matrix
# max_df : maximum document frequency for the given word
# min_df : minimum document frequency for the given word
# max_features: maximum number of words
# use_idf: if not true, we only calculate tf
# stop_words : built-in stop words
# tokenizer: how to tokenize the document
# ngram_range: (min_value, max_value), eg. (1, 3) means the result will include 1-gram, 2-gram, 3-gram
tfidf_model = TfidfVectorizer(max_df=0.99, max_features=1000,
                              min_df=0.01, stop_words='english',
                              use_idf=True, tokenizer=tokenization_and_stemming, ngram_range=(1,1))

tfidf_matrix = tfidf_model.fit_transform(data) #fit the vectorizer to synopses

print ("In total, there are " + str(tfidf_matrix.shape[0]) + \
      " reviews and " + str(tfidf_matrix.shape[1]) + " terms.")



In total, there are 20000 reviews and 823 terms.


In [19]:
# save the words identified by TF-IDF
tf_selected_words = tfidf_model.get_feature_names_out()

In [20]:
tf_selected_words

array(['abil', 'abl', 'absolut', 'accept', 'access', 'account', 'act',
       'activ', 'actual', 'ad', 'add', 'addit', 'address', 'admit',
       'adob', 'adult', 'advanc', 'advantag', 'adventur', 'advertis',
       'advic', 'age', 'ago', 'agre', 'ahead', 'allow', 'alon', 'alreadi',
       'altern', 'alway', 'amaz', 'amazon', 'anim', 'annoy', 'anoth',
       'answer', 'antivirus', 'anymor', 'anyon', 'anyth', 'app', 'appar',
       'appear', 'appl', 'applic', 'area', 'arriv', 'art', 'ask', 'assum',
       'attempt', 'audio', 'automat', 'avail', 'averag', 'avoid', 'awar',
       'away', 'awesom', 'background', 'backup', 'bad', 'bank', 'base',
       'basic', 'beat', 'beauti', 'becom', 'begin', 'beginn', 'believ',
       'best', 'better', 'bewar', 'bibl', 'big', 'bit', 'block', 'blue',
       'book', 'boot', 'bore', 'bother', 'bought', 'box', 'boy', 'brand',
       'break', 'bring', 'browser', 'buck', 'bug', 'buggi', 'build',
       'built', 'burn', 'busi', 'button', 'buy', 'ca', 'cabl', 

## Part 4: K-means clustering

In [21]:
# k-means clustering
from sklearn.cluster import KMeans

num_clusters = 4

# number of clusters
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [23]:
# create DataFrame films from all of the input files.
product = { 'review': df[:20000].reviewText, 'cluster': clusters}
frame = pd.DataFrame(product, columns = ['review', 'cluster'])

In [24]:
frame.head(10)

Unnamed: 0,review,cluster
0,The materials arrived early and were in excell...,1
1,I am really enjoying this book with the worksh...,1
2,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",1
3,This book was missing pages!!! Important pages...,1
4,I have used LearnSmart and can officially say ...,1
5,"Strong backgroung, good read, quite up to date...",1
6,If you live on Mars and never heard of the int...,1
7,i got this book on amazon and it ended up savi...,1
8,I was very happy with this purchase because th...,1
9,Recieved in a timely manner- book in great con...,1


In [27]:
print ("Number of reviews included in each cluster:")
frame['cluster'].value_counts().to_frame()

Number of reviews included in each cluster:


Unnamed: 0,cluster
1,11721
2,4470
0,2840
3,969


In [29]:
print ("<Document clustering result by K-means>")

#km.cluster_centers_ denotes the importances of each items in centroid.
#We need to sort it in decreasing-order and get the top k items.
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

Cluster_keywords_summary = {}
for i in range(num_clusters):
    print ("Cluster " + str(i) + " words:", end='')
    Cluster_keywords_summary[i] = []
    for ind in order_centroids[i, :15]: #replace 15 with n words per cluster
        Cluster_keywords_summary[i].append(tf_selected_words[ind])
        print (tf_selected_words[ind] + ",", end='')
    print ()

<Document clustering result by K-means>
Cluster 0 words:game,play,love,fun,kid,old,great,like,enjoy,nanci,year,realli,son,daughter,learn,
Cluster 1 words:use,product,program,great,work,good,version,like,year,learn,time,easi,need,money,tri,
Cluster 2 words:window,instal,xp,work,comput,run,norton,problem,use,program,os,product,version,upgrad,new,
Cluster 3 words:offic,microsoft,ms,use,word,version,product,mac,work,suit,need,price,document,excel,instal,


## Part 5: Topic Modeling - Latent Dirichlet Allocation

In [32]:
# use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=4)

In [33]:
# document topic matrix for tfidf_matrix_lda
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape)
print(lda_output)

(20000, 4)
[[0.07147118 0.06972759 0.07074116 0.78806008]
 [0.74220312 0.0834149  0.08248152 0.09190046]
 [0.07131468 0.07383415 0.42878606 0.4260651 ]
 ...
 [0.04964509 0.05021523 0.8504835  0.04965619]
 [0.03519834 0.03543648 0.89395471 0.03541046]
 [0.06134447 0.06662479 0.80463716 0.06739358]]


In [34]:
# topics and words matrix
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

(4, 823)
[[  7.88236373  28.41249286  24.59269452 ...   0.57593559 155.09717163
    8.97415583]
 [ 52.3217579   88.14501638  24.69236339 ... 255.5884704  139.16548093
   33.43575762]
 [  4.80072755  68.67631551  24.66072713 ... 158.6032158  181.17187693
   18.95834792]
 [  3.53573874  34.48563794   8.17681208 ...  10.42736163  33.58208316
    2.90854695]]


In [35]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(data))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,topic
Doc0,0.07,0.07,0.07,0.79,3
Doc1,0.74,0.08,0.08,0.09,0
Doc2,0.07,0.07,0.43,0.43,2
Doc3,0.07,0.77,0.07,0.08,1
Doc4,0.05,0.26,0.05,0.64,3
Doc5,0.07,0.07,0.07,0.79,3
Doc6,0.08,0.08,0.51,0.33,2
Doc7,0.07,0.07,0.07,0.79,3
Doc8,0.08,0.09,0.09,0.74,3
Doc9,0.07,0.08,0.2,0.64,3


In [36]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
1,7891
2,5540
0,3633
3,2936


In [38]:
# topic word matrix
print(lda.components_)
# topic-word matrix
df_topic_words = pd.DataFrame(lda.components_)

# column and index
df_topic_words.columns = tfidf_model.get_feature_names_out()
df_topic_words.index = topic_names

df_topic_words.head()

[[  7.88236373  28.41249286  24.59269452 ...   0.57593559 155.09717163
    8.97415583]
 [ 52.3217579   88.14501638  24.69236339 ... 255.5884704  139.16548093
   33.43575762]
 [  4.80072755  68.67631551  24.66072713 ... 158.6032158  181.17187693
   18.95834792]
 [  3.53573874  34.48563794   8.17681208 ...  10.42736163  33.58208316
    2.90854695]]


Unnamed: 0,abil,abl,absolut,accept,access,account,act,activ,actual,ad,...,wors,worst,worth,write,written,wrong,x,xp,year,yes
Topic0,7.882364,28.412493,24.592695,0.731538,0.902446,0.257341,3.846632,41.714075,32.457517,6.677481,...,1.127633,3.763253,29.033741,9.002739,2.725581,12.195847,0.256062,0.575936,155.097172,8.974156
Topic1,52.321758,88.145016,24.692363,14.716832,70.653093,107.582229,24.316422,21.557142,63.244348,56.266838,...,18.597698,16.091537,96.161314,65.485065,23.324281,38.63213,83.340583,255.58847,139.165481,33.435758
Topic2,4.800728,68.676316,24.660727,20.526706,40.775252,19.38489,52.852066,94.350339,35.832019,32.105075,...,28.126479,43.153361,36.317152,17.826107,13.946979,35.069076,1.135813,158.603216,181.171877,18.958348
Topic3,3.535739,34.485638,8.176812,0.418583,1.768705,0.25602,0.254522,3.243508,23.786303,3.942515,...,0.264807,0.259682,20.096891,27.030548,14.692178,6.362802,0.944143,10.427362,33.582083,2.908547
