In [8]:
import struct
import gzip
import numpy as np
import pandas as pd
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [9]:
import random

# Open the JSON file and read the data in chunks
json_reader = pd.read_json('downloads/amazon/Home_and_Kitchen.json', lines=True, chunksize=10000)

# Create an empty list to store the sampled chunks
sampled_chunks = []

# Loop through the chunks and randomly select a subset
for chunk in json_reader:
    # Calculate the number of rows to sample
    num_rows = int(len(chunk) * 0.01)
    
    # If the number of rows to sample is greater than the chunk size, set it to the chunk size
    if num_rows >= len(chunk):
        num_rows = len(chunk)
    
    # Randomly select the rows
    random_indices = random.sample(range(len(chunk)), num_rows)
    
    # Append the sampled rows to the list
    sampled_chunks.append(chunk.iloc[random_indices])
    
# Concatenate the sampled chunks into a single dataframe
data = pd.concat(sampled_chunks, ignore_index=True)

In [10]:
# define stopwords and punctuation to remove
import string

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

In [11]:
# define lemmatizer
lemma = WordNetLemmatizer()

In [12]:
# preprocessing function
def preprocess(text):
    # lowercase text
    text = text.lower()
    # remove punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # tokenize text
    tokens = nltk.word_tokenize(text)
    # remove stopwords and words with length less than 3
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    # lemmatize words
    tokens = [lemma.lemmatize(word) for word in tokens]
    return tokens


In [13]:
#download stopwords, wordnet, punkt
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tanvimurke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tanvimurke/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tanvimurke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
documents = []
for index, row in data.iterrows():
    if 'reviewText' in row and isinstance(row['reviewText'], str):
        documents.append(preprocess(row['reviewText']))


In [17]:
# create dictionary and bag of words
dictionary = Dictionary(documents)
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [18]:
# train LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=10, alpha='auto', eta='auto')


In [19]:
# evaluate coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score (LDA):', coherence_lda)

Coherence Score (LDA): 0.5357118632614479


In [20]:
# print topics and their top words
for index, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(index, topic))

Topic: 0 
Words: 0.027*"little" + 0.023*"fit" + 0.020*"put" + 0.019*"small" + 0.018*"cute" + 0.014*"piece" + 0.014*"need" + 0.013*"hold" + 0.012*"top" + 0.012*"doesnt"
Topic: 1 
Words: 0.022*"unit" + 0.020*"blanket" + 0.017*"heat" + 0.016*"instruction" + 0.014*"turn" + 0.012*"low" + 0.010*"name" + 0.010*"oven" + 0.009*"feature" + 0.009*"minute"
Topic: 2 
Words: 0.067*"one" + 0.034*"time" + 0.022*"get" + 0.021*"bought" + 0.020*"used" + 0.017*"year" + 0.016*"two" + 0.015*"still" + 0.015*"day" + 0.015*"pillow"
Topic: 3 
Words: 0.026*"would" + 0.018*"like" + 0.017*"gift" + 0.016*"item" + 0.015*"picture" + 0.014*"got" + 0.013*"came" + 0.013*"set" + 0.011*"even" + 0.011*"received"
Topic: 4 
Words: 0.028*"cut" + 0.025*"food" + 0.021*"bright" + 0.021*"container" + 0.021*"bowl" + 0.019*"pictured" + 0.015*"oil" + 0.015*"steel" + 0.015*"blade" + 0.014*"jar"
Topic: 5 
Words: 0.060*"glass" + 0.051*"cup" + 0.051*"water" + 0.040*"lid" + 0.040*"coffee" + 0.033*"mug" + 0.033*"bottle" + 0.031*"hot" + 0.

In [21]:
#train other LDA model
lda_model2 = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=10, alpha='auto', eta='auto')


In [22]:
# evaluate coherence score
coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda2 = coherence_model_lda2.get_coherence()
print('Coherence Score (LDA):', coherence_lda2)

Coherence Score (LDA): 0.48086689658163817


In [23]:
# print topics and their top words
for index, topic in lda_model2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(index, topic))

Topic: 0 
Words: 0.072*"curtain" + 0.059*"pan" + 0.045*"shower" + 0.032*"steel" + 0.030*"pot" + 0.028*"oil" + 0.027*"cake" + 0.026*"stick" + 0.025*"stainless" + 0.025*"cooking"
Topic: 1 
Words: 0.071*"cup" + 0.071*"water" + 0.055*"coffee" + 0.046*"mug" + 0.043*"hot" + 0.031*"tea" + 0.027*"drink" + 0.026*"cold" + 0.024*"pictured" + 0.024*"filter"
Topic: 2 
Words: 0.106*"review" + 0.035*"read" + 0.031*"instruction" + 0.025*"birthday" + 0.022*"discount" + 0.021*"included" + 0.021*"shown" + 0.019*"opinion" + 0.019*"tray" + 0.018*"display"
Topic: 3 
Words: 0.073*"money" + 0.070*"worth" + 0.056*"smell" + 0.047*"photo" + 0.042*"bad" + 0.033*"nothing" + 0.028*"waste" + 0.027*"assemble" + 0.025*"pay" + 0.022*"wrong"
Topic: 4 
Words: 0.166*"put" + 0.149*"cute" + 0.087*"table" + 0.074*"together" + 0.061*"awesome" + 0.044*"kid" + 0.033*"lovely" + 0.025*"adorable" + 0.021*"putting" + 0.018*"compliment"
Topic: 5 
Words: 0.189*"beautiful" + 0.095*"arrived" + 0.076*"bag" + 0.052*"frame" + 0.049*"pleas

In [24]:
#train other LDA model
lda_model3 = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=5, alpha='auto', eta='auto')


In [25]:
# evaluate coherence score
coherence_model_lda3 = CoherenceModel(model=lda_model3, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda3 = coherence_model_lda3.get_coherence()
print('Coherence Score (LDA):', coherence_lda3)

Coherence Score (LDA): 0.4694422209436747


In [26]:
# print topics and their top words
for index, topic in lda_model3.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(index, topic))

Topic: 0 
Words: 0.094*"cup" + 0.094*"water" + 0.073*"coffee" + 0.062*"mug" + 0.056*"hot" + 0.041*"tea" + 0.036*"wine" + 0.029*"drink" + 0.025*"filter" + 0.023*"machine"
Topic: 1 
Words: 0.048*"black" + 0.039*"show" + 0.039*"bigger" + 0.036*"plate" + 0.029*"pattern" + 0.027*"true" + 0.023*"dark" + 0.022*"okay" + 0.022*"spot" + 0.021*"brown"
Topic: 2 
Words: 0.219*"expected" + 0.053*"heat" + 0.034*"zipper" + 0.032*"burn" + 0.031*"temperature" + 0.028*"grinder" + 0.022*"bath" + 0.019*"memory" + 0.019*"guy" + 0.019*"lol"
Topic: 3 
Words: 0.238*"perfect" + 0.138*"gift" + 0.093*"loved" + 0.068*"sheet" + 0.057*"daughter" + 0.049*"christmas" + 0.042*"friend" + 0.041*"son" + 0.037*"husband" + 0.034*"cool"
Topic: 4 
Words: 0.162*"good" + 0.162*"nice" + 0.150*"quality" + 0.094*"price" + 0.057*"pretty" + 0.026*"sturdy" + 0.025*"high" + 0.024*"pleased" + 0.023*"fast" + 0.020*"shipping"
Topic: 5 
Words: 0.024*"get" + 0.022*"time" + 0.014*"even" + 0.013*"back" + 0.013*"dont" + 0.011*"didnt" + 0.010*

In [27]:
#pickle
import pickle

with open("Downloads/topic.model.pkl", "wb") as file:
    pickle.dump(lda_model, file)