# Name: Sanika Vaidya
# Andrew ID: sanikav

## Imports

In [None]:
import json
import pandas as pd
from google.colab import drive
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
from gensim.utils import tokenize
from nltk.stem import WordNetLemmatizer
from gensim import models, similarities
from gensim.corpora.dictionary import Dictionary
import numpy as np
from gensim.models.coherencemodel import CoherenceModel
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## Data Loading

### Here we load ASINs from the product metadata into a list. We keep only a few closely related product categories: in this case, products related to pizza.

In [None]:
# Expected time to run: 5 min
drive.mount("/content/drive", force_remount=True)

PATH = '/content/drive/MyDrive/amazon/meta_Home_and_Kitchen.json/meta_Home_and_Kitchen.json'
desired_categories = { "Pizza Cutters", "Pizza Pans & Stones", "Pizza Peels", "Countertop Pizza Ovens"}

# Scan the metadata file, which holds one JSON object per line, and collect the
# ASIN of every product that lists at least one of the desired categories.
asins = []
try:
    with open(PATH, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads raises on an empty string; skip blank lines.
                continue
            obj = json.loads(line)
            if any(c in obj.get("category", []) for c in desired_categories):
                asins.append(obj["asin"])
except FileNotFoundError as err:
    # Every later cell needs these ASINs. Stopping here gives a clear error
    # instead of letting the notebook continue with an empty list and fail
    # later with an unrelated message.
    raise FileNotFoundError(f"Metadata file not found: {PATH}") from err

df = pd.DataFrame({"asin": asins})
print(f"Found {len(df)} ASINs with desired categories: {desired_categories}")


Mounted at /content/drive
Found 1355 ASINs with desired categories: {'Countertop Pizza Ovens', 'Pizza Cutters', 'Pizza Peels', 'Pizza Pans & Stones'}


In [None]:
# Deduplicate the ASINs into a set; the review-loading cell tests each
# review's ASIN against it with `in`.
asins_set = set(asins)

In [None]:
# Show the size and a small sorted sample instead of echoing the whole set,
# which floods the notebook output.
print(f"{len(asins_set)} unique ASINs")
sorted(asins_set)[:10]

### Here we keep only the reviews whose ASIN matches one of the ASINs pulled from the metadata.

In [None]:
# Expected time to run: 4 min 30 sec
PATH_review = '/content/drive/MyDrive/amazon/Home_and_Kitchen.json/Home_and_Kitchen.json'

# Despite the name, each element is the full parsed review record (a dict),
# not just its text. The reviewText column is selected in a later cell.
review_texts = []
# Read with an explicit UTF-8 encoding, matching the metadata cell, so the
# result does not depend on the platform's default encoding.
with open(PATH_review, "rt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # json.loads raises on an empty string; skip blank lines.
            continue
        review = json.loads(line)
        if review['asin'] in asins_set:
            review_texts.append(review)


In [None]:
# Show the count and the first few records instead of echoing the whole list.
# NOTE(review): the raw records may carry reviewer-identifying fields; confirm
# before sharing the rendered output.
print(f"{len(review_texts)} matching reviews")
review_texts[:3]

In [None]:
# Note: this rebinds `df`, replacing the ASIN frame built in the metadata cell.
df = pd.DataFrame(review_texts)
# .copy() gives df_reviews its own data. Without it, df_reviews is a slice of
# `df`, and the later cells that assign the reviewText_clean column raise
# SettingWithCopyWarning.
df_reviews = df[["reviewText"]].copy()
df_reviews.head()

Unnamed: 0,reviewText
0,"This is a sturdy, 2 3/4-inch pizza wheel, whic..."
1,"This is a sturdy, big, 4-inch pizza wheel, com..."
2,"This is a sturdy, 2 3/4-inch pizza wheel, whic..."
3,It is definitely not one of the best OXO produ...
4,I love all of Oxo's Good Grips utensils. They...


### Remove stop words

In [None]:
# Keep the word set under its own name. Rebinding `stopwords` would replace the
# imported NLTK corpus reader, so running this cell a second time would fail on
# `stopwords.words`.
english_stopwords = set(stopwords.words('english'))

def removing_stopwords(text):
    """Return `text` with English stop words removed.

    The value is coerced with str() and split on whitespace. Each word is
    compared case-insensitively, so capitalised stop words such as "This" or
    "The" are dropped as well. The surviving words keep their original casing
    and are joined with single spaces.
    """
    return " ".join([word for word in str(text).split() if word.lower() not in english_stopwords])

df_reviews['reviewText_clean'] = df_reviews['reviewText'].apply(removing_stopwords)
df_reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews['reviewText_clean'] = df_reviews['reviewText'].apply(lambda text: removing_stopwords(text))


Unnamed: 0,reviewText,reviewText_clean
0,"This is a sturdy, 2 3/4-inch pizza wheel, whic...","This sturdy, 2 3/4-inch pizza wheel, produces ..."
1,"This is a sturdy, big, 4-inch pizza wheel, com...","This sturdy, big, 4-inch pizza wheel, complete..."
2,"This is a sturdy, 2 3/4-inch pizza wheel, whic...","This sturdy, 2 3/4-inch pizza wheel, produces ..."
3,It is definitely not one of the best OXO produ...,It definitely one best OXO products. The one a...
4,I love all of Oxo's Good Grips utensils. They...,I love Oxo's Good Grips utensils. They excelle...
...,...,...
46078,This pan is great. I used to use straight pans...,"This pan great. I used use straight pans, coul..."
46079,It works as intended. The pizzas i buy fit in ...,It works intended. The pizzas buy fit perfectl...
46080,Used it moments after bringing it in the house...,Used moments bringing house delivery.....perfe...
46081,Pan was dammege and too small,Pan dammege small


## Convert all reviews to lowercase, then tokenize and lemmatize.

In [None]:
lemmatizer = WordNetLemmatizer()

def _lowercase_tokenize_lemmatize(text):
    """Lower-case one cleaned review string, tokenize it, and lemmatize every token."""
    return [lemmatizer.lemmatize(tok) for tok in tokenize(text.lower())]

# After this step reviewText_clean holds a list of lemmatized tokens per review.
df_reviews['reviewText_clean'] = df_reviews['reviewText_clean'].apply(_lowercase_tokenize_lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews['reviewText_clean'] = df_reviews['reviewText_clean'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews['reviewText_clean'] = df_reviews['reviewText_clean'].apply(lambda x: list(tokenize(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews['reviewText_clean']

In [None]:
# Inspect the frame after tokenization: reviewText_clean now holds a list of
# tokens per review instead of a string.
df_reviews

Unnamed: 0,reviewText,reviewText_clean
0,"This is a sturdy, 2 3/4-inch pizza wheel, whic...","[this, sturdy, inch, pizza, wheel, produce, cl..."
1,"This is a sturdy, big, 4-inch pizza wheel, com...","[this, sturdy, big, inch, pizza, wheel, comple..."
2,"This is a sturdy, 2 3/4-inch pizza wheel, whic...","[this, sturdy, inch, pizza, wheel, produce, cl..."
3,It is definitely not one of the best OXO produ...,"[it, definitely, one, best, oxo, product, the,..."
4,I love all of Oxo's Good Grips utensils. They...,"[i, love, oxo, s, good, grip, utensil, they, e..."
...,...,...
46078,This pan is great. I used to use straight pans...,"[this, pan, great, i, used, use, straight, pan..."
46079,It works as intended. The pizzas i buy fit in ...,"[it, work, intended, the, pizza, buy, fit, per..."
46080,Used it moments after bringing it in the house...,"[used, moment, bringing, house, delivery, perf..."
46081,Pan was dammege and too small,"[pan, dammege, small]"


### Remove one- and two-letter words, because most of them are filler such as "a", "it" and "us".

In [None]:
def _drop_short_tokens(tokens):
    """Return only the tokens that are at least three characters long."""
    return [tok for tok in tokens if len(tok) >= 3]

df_reviews['reviewText_clean'] = df_reviews['reviewText_clean'].apply(_drop_short_tokens)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews['reviewText_clean'] = df_reviews['reviewText_clean'].apply(lambda lst: [s for s in lst if len(s) > 2])


## Create Dictionary and corpus.

In [None]:
# Build the token <-> id vocabulary from the tokenized reviews, then encode
# every review as a bag-of-words vector against that vocabulary.
dictionary = Dictionary(df_reviews['reviewText_clean'])
corpus = list(map(dictionary.doc2bow, df_reviews['reviewText_clean']))

## Create a LDA model and test different parameters.

In [None]:
# Baseline run: 20 topics, 2 passes over the corpus, 4 workers.
num_topics = 20
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=2, workers=4, dtype=np.float64, random_state=42)

In [None]:
# Score the baseline model with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.47112892227369246

In [None]:
# Expected time to run: 5 min
# Same as the baseline but with 7 passes over the corpus instead of 2.
num_topics = 20
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model2 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=7, workers=4, dtype=np.float64, random_state=42)

In [None]:
# Score model2 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model2,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.5104484483845962

In [None]:
# Expected time to run: 6 min
# 20 topics with 10 passes over the corpus, 4 workers.
num_topics = 20
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model3 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=4, dtype=np.float64, random_state=42)

In [None]:
# Score model3 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model3,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.5165276703651449

In [None]:
# Expected time to run: 7 min
# Fewer topics: 10 topics with 10 passes over the corpus, 4 workers.
num_topics = 10
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model4 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=4, dtype=np.float64, random_state=42)

In [None]:
# Score the current model4 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model4,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.5054444052560514

In [None]:
# Expected time to run: 7 min
# Fewer topics again: 5 topics with 10 passes over the corpus, 4 workers.
# NOTE: this rebinds `model4`, so the 10-topic model trained in the previous
# training cell is no longer reachable after this cell runs.
num_topics = 5
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model4 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=4, dtype=np.float64, random_state=42)

In [None]:
# Score the current model4 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model4,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.49049587486934787

In [None]:
# Expected time to run: 4 min
# 20 topics, 10 passes, with the worker count raised to 6.
num_topics = 20
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model5 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=6, dtype=np.float64, random_state=42)

In [None]:
# Score model5 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model5,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.5049064684744217

In [None]:
# Expected time to run: 4 min
# 20 topics, 10 passes, with the worker count lowered to 2. This is the model
# that the later cells inspect and save.
num_topics = 20
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model6 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=2, dtype=np.float64, random_state=42)

In [None]:
# Score model6 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model6,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.5503422903109361

In [None]:
# Expected time to run: 3 min
# 20 topics, 10 passes. `workers` is not passed, so gensim's default worker
# count is used.
num_topics = 20
# random_state seeds the model's random initialisation so that runs with
# different settings are compared from the same starting point rather than on
# seed noise. Multi-worker training may still not be bit-for-bit repeatable.
model7 = models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, dtype=np.float64, random_state=42)

In [None]:
# Score model7 with the c_v coherence measure and display the value.
cm = CoherenceModel(
    model=model7,
    texts=df_reviews['reviewText_clean'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.5281542264977968

## Most salient words per topic.

In [None]:
# Fetch the topic-by-term weight matrix once rather than on every iteration.
topic_word = model6.get_topics()

# Take the number of topics from model6 itself. The global `num_topics` is
# reassigned by every training cell, so it only matches model6 if no other
# training cell ran afterwards with a different value.
for ix in range(topic_word.shape[0]):
    # argsort is ascending: keep the last ten term ids and reverse them so the
    # highest-weight word is printed first.
    top10 = np.argsort(topic_word[ix])[-10:][::-1]
    print(f'{ix}:  {" ".join([dictionary[index] for index in top10])}')

0:  cast iron this make pizza great use clean love easy
1:  larger better size bit much smaller the one screen inch
2:  class for hook described product link normal stainless exactly steel
3:  pizza very nice price product well quality good work great
4:  onto crust get slide make peel oven stone dough pizza
5:  purchased buy another old this love pizza bought year one
6:  this made make used crust the baking bread pizza stone
7:  first plastic back like time would one use get the
8:  amazon would fast excellent quality arrived item received review product
9:  used smell cooking food foreman waffle ceramic the plate grill
10:  half used use cracked time chef first piece broke money
11:  ever one dish deep like this best home make pizza
12:  crisp this non crispy bottom the stick crust pizza pan
13:  use toaster work perfectly great fit size perfect oven pizza
14:  handle the this cutting wheel sharp blade cut cutter pizza
15:  use cooking the get minute pizza top cook heat oven
16:  ge

In [None]:
# Peek at the raw weight vector of topic 0. Its positions are the term ids
# that the cell above looks up in `dictionary`.
model6.get_topics()[0]

array([1.33734247e-06, 1.33728410e-06, 1.33728411e-06, ...,
       1.33728410e-06, 1.33728410e-06, 1.33728410e-06])

## Dump model into a pickle file.

In [None]:
# Serialize the chosen model (model6) to 'topic.model' in the working directory.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open('topic.model', 'wb') as model_file:
    pickle.dump(model6, model_file)