# Data Preparation 

## Import Packages and Data Summary

In [2]:
import logging
from re import sub
from time import time 
import numpy as np
import pandas as pd
import multiprocessing
#from unidecode import unidecode
from collections import defaultdict
from googletrans import Translator, constants

# Packages for data preprocessing
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# Gensim packages
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.models.phrases import Phrases, Phraser

# Packages for modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans

In [2]:
# Raw File
file_path = "../raw_data/amazon.csv"
df = pd.read_csv(file_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

In [4]:
# Cleaned dataframe with just reviews
file_path = "../raw_data/reviews_cleaned.csv"
df_reviews= pd.read_csv(file_path)
len(df_reviews)

11012

In [5]:
df_reviews = df_reviews.dropna()
len(df_reviews.dropna())

11008

In [6]:
# Dataframe with reviews and sentiment score
file_path_sentiments = "../raw_data/reviews_analyzed.csv"
df_sentiment = pd.read_csv(file_path_sentiments)
df_sentiment.head()

Unnamed: 0,review_content,Sentiment
0,Looks durable Charging is fine tooNo complains,4
1,"Charging is really fast, good product.",4
2,Till now satisfied with the quality.,4
3,This is a good product . The charging speed is...,4
4,"Good quality, would recommend",4


In [7]:
len(df_sentiment)

11008

In [8]:
df_sentiment.groupby('Sentiment').count()

Unnamed: 0_level_0,review_content
Sentiment,Unnamed: 1_level_1
1,751
2,773
3,1995
4,4135
5,3354


## Add Categories and Reviews to a Dataframe

In [9]:
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [10]:
df.groupby('category').count()

Unnamed: 0_level_0,product_id,product_name,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Car&Motorbike|CarAccessories|InteriorAccessories|AirPurifiers&Ionizers,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Computers&Accessories|Accessories&Peripherals|Adapters|USBtoUSBAdapters,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCHeadsets,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCMicrophones,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCSpeakers,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"OfficeProducts|OfficePaperProducts|Paper|Stationery|Pens,Pencils&WritingSupplies|Pens&Refills|GelInkRollerballPens",2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
"OfficeProducts|OfficePaperProducts|Paper|Stationery|Pens,Pencils&WritingSupplies|Pens&Refills|LiquidInkRollerballPens",2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
"OfficeProducts|OfficePaperProducts|Paper|Stationery|Pens,Pencils&WritingSupplies|Pens&Refills|RetractableBallpointPens",2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
"OfficeProducts|OfficePaperProducts|Paper|Stationery|Pens,Pencils&WritingSupplies|Pens&Refills|StickBallpointPens",3,3,3,3,3,3,3,3,3,3,3,3,3,3,3


In [11]:
len_df = len(df)

In [12]:
# Create new df with category, 
df_split = df.iloc[0:len_df, ]
df_split

# Split the category column into multiple columns 
# df_test[['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'cat_10', 'cat_11']] = df_test['category'].str.split('|', expand=True)
df_split['category'] = df_split['category'].str.split('|')
max_splits = df_split['category'].apply(len).max()
max_splits

7

In [13]:
# Split columns up to max range
split_columns = pd.DataFrame(df_split['category'].tolist(), columns=[f'Split_{i+1}' for i in range(max_splits)])

# Concatinate the splits with the original cdf
result_df = pd.concat([df_split, split_columns], axis=1)

In [14]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
 16  Split_

In [15]:
# Slice df to only keep 'product_id', 'product_name', 'category', 'discounted_price','actual_price', 'discount_percentage', 'rating', 'rating_count','about_product', 'review_content']
df_final = result_df.drop(columns=['user_id', 'user_name', 'review_id', 'img_link', 'product_link'])
df_final.head()

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,review_title,review_content,Split_1,Split_2,Split_3,Split_4,Split_5,Split_6,Split_7
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ199,‚Çπ349,43%,4.0,43994,"Compatible with all Type C enabled devices, be...","A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ199,"‚Çπ1,899",90%,3.9,7928,„Äê Fast Charger& Data Sync„Äë-With built-in safet...,"Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a...",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ329,‚Çπ699,53%,4.2,94363,The boAt Deuce USB 300 2 in 1 cable is compati...,"Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou...",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ154,‚Çπ399,61%,4.2,16905,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"As good as original,Decent,Good one for second...","Bought this instead of original apple, does th...",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,


In [16]:
# Split review content column 
df_final["review_content"] = df_final["review_content"].str.split(",(?!\s)", expand=False)
df_final = df_final.explode("review_content")

In [17]:
df_final

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,review_title,review_content,Split_1,Split_2,Split_3,Split_4,Split_5,Split_6,Split_7
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...","Charging is really fast, good product.",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...",Till now satisfied with the quality.,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...",This is a good product . The charging speed is...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"[Computers&Accessories, Accessories&Peripheral...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"Satisfied,Charging is really fast,Value for mo...","Good quality, would recommend",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1464,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,"[Home&Kitchen, Kitchen&HomeAppliances, SmallKi...","‚Çπ2,863","‚Çπ3,690",22%,4.3,6987,"Brand-Borosil, Specification √¢‚Ç¨‚Äú 23V ~ 5Hz;1 W...","Works perfect,Ok good product,Nice Product. Re...",Very good product,Home&Kitchen,Kitchen&HomeAppliances,SmallKitchenAppliances,SandwichMakers,,,
1464,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,"[Home&Kitchen, Kitchen&HomeAppliances, SmallKi...","‚Çπ2,863","‚Çπ3,690",22%,4.3,6987,"Brand-Borosil, Specification √¢‚Ç¨‚Äú 23V ~ 5Hz;1 W...","Works perfect,Ok good product,Nice Product. Re...","This is a pretty powerful sandwich maker, for ...",Home&Kitchen,Kitchen&HomeAppliances,SmallKitchenAppliances,SandwichMakers,,,
1464,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,"[Home&Kitchen, Kitchen&HomeAppliances, SmallKi...","‚Çπ2,863","‚Çπ3,690",22%,4.3,6987,"Brand-Borosil, Specification √¢‚Ç¨‚Äú 23V ~ 5Hz;1 W...","Works perfect,Ok good product,Nice Product. Re...","‡§¨‡•ã‡§∞‡•ã‡§∏‡§ø‡§≤ ‡§¨‡•ç‡§∞‡§æ‡§Ç‡§° ‡§ï‡§æ ‡§Ø‡§π ""‡§∏‡•á‡§Ç‡§°‡§µ‡§ø‡§ö ‡§Æ‡•á‡§ï‡§∞"" ‡§¶‡•á‡§ñ‡§®‡•á ‡§Æ‡•á‡§Ç ...",Home&Kitchen,Kitchen&HomeAppliances,SmallKitchenAppliances,SandwichMakers,,,
1464,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,"[Home&Kitchen, Kitchen&HomeAppliances, SmallKi...","‚Çπ2,863","‚Çπ3,690",22%,4.3,6987,"Brand-Borosil, Specification √¢‚Ç¨‚Äú 23V ~ 5Hz;1 W...","Works perfect,Ok good product,Nice Product. Re...",Recommend work as expected,Home&Kitchen,Kitchen&HomeAppliances,SmallKitchenAppliances,SandwichMakers,,,


In [18]:
df_final.to_csv('df_cat_split.csv')

In [19]:
df_final_1 = df_final.dropna(subset=['review_content'])
len(df_final_1)

12138

In [20]:
df_final['review_content']

0          Looks durable Charging is fine tooNo complains
0                  Charging is really fast, good product.
0                    Till now satisfied with the quality.
0       This is a good product . The charging speed is...
0                           Good quality, would recommend
                              ...                        
1464                                    Very good product
1464    This is a pretty powerful sandwich maker, for ...
1464    ‡§¨‡•ã‡§∞‡•ã‡§∏‡§ø‡§≤ ‡§¨‡•ç‡§∞‡§æ‡§Ç‡§° ‡§ï‡§æ ‡§Ø‡§π "‡§∏‡•á‡§Ç‡§°‡§µ‡§ø‡§ö ‡§Æ‡•á‡§ï‡§∞" ‡§¶‡•á‡§ñ‡§®‡•á ‡§Æ‡•á‡§Ç ...
1464                           Recommend work as expected
1464                                      Its easy tp use
Name: review_content, Length: 12138, dtype: object

In [20]:
# for index, row in df_test_1.iterrows():
    # print(row['review_content'])

In [21]:
review_content_clean.shape

NameError: name 'review_content_clean' is not defined

## Translate Reviews and Additional Text Cleaning

In [21]:
#!pip3 install googletrans==3.1.0a0
#!pip install demoji
#!pip install langdetect

from googletrans import Translator, constants
import demoji
from langdetect import detect

In [22]:
translator = Translator()

In [24]:
review_content = df_final['review_content']

In [25]:
def translate_text(text):
    translator = Translator()
    translated = translator.translate(text, src="auto", dest="en")
    return translated.text

In [26]:
def translate_if_hindi(text):
    try:
        if detect(text) == 'hi':
            translated_text = translate_text(text)
            return translated_text
    except Exception as e:
        print(e)
        print(text)
    return text

In [27]:
batch_size = 100

for i in range(0, len(df_final), batch_size):
    batch = df_final['review_content'].iloc[i:i+batch_size].tolist()
    translated_batch = [translate_if_hindi(text) for text in batch]
    df_final['review_content'].iloc[i:i+batch_size] = translated_batch

No features in text.
https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/81---F1ZgHL._SY88.jpg
No features in text.
https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/71rIggrbUCL._SY88.jpg
No features in text.
https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/61bKp9YO6wL._SY88.jpg
No features in text.

No features in text.
https://m.media-amazon.com/images/I/51112ZRE-1L._SY88.jpg
No features in text.
-
[Errno -3] Temporary failure in name resolution
1.5‡§è‡§Æ ‡§ï‡§æ ‡§°‡§æ‡§ü‡§æ ‡§ï‡•á‡§¨‡§≤ ‡§Æ‡•á‡§∞‡•á ‡§≤‡§ø‡§è ‡§¨‡§π‡•Å‡§§ ‡§π‡•Ä ‡§≤‡§æ‡§≠‡§¶‡§æ‡§Ø‡§ï ‡§π‡•à ‡•§  ‡§Æ‡•à‡§Ç ‡§á‡§∏ ‡§ï‡•á‡§¨‡§≤ ‡§ï‡•ã ‡§≤‡§æ‡§á‡§® ‡§Æ‡•á‡§Ç ‡§ö‡§æ‡§∞‡•ç‡§ú ‡§ï‡§∞‡§§‡•á ‡§∏‡§Æ‡§Ø ‡§´‡•ã‡§® ‡§¨‡§π‡•Å‡§§ ‡§Ü‡§∞‡§æ‡§Æ ‡§∏‡•á ‡§â‡§™‡§Ø‡•ã‡§ó  ‡§ï‡§∞ ‡§™‡§æ ‡§∞‡§π‡§æ ‡§π‡•Å ‡•§ ‡§Ü‡§™ ‡§á‡§∏ ‡§ï‡•á‡§¨‡§≤ ‡§∏‡•á 15watt ‡§ï‡§æ  ‡§ö‡§æ‡§∞‡•ç‡§ú‡§∞ ‡§â‡§∏‡•á ‡§ï‡§∞ ‡§∏‡§ï‡§§‡•á ‡§π‡•à (‡§á‡§∏‡§∏‡•á ‡§ú‡§°‡§º‡§æ ‡§®‡§π‡•Ä) ‡•§ ‡§≤‡•ã‡§ï‡§≤ ‡§¨‡§æ‡§ú‡§æ‡§∞ ‡§Æ‡•á‡§Ç ‡§á‡§∏‡§ï‡§æ ‡§¶‡§æ

KeyboardInterrupt: 

In [None]:
len(df_final['review_content'])

In [None]:
def has_emoji(text):
    return bool(demoji.findall(text))

In [None]:
# Drop reviews that are not needed (https, empty strings, emojis)
review_content_clean = df_final[~df_final["review_content"].str.contains("https:")]
review_content_clean_1 = review_content_clean[~review_content_clean["review_content"].eq("")]
review_content_clean_2 = review_content_clean_1[~review_content_clean_1["review_content"].apply(has_emoji)]
review_content_clean_2.reset_index(drop=True, inplace=True)
print(len(review_content_clean))
print(len(review_content_clean_1))
print(len(review_content_clean_2))

In [None]:
#review_content_clean = review_content_clean.reset_index(drop=True)

In [173]:
review_content_clean_translated = review_content_clean_2
review_content_clean_translated.tail()
review_content_clean_translated.to_csv('review_clean_translated_cat_split.csv')

## ML Standard Text Preprocessing 

In [5]:
df_reviews.head()

Unnamed: 0,review_content
0,Looks durable Charging is fine tooNo complains
1,"Charging is really fast, good product."
2,Till now satisfied with the quality.
3,This is a good product . The charging speed is...
4,"Good quality, would recommend"


In [6]:
#review_str = [x for x in df_reviews['review_content'] if type(x) == str]
#review_str[:10]
df_reviews_dropna = df_reviews.dropna()

In [7]:
def cleaning_ml_old(sentence):
    
    sentence = sentence.strip() # remove whitespaces
    sentence = sentence.lower() # lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) # remove numbers
    
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized) # formed back the sentences
    
    return cleaned_sentence

In [8]:
review_content_clean_old = df_reviews_dropna['review_content'].apply(cleaning_ml_old)
review_content_clean_old.head()

0              look durable charge fine toono complain
1                      charge really fast good product
2                                 till satisfy quality
3    good product charge speed slower original ipho...
4                         good quality would recommend
Name: review_content, dtype: object

In [9]:
def cleaning_ml(sentence):
    sentence = sentence.strip() # remove whitespaces
    sentence = sentence.lower() # lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) # remove numbers
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenize
    stop_words = set(stopwords.words('english')) ## define stopwords
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]
    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v")
        for word in tokenized_sentence_cleaned
    ]
    return lemmatized  # Return a list of tokenized words

In [10]:
#review_content_clean_df["review_content_clean"] = review_content_clean_df['review_content'].apply(lambda x:cleaning_ml(x))

review_content_clean = df_reviews_dropna['review_content'].apply(cleaning_ml)
review_content_clean.head()

0       [look, durable, charge, fine, toono, complain]
1                [charge, really, fast, good, product]
2                             [till, satisfy, quality]
3    [good, product, charge, speed, slower, origina...
4                    [good, quality, would, recommend]
Name: review_content, dtype: object

In [11]:
type(review_content_clean_old)

pandas.core.series.Series

# Tfidf Vectorizing

In [10]:
def cleaning_ml_full(sentence):
    
    sentence = sentence.strip() # remove whitespaces
    sentence = sentence.lower() # lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) # remove numbers
    
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized) # formed back the sentences
    
    return cleaned_sentence

In [24]:
# Test slicing df with sentiment 4 (to series)
df_sentiment_4 = pd.Series(df_sentiment[df_sentiment['Sentiment'] == 4]['review_content'])
df_sentiment_4

0           Looks durable Charging is fine tooNo complains
1                   Charging is really fast, good product.
2                     Till now satisfied with the quality.
3        This is a good product . The charging speed is...
4                            Good quality, would recommend
                               ...                        
10992                                         Nice product
10993                                         Nice product
11000    It does it job perfectly..only issue is temp c...
11006                           Recommend work as expected
11007                                      Its easy tp use
Name: review_content, Length: 4135, dtype: object

In [23]:
vectorizer = TfidfVectorizer()

vectorized_reviews = vectorizer.fit_transform(df_sentiment_4)

vectorized_reviews = pd.DataFrame(
    vectorized_reviews.toarray(), 
    columns = vectorizer.get_feature_names_out()
)

vectorized_reviews

Unnamed: 0,000,01,02,04,06,07,0enjoy,10,100,1000,...,‡Æ®‡Æ©,‡Æ©‡Æ§,‡Æ≥‡Æ§,ùóîùó±ùóµùó≤ùòÄùó∂ùóºùóª,ùóïùòÇùó∂ùóπùó±,ùó§ùòÇùóÆùóπùó∂ùòÅùòÜ,ùó¶ùóºùóπùòÇùòÅùó∂ùóºùóª,ùó≥ùóºùóø,ùü±i,ùü±you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modelling: Latent Dirichlet Allocation

In [32]:
# Instantiate the LDA 
n_components = 3
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_reviews)

In [33]:
# Transform LDA
reviews_topic = lda_model.transform(vectorized_reviews)

In [34]:
topic_list = pd.DataFrame(
    lda_model.components_, 
    columns = vectorizer.get_feature_names_out()
)

topic_list

Unnamed: 0,000,01,02,04,06,07,0enjoy,10,100,1000,...,‡Æ®‡Æ©,‡Æ©‡Æ§,‡Æ≥‡Æ§,ùóîùó±ùóµùó≤ùòÄùó∂ùóºùóª,ùóïùòÇùó∂ùóπùó±,ùó§ùòÇùóÆùóπùó∂ùòÅùòÜ,ùó¶ùóºùóπùòÇùòÅùó∂ùóºùóª,ùó≥ùóºùóø,ùü±i,ùü±you
0,0.334233,0.334049,0.350656,0.355512,0.334609,0.355512,0.335254,0.335833,0.337593,0.336909,...,0.338177,0.338177,0.338177,0.334832,0.33465,0.33465,0.33465,0.33465,0.33465,0.33465
1,0.333856,0.333831,0.33419,0.333557,0.334133,0.333557,0.334144,0.338236,0.353054,0.336121,...,0.336096,0.336096,0.336096,0.334279,0.334165,0.334165,0.334165,0.334165,0.334165,0.334165
2,0.395276,0.38124,0.427127,0.348213,0.414822,0.348213,0.410809,7.250108,5.626141,0.885004,...,1.27441,1.27441,1.27441,0.568265,0.449874,0.449874,0.449874,0.449874,0.449874,0.449874


In [41]:
def print_topics(lda_model, vectorizer, top_words):
    # 1. TOPIC MIXTURE OF WORDS FOR EACH TOPIC
    topic_mixture = pd.DataFrame(
        lda_model.components_,
        columns = vectorizer.get_feature_names_out()
    )
    
    # 2. FINDING THE TOP WORDS FOR EACH TOPIC
    ## Number of topics
    n_components = topic_mixture.shape[0]

    ## Top words for each topic
    for topic in range(n_components):
        print("-"*10)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")

        topic_df = topic_mixture.iloc[topic]\
            .sort_values(ascending = False).head(top_words)
        
        print(round(topic_df,3))

In [36]:
print_topics(lda_model, vectorizer, 5)

----------
For topic 0, here are the the top 5 words with weights:
print       5.029
finger      4.988
microusb    3.799
speedy      3.740
simpla      3.546
Name: 0, dtype: float64
----------
For topic 1, here are the the top 5 words with weights:
good       717.210
product    353.008
nice       296.098
one         39.699
working     38.009
Name: 1, dtype: float64
----------
For topic 2, here are the the top 5 words with weights:
is     228.017
the    219.017
it     195.684
for    168.579
and    167.686
Name: 2, dtype: float64


# Function to show topics

In [23]:
# Read master df
file_path_cat = "../raw_data/review_clean_translated_cat_split.csv"
df_cat = pd.read_csv(file_path_cat)

In [47]:
len(df_cat)

11022

In [25]:
df_cat = df_cat.dropna(subset=['review_content'])

In [26]:
df_cat.to_csv('reviews_df_20230926.csv')

In [27]:
len(df_cat)

11018

In [15]:
df_cat_split_5 = df_cat.groupby('Split_5').count()
df_cat_split_5
len(df_cat_split_5)

56

In [16]:
df_cat_split_1 = df_cat.groupby('Split_1').count()
df_cat_split_1
len(df_cat_split_1)

9

In [17]:
df_cat_split_6 = df_cat.groupby('Split_6').count()
df_cat_split_6
len(df_cat_split_6)

14

In [18]:
df_cat_split_5_dropna = df_cat.dropna('

SyntaxError: unterminated string literal (detected at line 1) (3808102536.py, line 1)

In [19]:
product_list = []

for index, row in df_cat.iterrows():
    if not pd.isna(row['Split_7']):
        product_list.append(row['Split_7'])
    elif not pd.isna(row['Split_6']):
        product_list.append(row['Split_6'])
    elif not pd.isna(row['Split_5']):
        product_list.append(row['Split_5'])
    else:
        product_list.append(row['Split_4'])
    
df_cat['product_category'] = product_list
    

In [20]:
from collections import OrderedDict

product_unique_list = list(OrderedDict.fromkeys(product_list))
len(product_unique_list)

173

In [21]:
product_unique_list

['USBCables',
 'WirelessUSBAdapters',
 'HDMICables',
 'SmartTelevisions',
 'RemoteControls',
 'StandardTelevisions',
 'TVWall&CeilingMounts',
 'RCACables',
 'Mounts',
 'OpticalCables',
 nan,
 'Adapters',
 'SatelliteReceivers',
 'DVICables',
 'SpeakerCables',
 'StreamingClients',
 'TowerSpeakers',
 '3DGlasses',
 'PowerBanks',
 'Smartphones',
 'MicroSD',
 'BasicMobiles',
 'In-Ear',
 'AutomobileChargers',
 'Cradles',
 'WallChargers',
 'OTGAdapters',
 'Tripods',
 'SelfieSticks',
 'Stands',
 'CableConnectionProtectors',
 'D√©cor',
 'ScreenProtectors',
 'StylusPens',
 'Bedstand&DeskMounts',
 'BasicCases',
 'HandlebarMounts',
 'On-Ear',
 'CameraPrivacyCovers',
 'PhoneCharms',
 'Shower&WallMounts',
 'Mice',
 'GraphicTablets',
 'Lapdesks',
 'NotebookComputerStands',
 'Keyboards',
 'GelInkRollerballPens',
 'Tape',
 'Keyboard&MouseSets',
 'Tabletop&TravelTripods',
 'Scientific',
 'TripodLegs',
 'InkjetInkCartridges',
 'DustCovers',
 'GamingMice',
 'Paints',
 'MousePads',
 'Macro&RinglightFlashes'

In [64]:
df_hist = df_cat.groupby('product_category')['product_id'].count().sort_values(ascending=False)

In [71]:
df_hist[:50]

product_category
USBCables                      1729
Smartphones                     504
SmartTelevisions                482
In-Ear                          442
RemoteControls                  366
MixerGrinders                   206
DryIrons                        193
Mice                            187
HDMICables                      174
InstantWaterHeaters             165
LintShavers                     149
WirelessUSBAdapters             143
ElectricHeaters                 137
FanHeaters                      135
ElectricKettles                 134
HandBlenders                    132
Lapdesks                        121
WallChargers                    110
SteamIrons                      104
Kettle&ToasterSets              103
MicroSD                         101
StorageWaterHeaters              97
LaundryBaskets                   94
ScreenProtectors                 91
WaterPurifierAccessories         90
PowerBanks                       89
SandwichMakers                   88
WaterFilter

In [26]:
# Test add sentiment column
df_cat['sentiment'] = '3'

In [6]:
# df with sentiments
file_path_final = "../raw_data/reviews_analyzed_cat_sentiment.csv"
df_final_sent = pd.read_csv(file_path_final)

In [30]:
df = df_cat.loc[(df_cat['product_category'] == 'USBCables') & (df_cat['sentiment'] == 3)]

In [35]:
filtered_df = df_cat.loc[(df_cat['product_category'] == 'USBCables') & (df_cat['sentiment'] == '3')]
filtered_df

Unnamed: 0.1,Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,...,review_content,Split_1,Split_2,Split_3,Split_4,Split_5,Split_6,Split_7,product_category,sentiment
0,0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,...,Looks durable Charging is fine tooNo complains,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
1,1,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,...,"Charging is really fast, good product.",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
2,2,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,...,Till now satisfied with the quality.,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
3,3,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,...,This is a good product . The charging speed is...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
4,4,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ399,"‚Çπ1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,...,"Good quality, would recommend",Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7665,7665,B0BMXMLSMM,Lapster 65W compatible for OnePlus Dash Warp C...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ199,‚Çπ999,80%,4.5,127,-1 meter type c to c cable fast charging cable...,...,Cable is good and support 65 w fast charging. ...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
7666,7666,B0BMXMLSMM,Lapster 65W compatible for OnePlus Dash Warp C...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ199,‚Çπ999,80%,4.5,127,-1 meter type c to c cable fast charging cable...,...,The product worked fine for me. Got this at g...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
7667,7667,B0BMXMLSMM,Lapster 65W compatible for OnePlus Dash Warp C...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ199,‚Çπ999,80%,4.5,127,-1 meter type c to c cable fast charging cable...,...,Flexible and toughness is best workAnd also in...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3
7668,7668,B0BMXMLSMM,Lapster 65W compatible for OnePlus Dash Warp C...,"['Computers&Accessories', 'Accessories&Periphe...",‚Çπ199,‚Çπ999,80%,4.5,127,-1 meter type c to c cable fast charging cable...,...,I bought this cable for my nothing phone 1 it'...,Computers&Accessories,Accessories&Peripherals,Cables&Accessories,Cables,USBCables,,,USBCables,3


In [39]:
for i in product_unique_list:
    for j in range(1,5,1):
        df = df_cat.loc[(df_cat['product_category'] == i) & (df_cat['sentiment'] == j)]
        lda_model_apply(df, 5)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [38]:
def lda_model_apply(df, n_components):

    df.dropna()
    review_content = df['review_content'].apply(cleaning_ml_full)
    
    # Instantiate the tfidf vectorizer 
    vectorizer = TfidfVectorizer()

    vectorized_reviews = vectorizer.fit_transform(review_content)
    
    vectorized_reviews = pd.DataFrame(
        vectorized_reviews.toarray(), 
        columns = vectorizer.get_feature_names_out()
    )
        
    # Instantiate the LDA 
    lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)
    
    # Fit the LDA on the vectorized documents
    lda_model.fit(vectorized_reviews)
    
    # Transform LDA
    reviews_topic = lda_model.transform(vectorized_reviews)
    
    topic_list = pd.DataFrame(
        lda_model.components_, 
        columns = vectorizer.get_feature_names_out()
    )
    
    topic_list
    
    print_topics(lda_model, vectorizer, 5)

In [42]:
df = df_cat.loc[(df_cat['product_category'] == 'USBCables') & (df_cat['sentiment'] == '3')]
lda_model_apply(df, 5)

----------
For topic 0, here are the the top 5 words with weights:
nice       69.544
product    40.091
charge     39.519
good       30.903
value      30.540
Name: 0, dtype: float64
----------
For topic 1, here are the the top 5 words with weights:
cable     20.853
issue     13.872
charge    12.438
use       12.106
work      10.264
Name: 1, dtype: float64
----------
For topic 2, here are the the top 5 words with weights:
work     36.626
worth    24.485
fine     22.154
ok       21.850
buy      19.150
Name: 2, dtype: float64
----------
For topic 3, here are the the top 5 words with weights:
good       178.577
product     75.133
quality     29.054
like        26.960
charge      23.365
Name: 3, dtype: float64
----------
For topic 4, here are the the top 5 words with weights:
best         29.295
charge       21.782
long         20.587
cable        19.170
recommend    15.198
Name: 4, dtype: float64


# Key Insights from Summarizer 

In [3]:
# Raw File
file_path = "../raw_data/summary_keywords_df.csv"
df_summary = pd.read_csv(file_path)

In [5]:
df_summary.head()

Unnamed: 0,product_id,product_name,product_category,summary,keywords,sentiment
0,B002PD61Y4,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,WirelessUSBAdapters,Good quality tool from d linkWiFi signal is go...,"['dvr works', 'wifi supporting', 'jio wifi', '...",4
1,B002SZEOLG,TP-Link Nano USB WiFi Dongle 150Mbps High Gain...,WirelessUSBAdapters,The wifi dongle is a simple plug & play device...,"['usb tethering', 'best adapter', 'wifi dongle...",4
2,B003B00484,Duracell Plus AAA Rechargeable Batteries (750 ...,RechargeableBatteries,Soldering the connections was bit tricky but i...,"['trimmer battery', 'qt4005 trimmer', 'expensi...",4
3,B003L62T7W,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...",Mice,The best thing about this mouse is that u can ...,"['mouse quality', 'best mouse', 'mouse budget'...",5
4,B004IO5BMQ,"Logitech M235 Wireless Mouse, 1000 DPI Optical...",Mice,"Good product, but too smaller than the regular...","['sized battery', 'mouse easy', 'mouse', 'mous...",4


In [11]:
df_summary.groupby('product_category').count().sort_values(by='product_id', ascending=False)

Unnamed: 0_level_0,product_id,product_name,summary,keywords,sentiment
product_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
USBCables,161,161,161,161,161
Smartphones,68,68,68,68,68
SmartWatches,62,62,62,62,62
SmartTelevisions,60,60,60,60,60
In-Ear,51,51,51,51,51
...,...,...,...,...,...
InkjetInkRefills&Kits,1,1,1,1,1
HandlebarMounts,1,1,1,1,1
HandheldBags,1,1,1,1,1
FountainPens,1,1,1,1,1


In [21]:
df_summary[df_summary['product_category'] == 'USBCables'].groupby('sentiment').count()

Unnamed: 0_level_0,product_id,product_name,product_category,summary,keywords
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,6,6,6,6,6
2,10,10,10,10,10
3,31,31,31,31,31
4,62,62,62,62,62
5,52,52,52,52,52


In [27]:
pd.DataFrame(df_summary[(df_summary['product_category'] == 'USBCables') & (df_summary['sentiment'] == 5)]['keywords']).to_csv('test_summary.csv')