In [39]:
!pip3 install nltk
!pip3 install googletrans==3.1.0a0



# Data Preperation 

## Import Packages and Data Summary

In [109]:
import pandas as pd
from googletrans import Translator, constants

# Packages for data preprocessing
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# Packages for modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
file_path = "../raw_data/amazon.csv"
df = pd.read_csv(file_path)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

In [12]:
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [13]:
df['review_content']

0       Looks durable Charging is fine tooNo complains...
1       I ordered this cable to connect my phone to An...
2       Not quite durable and sturdy,https://m.media-a...
3       Good product,long wire,Charges good,Nice,I bou...
4       Bought this instead of original apple, does th...
                              ...                        
1460    I received product without spanner,Excellent p...
1461    ok,got everything as mentioned but the measuri...
1462    plastic but cool body ,u have to find sturdy s...
1463    I have installed this in my kitchen working fi...
1464    It does it job perfectly..only issue is temp c...
Name: review_content, Length: 1465, dtype: object

In [14]:
df.isna().sum()

product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           2
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64

## Data Cleaning (code by Arthur)

In [19]:
reviewers_df = df[["user_id","user_name"]]

In [17]:
# Splitting user ID
reviewers_user_id_split = reviewers_df["user_id"].str.split(",", expand=False).explode()

In [18]:
# To see roughly how many revies we expect
reviewers_user_id_split.shape

(11503,)

In [28]:
# Splitting review content
review_content_split = df["review_content"].str.split(",(?!\s)", expand=False).explode()
review_content_clean = review_content_split.reset_index(drop=True)
review_content_clean[10]

'Value for money, with extra length👍'

In [29]:
review_content_clean.shape

(12138,)

In [31]:
review_content_clean_df = pd.DataFrame(review_content_clean)
review_content_clean_df

Unnamed: 0,review_content
0,Looks durable Charging is fine tooNo complains
1,"Charging is really fast, good product."
2,Till now satisfied with the quality.
3,This is a good product . The charging speed is...
4,"Good quality, would recommend"
...,...
12133,Very good product
12134,"This is a pretty powerful sandwich maker, for ..."
12135,"बोरोसिल ब्रांड का यह ""सेंडविच मेकर"" देखने में ..."
12136,Recommend work as expected


## Translating non-English Reviews

In [53]:
review_content_clean[12135]

'बोरोसिल ब्रांड का यह "सेंडविच मेकर" देखने में तो अच्छा लगता है मगर इसकी बिल्ड क्वालिटी अच्छी नहीं है।  यह लगभग Rs 3000 के आसपास आता है।  इस प्रकार की बिल्ड क्वालिटी ₹2000 के करीब मिल जाती है तो कोई क्यों ₹1000 अधिक भुगतान करें।  पहले मैंने इसे review देखने के बाद ऑर्डर किया था लेकिन जब घर पर डिलीवरी होने के पश्चात  unboxing करके देखा तो इसकी बिल्ड क्वालिटी कुछ खास नहीं लगी इसलिए अपने पैसे बचाने के लिए मैंने इसे वापस भेज दिया। मैंने इसकी पैकेजिंग, मैनुअल और सैंडविच मेकर की फोटोग्राफ आप सभी से शेयर की है।  आप स्वयं देख के अनुमान लगा सकते हैं।'

In [95]:
review_content_clean_df["review_content"]

0           Looks durable Charging is fine tooNo complains
1                   Charging is really fast, good product.
2                     Till now satisfied with the quality.
3        This is a good product . The charging speed is...
4                            Good quality, would recommend
                               ...                        
12133                                    Very good product
12134    This is a pretty powerful sandwich maker, for ...
12135    बोरोसिल ब्रांड का यह "सेंडविच मेकर" देखने में ...
12136                           Recommend work as expected
12137                                      Its easy tp use
Name: review_content, Length: 12138, dtype: object

In [96]:
translator = Translator()

def translate_batch(text_batch):
    translations = translator.translate(text_batch, dest="en")
    return [translation.text for translation in translations]

In [97]:
batch_size = 100
for i in range(0, len(review_content_clean_df), batch_size):
    batch = review_content_clean_df["review_content"].iloc[i:i+batch_size].tolist()
    translated_batch = translate_batch(batch)
    review_content_clean_df["review_content"].iloc[i:i+batch_size] = translated_batch

ReadTimeout: The read operation timed out

In [92]:
review_content_clean_translated = review_content_clean
review_content_clean_translated

['Looks durable Charging is fine tooNo complains',
 'Charging is really fast, good product.',
 'Till now satisfied with the quality.',
 'This is a good product . The charging speed is slower than the original iPhone cable',
 'Good quality, would recommend',
 'https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/81---F1ZgHL._SY88.jpg',
 'Product had worked well till date and was having no issue.Cable is also sturdy enough...Have asked for replacement and company is doing the same...',
 'Value for money',
 "I ordered this cable to connect my phone to Android Auto of car. The cable is really strong and the connection ports are really well made. I already has a Micro USB cable from Ambrane and it's still in good shape. I connected my phone to the car using the cable and it got connected well and no issues. I also connected it to the charging port and yes it has Fast Charging support.",
 "It quality is good at this price and the main thing is that i didn't ever thought that this cabl

## Text Preprocessing

In [103]:
def cleaning_ml(sentence):
    
    sentence = sentence.strip() # remove whitespaces
    sentence = sentence.lower() # lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) # remove numbers
    
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [105]:
#review_content_clean_df["review_content_clean"] = review_content_clean_df['review_content'].apply(lambda x:cleaning_ml(x))

review_content_clean = review_content_clean_df['review_content'].apply(cleaning_ml)
review_content_clean.head()

0              look durable charge fine toono complain
1                      charge really fast good product
2                                 till satisfy quality
3    good product charge speed slower original ipho...
4                         good quality would recommend
Name: review_content, dtype: object

# Vectorizing

In [108]:
vectorizer = TfidfVectorizer()

vectorized_reviews = vectorizer.fit_transform(review_content_clean)
vectorized_reviews = pd.DataFrame(
    vectorized_reviews.toarray(), 
    columns = vectorizer.get_feature_names_out()
)

vectorized_reviews

Unnamed: 0,aa,aaa,aac,aacha,aage,aame,aamtech,aapka,aapko,aasized,...,सर,सरस,सह,हतर,हम,ಉತ,ತಮ,𝗔𝗱𝗵𝗲𝘀𝗶𝗼𝗻,𝗤𝘂𝗮𝗹𝗶𝘁𝘆,𝗳𝗼𝗿
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modelling: Latent Dirichlet Allocation

In [110]:
# Instantiate the LDA 
n_components = 2
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_reviews)

In [112]:
# Transform
reviews_topic = lda_model.transform(vectorized_reviews)

In [113]:
topic_list = pd.DataFrame(
    lda_model.components_, 
    columns = vectorizer.get_feature_names_out()
)

topic_list

Unnamed: 0,aa,aaa,aac,aacha,aage,aame,aamtech,aapka,aapko,aasized,...,सर,सरस,सह,हतर,हम,ಉತ,ತಮ,𝗔𝗱𝗵𝗲𝘀𝗶𝗼𝗻,𝗤𝘂𝗮𝗹𝗶𝘁𝘆,𝗳𝗼𝗿
0,0.59127,0.517706,0.503302,0.986058,0.502261,0.507443,0.515697,0.749198,0.663594,0.505122,...,0.653003,1.004744,1.114691,0.665388,1.06115,1.201117,1.201117,0.509319,0.509319,0.509319
1,3.762814,1.645782,0.597912,0.510257,0.548756,0.629296,0.726004,0.503445,0.503088,0.627817,...,0.501311,0.504089,0.504681,0.501394,0.503842,0.50599,0.50599,0.657568,0.657568,0.657568


In [114]:
def print_topics(lda_model, vectorizer, top_words):
    # 1. TOPIC MIXTURE OF WORDS FOR EACH TOPIC
    topic_mixture = pd.DataFrame(
        lda_model.components_,
        columns = vectorizer.get_feature_names_out()
    )
    
    # 2. FINDING THE TOP WORDS FOR EACH TOPIC
    ## Number of topics
    n_components = topic_mixture.shape[0]

    ## Top words for each topic
    for topic in range(n_components):
        print("-"*10)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")

        topic_df = topic_mixture.iloc[topic]\
            .sort_values(ascending = False).head(top_words)
        
        print(round(topic_df,3))

In [115]:
print_topics(lda_model, vectorizer, 5)

----------
For topic 0, here are the the top 5 words with weights:
good       1360.403
product     808.719
nice        456.023
quality     264.877
money       238.418
Name: 0, dtype: float64
----------
For topic 1, here are the the top 5 words with weights:
work      323.154
use       278.608
charge    227.453
cable     207.980
buy       200.631
Name: 1, dtype: float64
