**Import Data**

In [1]:
#read the reviews
import pandas as pd
df = pd.read_csv("scrape.csv")
df = df.dropna() # drop empty instance

In [2]:
# Drop unnamed columns (which is index from web scrapping)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head(10) # display some data samples

Unnamed: 0,product,review
0,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,"Love these good audio, great battery life, noi..."
1,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,Love these ear buds. I went from android to an...
2,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,"Sound quality is very good, base levels are am..."
3,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,The best sound out there noise canceling is su...
4,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,Best true wireless ear buds I h ever had! Amaz...
5,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,Excellent sound good quality fit in my ears ba...
6,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,These work perfect with mt samsung s22 ultra. ...
7,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,They have very good sound quality they also wo...
8,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,Love them!! Great for work and walking about. ...
9,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,Is good a good time to listening music when I ...


In [3]:
# Check data size
print("Number of instances:", len(df))

Number of instances: 52255


**Sentences Splitting**

In [4]:
#Sentence Splitting
from nltk.tokenize import sent_tokenize
from cleantext import clean

data = pd.DataFrame()

product = []
review = []

for index in df.index:
    sentences = sent_tokenize(df.loc[index, 'review'])
    for sentence in sentences:
        product.append(df.loc[index, 'product'])
        review.append(clean(sentence, no_emoji=True))

data['product'] = product
data['review'] = review

In [5]:
# check size of new dataset
print("Length of dataset after sentence splitting:", len(data))

Length of dataset after sentence splitting: 138850


**Import Modules**

In [6]:
# import library
import string
import emoji
import re
import nltk
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import spacy

import gensim.corpora as corpora
import gensim.models as models

from ast import literal_eval
from pprint import pprint

import pickle

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

**Data Preprocessing**

In [7]:
# Text preprocessing for aspect extraction
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    
    #1. Generating the list of words in the tweet (hastags and other punctuations removed)
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)
    
    #2. clean the number 
    text = re.sub(r'[0-9]', '', text)
    
    #3. lower the text
    text = text.lower()
    
    #4. conver the emoji to text form
    text = emoji.demojize(text)
    
    #5. remove punctuation 
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    #6. tokenize the text
    text = word_tokenize(text)
    
    #7. remove empty token
    text = [t for t in text if len(t) > 0]
    
    #8. remove non-alphabetical token
    text = [t for t in text if t.isalpha()]
    
    #9. replace the negation token
    replacer  = AntonymReplacer()
    text = replacer.replace_negations(text)
    
    #10. remove the stopwords
    text = [i for i in text if i not in stopwords]
    
    #11. lemmatize the text
    text = lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    
    return text


def lemmatization(sent, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(" ".join(sent)) 
    texts_out = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return texts_out

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())

        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []

        while i < l:
            word = sent[i]

            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])

                if ant:
                    words.append(ant)
                    i += 2
                    continue

            words.append(word)
            i += 1

        return words

In [8]:
# Visualize preprocessing sample
print(data.iloc[0][1])
print(type(data.iloc[0][1]))
print(preprocess(data.iloc[0][1]))

love these good audio, great battery life, noise cancellation is great.
<class 'str'>
['love', 'good', 'audio', 'great', 'battery', 'life', 'noise', 'cancellation', 'great']


  print(data.iloc[0][1])
  print(type(data.iloc[0][1]))
  print(preprocess(data.iloc[0][1]))


In [9]:
%%time

#Preprocess the input data
cleaned_text = []

for text in data["review"]:
    cleaned_text.append(preprocess(str(text)))

data["Clean"] = cleaned_text
data = data.dropna()

data.to_csv("preprocessed_data.csv")
print('Preprocessing done.')

Preprocessing done.
CPU times: total: 6min 22s
Wall time: 6min 23s


In [10]:
#Extract 100k from original dataset for BERTopic
pd.read_csv("preprocessed_data.csv").sample(n=100000).to_csv("extracted_data.csv", index=False)

**Import Preprocessed Data**

In [11]:
# import the preprocessed data
extracted_data = pd.read_csv("preprocessed_data.csv")

In [12]:
extracted_data = extracted_data.loc[:, ~extracted_data.columns.str.contains('^Unnamed')]
extracted_data.head()

Unnamed: 0,product,review,Clean
0,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,"love these good audio, great battery life, noi...","['love', 'good', 'audio', 'great', 'battery', ..."
1,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,i used to the original galaxy pro buds and the...,"['use', 'original', 'galaxy', 'pro', 'bud', 'n..."
2,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,love these ear buds.,"['love', 'ear', 'bud']"
3,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,"i went from android to an iphone, and they sti...","['go', 'android', 'iphone', 'still', 'work']"
4,Samsung Galaxy Buds 2 Pro True Wireless Earbud...,"sound quality is very good, base levels are am...","['sound', 'quality', 'good', 'base', 'level', ..."


**LDA Model Training**

In [13]:
%%time
#LDA model training

data_words = []
for x in extracted_data['Clean']:
    data_words.append(literal_eval(x))
    
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# number of topics
num_topics = 50
num_words = 10

# Build LDA model
lda_model1 =models.LdaMulticore(corpus=corpus,
                              id2word=id2word,
                              num_topics=num_topics)

CPU times: total: 7.41 s
Wall time: 25.6 s


**Extract Topics from LDA**

In [14]:
# extract topics from LDA and store in dataframe
topics = lda_model1.print_topics(num_topics=num_topics, num_words=num_words)
topic_list = []

for topic in topics:
    topic_list.append(topic[1])
    
df_topics = pd.DataFrame(topic_list, columns = ['topics'])

**Save Model**

In [15]:
df_topics.to_excel("LDA_topics.xlsx")
pickle.dump(lda_model1, open('LDA_model.model', 'wb'))#save lda model
pickle.dump(id2word, open('LDA_model.dict', 'wb'))#save dictionary
pickle.dump(corpus, open('LDA_model.corpus', 'wb'))#save corpus


**Visualize Results**

In [16]:
pprint(lda_model1.print_topics()) # print top topics 
doc_lda = lda_model1[corpus] # display proportions of topics

[(8,
  '0.092*"work" + 0.049*"use" + 0.042*"well" + 0.023*"bud" + 0.022*"ear" + '
  '0.018*"great" + 0.014*"really" + 0.013*"pair" + 0.011*"day" + 0.010*"good"'),
 (43,
  '0.069*"great" + 0.067*"fit" + 0.053*"ear" + 0.042*"sound" + 0.024*"nice" + '
  '0.022*"good" + 0.019*"bud" + 0.018*"quality" + 0.017*"love" + 0.015*"use"'),
 (3,
  '0.056*"headphone" + 0.044*"good" + 0.034*"work" + 0.022*"sound" + '
  '0.019*"ear" + 0.019*"great" + 0.017*"noise" + 0.014*"go" + 0.013*"well" + '
  '0.011*"really"'),
 (40,
  '0.070*"sound" + 0.050*"quality" + 0.050*"great" + 0.049*"good" + '
  '0.034*"use" + 0.033*"price" + 0.030*"easy" + 0.023*"perfect" + 0.018*"love" '
  '+ 0.015*"product"'),
 (36,
  '0.040*"good" + 0.028*"use" + 0.027*"bud" + 0.026*"sound" + 0.020*"well" + '
  '0.019*"fit" + 0.018*"charge" + 0.017*"case" + 0.015*"ear" + '
  '0.013*"quality"'),
 (9,
  '0.037*"great" + 0.023*"noise" + 0.021*"sound" + 0.017*"also" + '
  '0.016*"quality" + 0.016*"ear" + 0.015*"product" + 0.014*"well" + '

**Load Trained Model**

In [17]:
lda_model_file = 'LDA_model.model' 
loaded_lda_model = pickle.load(open(lda_model_file, 'rb')) #load trained model

loaded_dictionary = pickle.load(open('LDA_model.dict', 'rb')) #load dictionary 
loaded_corpus = pickle.load(open('LDA_model.corpus', 'rb')) # load corpus


**Visualize Results**

In [18]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(loaded_lda_model, loaded_corpus, loaded_dictionary, n_jobs = 1)

In [19]:
vis