In [None]:
import pandas as pd
import pprint
import spacy
import matplotlib.pyplot as plt

import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import pyLDAvis.gensim_models
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#!python3 -m spacy download en_core_web_sm

In [None]:
# Read the pre-processed review corpus from disk; the topic-modelling and NER
# cells below all work from its 'clean_reviews' column.
reviews_csv = '../../data/processed/cleaned_reviews.csv'
df = pd.read_csv(reviews_csv)

# Preview the first rows as a quick sanity check of the load.
df.head()

### LSA

In [None]:
# TF-IDF settings for the LSA document-term matrix.
tfidf_options = {
    'stop_words': 'english',  # scikit-learn's built-in English stop-word list
    'max_features': 1000,     # keep top 1000 terms
    'max_df': 0.5,            # skip terms that appear in more than half of the reviews
    'smooth_idf': True,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
X = vectorizer.fit_transform(df['clean_reviews'])

# Shape of the document-term matrix: (number of reviews, number of terms).
X.shape

In [None]:
# LSA step: factorise the TF-IDF matrix with truncated SVD so documents and
# terms are represented as vectors over 10 latent components (topics).
svd_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)

# Number of fitted components (one row of `components_` per topic).
len(svd_model.components_)

In [None]:
# Show the strongest terms of every LSA topic.
N_TOP_TERMS = 7  # number of (term, weight) pairs printed per topic
terms = vectorizer.get_feature_names_out()

for i, comp in enumerate(svd_model.components_):
    # Pair every vocabulary term with its weight in this component and keep
    # the N_TOP_TERMS pairs with the largest weights.
    sorted_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:N_TOP_TERMS]
    print(f"Topic {i}: ")
    print(sorted_terms)

### LDA (1) using sklearn

In [None]:
# Bag-of-words counts for LDA. Terms that occur in more than 95% of the
# reviews, or in fewer than 2 reviews, are left out of the vocabulary.
vectorizer = CountVectorizer(max_df=0.95, min_df=2)

# fit and transform the clean text data
X = vectorizer.fit_transform(df['clean_reviews'])

# Sparsicity: the percentage of non-zero cells in the document-term matrix.
# It is computed straight from the sparse matrix; materialising it with
# X.todense() would allocate n_documents * n_terms cells only to count them.
n_cells = X.shape[0] * X.shape[1]
print("Sparsicity: ", (X.count_nonzero() / n_cells) * 100, "%")

# create an LDA object and fit the data
lda = LatentDirichletAllocation(n_components=20, random_state=42)
lda.fit(X)

# print the top 20 words in each topic
N_TOP_WORDS = 20
# Column order of X; same accessor the LSA cell uses for its vocabulary.
feature_names = vectorizer.get_feature_names_out()
topic_list = []
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}:")
    # argsort is ascending, so reverse it and keep the first N_TOP_WORDS ids.
    top_word_ids = topic.argsort()[::-1][:N_TOP_WORDS]
    topic_complete = ", ".join(feature_names[i] for i in top_word_ids)
    print(topic_complete)
    topic_list.append(topic_complete)

In [None]:
# Log-likelihood of the corpus under the fitted model: higher is better.
log_likelihood = lda.score(X)
print("Log Likelihood: ", log_likelihood)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
perplexity = lda.perplexity(X)
print("Perplexity: ", perplexity)

# Dump every parameter the model was configured with.
pprint.pprint(lda.get_params())

A model with a higher log-likelihood and a lower perplexity, where perplexity = exp(-1 × log-likelihood per word), is considered better.
That said, perplexity may not be the best measure for evaluating topic models, because it does not take into account the context of words or the semantic associations between them.

In [None]:
# Search grid: number of topics x learning decay (5 x 3 = 15 candidates).
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Base estimator for the search.
# - learning_method='online': `learning_decay` only controls the learning rate
#   of the online update; with scikit-learn's default batch method it is never
#   used, so searching over it would compare identical models.
# - random_state=42 (same seed as the LDA fitted earlier) makes the scores
#   reproducible across kernel restarts.
lda = LatentDirichletAllocation(learning_method='online', random_state=42)

# Exhaustive, cross-validated search over the grid. No `scoring` is given, so
# candidates are ranked with the estimator's own score() (log-likelihood).
model = GridSearchCV(lda, param_grid=search_params)

# Run the search on the count matrix X built for the sklearn LDA above.
model.fit(X)

In [None]:
# Estimator for the winning parameter combination of the grid search.
best_lda_model = model.best_estimator_

# Headline numbers for the winner: its parameters, its grid-search score
# (log-likelihood) and its perplexity on the full count matrix.
grid_summary = (
    ("Best Model's Params: ", model.best_params_),
    ("Best Log Likelihood Score: ", model.best_score_),
    ("Model Perplexity: ", best_lda_model.perplexity(X)),
)
for label, value in grid_summary:
    print(label, value)

In [None]:
# Read the searched values back from the fitted grid search instead of
# re-typing them, so the plot cannot drift out of sync with the grid.
n_topics = model.param_grid['n_components']
decay_values = model.param_grid['learning_decay']

cv_params = model.cv_results_['params']
cv_scores = model.cv_results_['mean_test_score']

# One curve per learning decay: the rounded mean cross-validation score of
# every candidate that used that decay.
# NOTE(review): assumes candidates for a given decay appear in cv_results_ in
# the same order as `n_components` in the grid - confirm if the grid changes.
log_likelihoods_by_decay = {
    decay: [
        round(score)
        for params, score in zip(cv_params, cv_scores)
        if params['learning_decay'] == decay
    ]
    for decay in decay_values
}

# Show graph
plt.figure(figsize=(12, 8))
for decay, scores in log_likelihoods_by_decay.items():
    plt.plot(n_topics, scores, label=str(decay))
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

Based on the grid-search scores plotted above, it can be concluded that hyperparameter tuning has not been effective.

### LDA (2) using gensim

In [None]:
# The stop-word list is a separate NLTK resource from 'punkt'; fetch it here
# so stopwords.words() does not raise LookupError on a fresh environment.
nltk.download('stopwords', quiet=True)

# create a list of tokenized reviews without stop words
stop_words = set(stopwords.words('english'))
tokenized_reviews = []
for review in df['clean_reviews']:
    # Lower-case each token once, then drop the stop words.
    lowered_tokens = (token.lower() for token in word_tokenize(review))
    tokenized_reviews.append([token for token in lowered_tokens if token not in stop_words])

# create the id2word dictionary (token <-> integer id mapping)
id2word = corpora.Dictionary(tokenized_reviews)

# create the corpus: one bag-of-words (token id, count) list per review
corpus = [id2word.doc2bow(tokens) for tokens in tokenized_reviews]

# create the LDA model; the fixed seed keeps the topics reproducible between runs
num_topics = 10
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

# visualize the topics using pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

### NER (Named Entity Recognition)

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; the entity-extraction functions
# below call it as `nlp(text)` and read the resulting `doc.ents`.
# The model package must already be installed (see the commented-out
# `spacy download` command in the imports cell).
nlp = spacy.load("en_core_web_sm")

#### Pet products

In [None]:
def extract_pet_entities(text, keywords=("pet", "dog", "cat")):
    """Return the text of every named entity in `text` that looks pet-related.

    An entity is kept when its label is "ANIMAL" or when one of its tokens,
    by lower-cased surface form or by lemma, equals one of `keywords`.
    Matching is done on whole tokens rather than substrings, so an entity
    such as "Category Five" is no longer picked up because it contains "cat".
    Compound names (e.g. a brand that merely starts with "pet") therefore
    only match if they are listed in `keywords`.

    NOTE(review): the stock en_core_web_sm label scheme is not expected to
    include "ANIMAL" - confirm with nlp.get_pipe("ner").labels. The check is
    kept so that a pipeline that does emit it keeps working.

    Args:
        text: Review text. Non-string values (e.g. NaN from an empty CSV
            cell) yield an empty list instead of raising inside spaCy.
        keywords: Lower-case words that mark an entity as pet-related.

    Returns:
        list[str]: Matching entity texts, in document order, with duplicates.
    """
    if not isinstance(text, str):
        return []

    pet_keywords = {keyword.lower() for keyword in keywords}
    doc = nlp(text)
    pet_entities = []
    for entity in doc.ents:
        mentions_pet = any(
            token.lower_ in pet_keywords or token.lemma_.lower() in pet_keywords
            for token in entity
        )
        if entity.label_ == "ANIMAL" or mentions_pet:
            pet_entities.append(entity.text)
    return pet_entities

# Run the extractor over every review and store the per-review lists in a
# temporary 'pet_entities' column.
df['pet_entities'] = df['clean_reviews'].apply(extract_pet_entities)

# Flatten the per-review lists into the set of distinct entity strings and show it.
pet_entities = {
    entity
    for entities_in_review in df['pet_entities']
    for entity in entities_in_review
}
print(pet_entities)

In [None]:
# Number of distinct pet-related entity strings collected in `pet_entities`.
print(len(pet_entities))

In [None]:
# Remove the temporary 'pet_entities' column now that its values have been
# collected. errors='ignore' plus rebinding (instead of inplace=True) keeps
# the cell re-runnable: a second run no longer raises KeyError.
df = df.drop(columns=['pet_entities'], errors='ignore')

In [None]:
def extract_product_entities(text, keywords=("coffee", "tea", "caffeine")):
    """Return the text of every named entity in `text` that looks product-related.

    An entity is kept when its label is "PRODUCT" or when one of its tokens,
    by lower-cased surface form or by lemma, equals one of `keywords`.
    Matching is done on whole tokens rather than substrings, so an entity
    such as "Steak House" is no longer picked up because it contains "tea".
    Compound names therefore only match if they are listed in `keywords`.

    Args:
        text: Review text. Non-string values (e.g. NaN from an empty CSV
            cell) yield an empty list instead of raising inside spaCy.
        keywords: Lower-case words that mark an entity as product-related.

    Returns:
        list[str]: Matching entity texts, in document order, with duplicates.
    """
    if not isinstance(text, str):
        return []

    product_keywords = {keyword.lower() for keyword in keywords}
    doc = nlp(text)
    product_entities = []
    for entity in doc.ents:
        mentions_product = any(
            token.lower_ in product_keywords or token.lemma_.lower() in product_keywords
            for token in entity
        )
        if entity.label_ == "PRODUCT" or mentions_product:
            product_entities.append(entity.text)
    return product_entities


Next steps:
- Explore which brand(s) these reviews are about.
- Given the domain this dataset comes from, use transfer learning (fine-tuning a relevant pre-trained model) to improve the results.