In [None]:
import pandas as pd
import pprint
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
nltk.download('punkt')

import pyLDAvis.gensim_models
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#!python3 -m spacy download en_core_web_sm

from src.models.topic_modelling.LDA import LDAGensim
from src.models.topic_modelling.LSA import LSAModel
from src.data.preprocess import Preprocessor

In [None]:
# Run the project's Preprocessor on the raw reviews CSV and keep the frame it
# exposes as `clean_df`.
# NOTE(review): the next cell rebinds `df` from the processed CSV, so this
# assignment is overwritten — presumably `clean_csv()` is run here for the
# file it produces; confirm against src/data/preprocess.py.
preprocessor = Preprocessor("../../data/raw/reviews.csv")
preprocessor.clean_csv()
df = preprocessor.clean_df

In [None]:
# Load the cleaned reviews from disk (rebinds `df`) and preview the first rows.
# NOTE(review): presumably this is the file written by the preprocessing cell — confirm.
df = pd.read_csv('../../data/processed/cleaned_reviews.csv')
df.head()

In [None]:
def freq_words(x, terms = 30):
    """Plot a bar chart of the most frequent whitespace-separated words.

    Parameters
    ----------
    x : iterable of str
        Texts (or single words) to count; every item is split on whitespace.
    terms : int, default 30
        Number of most frequent words to show.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    # Count word by word instead of first joining the whole corpus into one
    # large string; the resulting tokens (and their order) are the same.
    fdist = FreqDist(word for text in x for word in text.split())
    if not fdist:
        # Nothing to rank or draw for empty input.
        print("No words to plot.")
        return
    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})
    # keep only the `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20,5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()

In [None]:
# Most frequent words in the cleaned review text (default: top 30).
freq_words(df['clean_reviews'])

In [None]:
# Split every cleaned review on whitespace into a list of tokens.
tokenized_reviews = pd.Series(df['clean_reviews']).apply(lambda x: x.split())
# spaCy pipeline used by `lemmatization` for POS tags and lemmas.
nlp = spacy.load("en_core_web_sm")
def lemmatization(texts, tags):
    """Lemmatize tokenized texts, keeping only tokens with a wanted POS tag.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized texts; each token list is re-joined with spaces and parsed
        by the module-level ``nlp`` pipeline.
    tags : container of str
        Coarse POS tags (``token.pos_`` values) to keep.

    Returns
    -------
    list of list of str
        One list of lemmas per input text, in input order.
    """
    def _kept_lemmas(tokens):
        # Parse the re-joined text and keep the lemmas with a wanted POS tag.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in tags]

    return [_kept_lemmas(tokens) for tokens in texts]

Filter text based on nouns and adjectives

In [None]:
# Keep only noun and adjective lemmas for every review.
reviews_2 = lemmatization(tokenized_reviews, tags =['NOUN', 'ADJ'] )
print(reviews_2[1]) # lemmas kept for the review at index 1

In [None]:
# Flatten the per-review lemma lists into one word list and plot the most frequent ones.
freq_words([item for sublist in reviews_2 for item in sublist])

Filter text based on nouns

In [None]:
# Keep only noun lemmas for every review.
reviews_3 = lemmatization(tokenized_reviews, tags =['NOUN'] )
print(reviews_3[1])  # lemmas kept for the review at index 1

In [None]:
# Flatten the per-review noun lists into one word list and plot the most frequent ones.
freq_words([item for sublist in reviews_3 for item in sublist])

Filter text based on adjectives

In [None]:
# Keep only adjective lemmas for every review.
reviews_4 = lemmatization(tokenized_reviews, tags =['ADJ'] )
print(reviews_4[1])  # lemmas kept for the review at index 1

In [None]:
# Flatten the per-review adjective lists into one word list and plot the most frequent ones.
freq_words([item for sublist in reviews_4 for item in sublist])

### LSA modelling

In [None]:
def LSAmodel(data, no_of_topics, n_top_terms=7):
    """Fit an LSA model (TF-IDF followed by truncated SVD) and print its topics.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize; each item is treated as one document.
    no_of_topics : int
        Number of SVD components (topics) to extract.
    n_top_terms : int, default 7
        How many of the highest-weighted terms to print per topic.

    Returns
    -------
    None
        Prints the TF-IDF matrix shape, the number of components, and for
        every topic a list of ``(term, weight)`` pairs.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=1000,  # keep top 1000 terms
        max_df=0.5,         # drop terms present in more than half of the documents
        smooth_idf=True,
    )
    X = vectorizer.fit_transform(data)
    print(X.shape)

    # Fixed random_state keeps the randomized SVD repeatable between runs.
    svd_model = TruncatedSVD(n_components=no_of_topics, algorithm='randomized', n_iter=100, random_state=122)
    svd_model.fit(X)
    print(len(svd_model.components_))

    terms = vectorizer.get_feature_names_out()
    for i, comp in enumerate(svd_model.components_):
        # Rank the vocabulary by its weight in this component, highest first.
        sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:n_top_terms]
        print("Topic "+str(i)+": ")
        print(sorted_terms)

#### LSA on the full cleaned text (no lemmatization or POS filtering; using df)

In [None]:
# LSA on the cleaned review text, 10 topics.
LSAmodel(df['clean_reviews'],10)

#### LSA with processed_reviews (both lemmatized nouns + adjs)

In [None]:
# One document per review: join each review's noun/adjective lemmas back into
# a single string. Flattening to individual words would make the vectorizer
# treat every word as its own one-word document.
adj_nouns_reviews = [" ".join(tokens) for tokens in reviews_2]
LSAmodel(adj_nouns_reviews,5)

In [None]:
# Same noun + adjective corpus, 4 topics.
LSAmodel(adj_nouns_reviews,4)

In [None]:
# Same noun + adjective corpus, 3 topics.
LSAmodel(adj_nouns_reviews,3)

#### LSA with processed_reviews (lemmatized nouns only)

In [None]:
# One document per review: join each review's noun lemmas back into a single
# string. Flattening to individual words would make the vectorizer treat
# every word as its own one-word document.
nouns_reviews = [" ".join(tokens) for tokens in reviews_3]
LSAmodel(nouns_reviews,6)

In [None]:
# Same noun-only corpus, 5 topics.
LSAmodel(nouns_reviews,5)

In [None]:
# Same noun-only corpus, 4 topics.
LSAmodel(nouns_reviews,4)

In [None]:
# Same noun-only corpus, 3 topics.
LSAmodel(nouns_reviews,3)

#### LSA with processed_reviews (lemmatized adjs only)

In [None]:
# One document per review: join each review's adjective lemmas back into a
# single string. Flattening to individual words would make the vectorizer
# treat every word as its own one-word document.
adj_reviews = [" ".join(tokens) for tokens in reviews_4]
LSAmodel(adj_reviews,10)

LSA assumes a Gaussian distribution of the terms in the documents, which may not be true for all problems.

### LDA (1) using sklearn (do not run as the hyperparameter tuning takes very long)

In [None]:
def lda_sklearn(reviews, numtopics, n_top_words=14):
    """Fit a scikit-learn LDA model on bag-of-words counts and print a report.

    Parameters
    ----------
    reviews : iterable of str
        Documents to vectorize; each item is treated as one document.
    numtopics : int
        Number of LDA topics (``n_components``).
    n_top_words : int, default 14
        Number of highest-weighted words printed per topic.

    Returns
    -------
    None
        Prints the share of non-zero entries, the top words per topic, the
        log-likelihood, the perplexity and the model parameters.
    """
    # Drop terms found in more than 95% of the documents or in fewer than 2.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2)

    # fit and transform the clean text data
    X = vectorizer.fit_transform(reviews)

    # Sparsicity is the percentage of non-zero datapoints in X. Read it from
    # the sparse matrix directly instead of building a dense copy of X.
    n_cells = X.shape[0] * X.shape[1]
    print("Sparsicity: ", (X.nnz / n_cells) * 100, "%")

    # create an LDA object and fit the data
    lda = LatentDirichletAllocation(n_components=numtopics, random_state=42)
    lda.fit(X)

    # print the top words in each topic
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so walk it backwards to get the largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(", ".join(feature_names[i] for i in top_indices))

    # Log likelihood: higher is better
    print("Log Likelihood: ", lda.score(X))

    # Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda.perplexity(X))

    # See model parameters
    pprint.pprint(lda.get_params())

`reviews_2` consists of nouns and adjectives only

In [None]:
# reviews_2 holds noun and adjective lemmas only. Join each review's lemmas
# back into one string so that every review is one document; flattening to
# individual words would make the vectorizer treat every word as a document.
adj_nouns_reviews = [" ".join(tokens) for tokens in reviews_2]
lda_sklearn(adj_nouns_reviews,6)

`reviews_3` consists of nouns only

In [None]:
# Join each review's noun lemmas back into one string so that every review is
# one document; flattening to individual words would make the vectorizer
# treat every word as a document.
nouns_reviews = [" ".join(tokens) for tokens in reviews_3]
lda_sklearn(nouns_reviews,6)

In [None]:
def search_best_model(reviews):
    """Grid-search LDA hyperparameters and plot the cross-validated scores.

    Parameters
    ----------
    reviews : iterable of str
        Documents to vectorize; each item is treated as one document.

    Returns
    -------
    None
        Prints the best parameters, the best mean score and the perplexity of
        the best model, then shows one score curve per learning decay.
    """
    # Drop terms found in more than 95% of the documents or in fewer than 2.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2)

    # fit and transform the clean text data
    X = vectorizer.fit_transform(reviews)

    # Define Search Param
    search_params = {'n_components': [3,4,5,6,7], 'learning_decay': [.5, .7, .9]}

    # Seed the model so that repeated searches give the same scores.
    lda = LatentDirichletAllocation(random_state=42)

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(X)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(X))

    # The x axis must list exactly the topic counts that were searched;
    # a separately maintained list can drift out of sync with the grid and
    # make the plot fail on mismatched lengths.
    n_topics = search_params['n_components']
    results = model.cv_results_

    # Show graph: one curve of mean test scores per learning decay
    plt.figure(figsize=(12, 8))
    for decay in search_params['learning_decay']:
        # Key the scores by topic count so the curve does not depend on the
        # order in which the grid was enumerated.
        score_by_topics = {
            params['n_components']: round(score)
            for params, score in zip(results['params'], results['mean_test_score'])
            if params['learning_decay'] == decay
        }
        plt.plot(n_topics, [score_by_topics[k] for k in n_topics], label=str(decay))
    plt.title("Choosing Optimal LDA Model")
    plt.xlabel("Num Topics")
    plt.ylabel("Log Likelyhood Scores")
    plt.legend(title='Learning decay', loc='best')
    plt.show()

In [None]:
# Hyperparameter search on the noun + adjective corpus (slow, per the section heading).
search_best_model(adj_nouns_reviews)

In [None]:
# Hyperparameter search on the noun-only corpus (slow, per the section heading).
search_best_model(nouns_reviews)

A model with higher log-likelihood and lower perplexity (exp(-1. * log-likelihood per word)) is considered to be good.
On a different note, perplexity might not be the best measure to evaluate topic models because it doesn’t consider the context and semantic associations between words.

It can be concluded that hyperparameter tuning has not been effective.

### LDA (2) using gensim

In [None]:
def lda_gensim(cleaned_reviews, num_topics, passes=10, random_state=42):
    """Fit a gensim LDA model on tokenized reviews and print its topics.

    Parameters
    ----------
    cleaned_reviews : list of list of str
        One token list per document. It is iterated twice (dictionary, then
        corpus), so it must not be a one-shot iterator.
    num_topics : int
        Number of topics to fit.
    passes : int, default 10
        Number of passes over the corpus during training.
    random_state : int, default 42
        Seed for the model, so repeated runs produce the same topics.

    Returns
    -------
    tuple
        ``(lda_model, corpus, id2word)``: the fitted model, the bag-of-words
        corpus and the dictionary, as needed by ``lda_viz``.
    """
    # create the id2word dictionary
    id2word = corpora.Dictionary(cleaned_reviews)

    # create the corpus
    corpus = [id2word.doc2bow(tokens) for tokens in cleaned_reviews]

    # create the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )
    for topic_id, topic_terms in lda_model.print_topics():
        print('Topic ' + str(topic_id))
        print(topic_terms)
    return lda_model, corpus, id2word

In [None]:
def lda_viz(lda_model, corpus, id2word):
    """Build the pyLDAvis visualization for a fitted gensim LDA model.

    Takes the three values returned by ``lda_gensim`` and returns the
    prepared pyLDAvis object after enabling notebook rendering.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

In [None]:
# reviews_1 refers to full text: every cleaned review tokenized with NLTK
reviews_1 = [word_tokenize(review) for review in df['clean_reviews']]

model, corpus, id2_word = lda_gensim(reviews_1, 5)

In [None]:
# Full-text tokens, 4 topics.
model, corpus, id2_word = lda_gensim(reviews_1, 4)

In [None]:
# Full-text tokens, 3 topics.
model, corpus, id2_word = lda_gensim(reviews_1, 3)

In [None]:
# reviews_2 holds noun and adjective lemmas only; 6 topics.
model, corpus, id2_word = lda_gensim(reviews_2, 6)

In [None]:
# Noun + adjective lemmas, 5 topics.
model, corpus, id2_word = lda_gensim(reviews_2, 5)

In [None]:
# Noun + adjective lemmas, 4 topics.
model, corpus, id2_word = lda_gensim(reviews_2, 4)

In [None]:
# Noun + adjective lemmas, 3 topics.
model, corpus, id2_word = lda_gensim(reviews_2, 3)

In [None]:
# reviews_3 holds noun lemmas only; 6 topics.
model, corpus, id2_word =lda_gensim(reviews_3, 6)

In [None]:
# Noun lemmas, 5 topics.
model, corpus, id2_word =lda_gensim(reviews_3, 5)

In [None]:
# Noun lemmas, 4 topics.
model, corpus, id2_word =lda_gensim(reviews_3, 4)

In [None]:
# Noun lemmas, 3 topics.
model, corpus, id2_word =lda_gensim(reviews_3, 3)

In [None]:
# reviews_4 holds adjective lemmas only; 5 topics.
model, corpus, id2_word =lda_gensim(reviews_4, 5)

In [None]:
# Adjective lemmas, 4 topics.
model, corpus, id2_word =lda_gensim(reviews_4, 4)

In [None]:
# Adjective lemmas, 3 topics.
model, corpus, id2_word =lda_gensim(reviews_4, 3)

Conclusion:
Using texts filtered to nouns only, we can narrow the reviews down to 6 main topics:
- Pets
- Baby
- Snacks
- Beverages
- Protein/Food
- Condiments/Products

In [None]:
# Refit the 6-topic model on noun lemmas only, then render it with pyLDAvis
# (the returned visualization is the cell output).
model, corpus, id2_word =lda_gensim(reviews_3, 6) # texts based on nouns only
lda_viz(model, corpus, id2_word )

### NER (Named Entity Recognition)

In [None]:
# spaCy pipeline used for entity extraction below. The same model was already
# loaded into `nlp` in the lemmatization cell, so this rebinds it to a fresh copy.
nlp = spacy.load("en_core_web_sm")

#### Pet products

In [None]:
def extract_pet_entities(text):
    """Return the named entities in `text` that look pet-related.

    An entity is kept when its label is ``"ANIMAL"`` or when one of its
    tokens is a pet word (pet/dog/cat, singular or plural, any casing).

    Parameters
    ----------
    text : str
        Text to run through the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        Text of every matching entity, in document order.
    """
    pet_terms = {"pet", "pets", "dog", "dogs", "cat", "cats"}
    doc = nlp(text)
    pet_entities = []
    for entity in doc.ents:
        # Compare whole tokens instead of substrings: a substring test also
        # accepts unrelated entities such as "Peter" (contains "pet").
        mentions_pet = any(token.lower_ in pet_terms for token in entity)
        # NOTE(review): "ANIMAL" does not appear to be among the labels that
        # en_core_web_sm emits, so this check may never fire — confirm.
        if entity.label_ == "ANIMAL" or mentions_pet:
            pet_entities.append(entity.text)
    return pet_entities

# run extract_pet_entities on every cleaned review and keep the result per row
df['pet_entities'] = df['clean_reviews'].apply(extract_pet_entities)

# collect and show the distinct pet-related entities across all reviews
pet_entities = {entity for row in df['pet_entities'] for entity in row}
print(pet_entities)

In [None]:
# Number of distinct pet-related entities found.
print(len(pet_entities))

In [None]:
# Remove the temporary column again. The drop is in place, so re-running this
# cell without re-running the extraction cell raises a KeyError.
df.drop(['pet_entities'], inplace = True, axis = 1)

In [None]:
def extract_product_entities(text):
    """Return the named entities in `text` that look like (beverage) products.

    An entity is kept when its label is ``"PRODUCT"`` or when one of its
    tokens is a beverage word (coffee/tea/caffeine, any casing).

    Parameters
    ----------
    text : str
        Text to run through the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        Text of every matching entity, in document order.
    """
    beverage_terms = {"coffee", "coffees", "tea", "teas", "caffeine"}
    doc = nlp(text)
    product_entities = []
    for entity in doc.ents:
        # Compare whole tokens instead of substrings: a substring test also
        # accepts unrelated entities such as "Seattle" (contains "tea").
        mentions_beverage = any(token.lower_ in beverage_terms for token in entity)
        if entity.label_ == "PRODUCT" or mentions_beverage:
            product_entities.append(entity.text)
    return product_entities


Next steps:
- Explore the brand these reviews are for
- Knowing the domain that this dataset is for, use transfer learning from a relevant pre-trained model to improve the results.

### Using the LSA and LDA Python classes

In [None]:
# Create an instance of LSAModel
lsa_model = LSAModel(df, tags=['NOUN'])
lsa_model.get_topics()

In [None]:
# Create an instance of LDAGensim
lda_model = LDAGensim(df, tags=['NOUN'])
lda_model.get_topics()