### "Amazon-Alexa" Topic modeling with Gensim

#### https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

### import the required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Gensim
import gensim
import gensim.corpora as corpora
#A corpus is a collection of Document objects.
#Corpora serve two roles in Gensim: Input for training a Model.
#During training, the models use this training corpus to look for common themes and topics,
#initializing their internal model parameters.
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### Load the input data ( "amazon alexa reviews data")

In [None]:
# Loading TSV file
df_amazon = pd.read_csv ("amazon_alexa.tsv", sep="\t")

In [None]:
# Top 5 records
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
df_amazon_mod = df_amazon[['verified_reviews','feedback']]
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [None]:
import re
def clean(string):
    string = str(string)
    cleanString = re.sub('[^A-Za-z]+',' ', string )
    return cleanString

df_amazon_mod['reviews_non_numeric'] = df_amazon_mod['verified_reviews'].map(clean)
df_amazon_mod['reviews_len_trim'] = df_amazon_mod['reviews_non_numeric'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
df_amazon_mod['reviews_Tokenized'] = df_amazon_mod['reviews_len_trim'].map(lambda x: x.lower().split())
df_amazon_mod['reviews_Tokenized_stop'] = df_amazon_mod['reviews_Tokenized'].map(lambda x: " ".join(x for x in x if x not in stop_words))
df_amazon_mod['reviews_Tokenized_stop'] = df_amazon_mod['reviews_Tokenized_stop'].map(lambda x: x.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_mod['reviews_non_numeric'] = df_amazon_mod['verified_reviews'].map(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_mod['reviews_len_trim'] = df_amazon_mod['reviews_non_numeric'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))


In [None]:
df_amazon_mod.head()

Unnamed: 0,verified_reviews,feedback,reviews_non_numeric,reviews_len_trim,reviews_Tokenized,reviews_Tokenized_stop
0,Love my Echo!,1,Love my Echo,Love Echo,"[love, echo]","[love, echo]"
1,Loved it!,1,Loved it,Loved,[loved],[loved]
2,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...,Sometimes while playing game you can answer qu...,"[sometimes, while, playing, game, you, can, an...","[sometimes, playing, game, answer, question, c..."
3,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My yr ...,have had lot fun with this thing old learns ab...,"[have, had, lot, fun, with, this, thing, old, ...","[lot, fun, thing, old, learns, dinosaurs, cont..."
4,Music,1,Music,Music,[music],[music]


In [None]:
df_amazon_mod['reviews_Tokenized_stop'][0:3]

0                                         [love, echo]
1                                              [loved]
2    [sometimes, playing, game, answer, question, c...
Name: reviews_Tokenized_stop, dtype: object

### Example

In [None]:
from gensim.corpora import Dictionary
import gensim.corpora as corpora
text2word = corpora.Dictionary([['human', 'interface', 'computer']])
print(text2word)
print(text2word[0],text2word[1],text2word[2])
corp = [text2word.doc2bow(txt) for txt in [['human', 'interface', 'computer','human','human']]]
print(corp)

Dictionary<3 unique tokens: ['computer', 'human', 'interface']>
computer human interface
[[(0, 1), (1, 3), (2, 1)]]


##################################

In [None]:
# Create a corpus from a list of texts

texts = df_amazon_mod['reviews_Tokenized_stop']

# 1. Create Dictionary ( constructs word -> id mapping )
id2word = corpora.Dictionary(df_amazon_mod['reviews_Tokenized_stop'])

# Term Document Frequency ( convert document into the bag of words )
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1)]]


In [None]:
id2word[0]

'echo'

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('echo', 1), ('love', 1)]]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # bow representation of docs
                                           id2word=id2word, # dictionary ( word -> id mapping )
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1, # number of docs to be iterated through for each update
                                           chunksize=100, # number of documents to be used in each training chunk
                                           passes=10, # number of passes through the corpus during training
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the Keyword in the 10 topics
print(*lda_model.print_topics(), sep='\n\n')
doc_lda = lda_model[corpus]

(0, '0.082*"device" + 0.041*"know" + 0.020*"computer" + 0.020*"wifi" + 0.018*"though" + 0.018*"fact" + 0.017*"asked" + 0.017*"item" + 0.016*"couple" + 0.016*"screen"')

(1, '0.090*"time" + 0.062*"thing" + 0.041*"want" + 0.032*"best" + 0.030*"control" + 0.023*"add" + 0.020*"say" + 0.018*"favorite" + 0.018*"purchased" + 0.017*"radio"')

(2, '0.032*"hub" + 0.031*"small" + 0.031*"internet" + 0.026*"buy" + 0.026*"two" + 0.024*"dots" + 0.022*"audio" + 0.020*"excellent" + 0.019*"loud" + 0.017*"back"')

(3, '0.138*"echo" + 0.095*"dot" + 0.028*"work" + 0.025*"new" + 0.025*"also" + 0.024*"even" + 0.022*"another" + 0.020*"phone" + 0.018*"sure" + 0.016*"app"')

(4, '0.109*"product" + 0.082*"would" + 0.037*"amazing" + 0.026*"thought" + 0.026*"show" + 0.026*"around" + 0.026*"problems" + 0.026*"family" + 0.021*"talking" + 0.020*"trying"')

(5, '0.072*"far" + 0.059*"learning" + 0.048*"still" + 0.040*"pretty" + 0.040*"fine" + 0.034*"skills" + 0.027*"loves" + 0.022*"learn" + 0.020*"gets" + 0.020*"part"'

In [None]:
# Compute Perplexity (accounts for the level of uncertainty in a model's prediction)
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score (level of semantic similarity between words on a topic - higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=df_amazon_mod['reviews_Tokenized_stop'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.546352543940072

Coherence Score:  0.3956226272767262


### LDA Mallet Model (Gensim uses a Variational Bayes sampling method which is faster but less precise that Mallet's Gibbs Sampling)

In [None]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

--2024-04-12 16:35:29--  http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Resolving mallet.cs.umass.edu (mallet.cs.umass.edu)... 128.119.246.70
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://mallet.cs.umass.edu/dist/mallet-2.0.8.zip [following]
--2024-04-12 16:35:29--  https://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16184794 (15M) [application/zip]
Saving to: ‘mallet-2.0.8.zip’


2024-04-12 16:35:31 (16.6 MB/s) - ‘mallet-2.0.8.zip’ saved [16184794/16184794]

Archive:  mallet-2.0.8.zip
   creating: mallet-2.0.8/
   creating: mallet-2.0.8/bin/
  inflating: mallet-2.0.8/bin/classifier2info  
  inflating: mallet-2.0.8/bin/csv2classify  
  inflating: mallet-2.0.8/bin/csv2vectors  
  inflating: mallet-2.0.8/bin/mallet  
  inflating: mallet

In [None]:
mallet_path = '/content/mallet-2.0.8' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

AttributeError: module 'gensim.models' has no attribute 'wrappers'

In [None]:
#Show Topics
print(*ldamallet.show_topics(formatted=False),sep='\n\n')

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=df_amazon_mod['reviews_Tokenized_stop'], dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

NameError: name 'ldamallet' is not defined

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=df_amazon_mod['reviews_Tokenized_stop'], start=2, limit=40, step=6)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[0]
model_topics = optimal_model.show_topics(formatted=False)
print(*optimal_model.print_topics(num_words=10),sep='\n\n\n')



### Finding the dominant topic in each sentence
### Finding the most representative document for each topic
### Topic distributions across documents

### Topic Modeling - LSA model

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(df_amazon_mod['reviews_Tokenized_stop'])

# Create Corpus
texts = df_amazon_mod['reviews_Tokenized_stop']

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# find the coherence score with a different number of topics
for i in range(2,11):
    lsi = LsiModel(corpus, num_topics=i, id2word=id2word)
    coherence_model = CoherenceModel(model=lsi, texts=df_amazon_mod['reviews_Tokenized_stop'], dictionary=id2word, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

In [None]:
### The coherence score is highest with 10 topics

In [None]:
# perform SVD on the bag of words with the LsiModel to extract 2 topics
lsi = LsiModel(corpus, num_topics=2, id2word=id2word)

In [None]:
# find the 5 words with the srongest association to the derived topics
for topic_num, words in lsi.print_topics(num_words=5):
    print('Words in {}: {}.'.format(topic_num, words))

NameError: name 'lsi' is not defined