In [48]:
import pandas as pd
import numpy as np

import re
import string

import spacy

import gensim
from gensim import corpora

# Load the review dataset from Topic_Modeling.csv (path is relative to the
# working directory). Later cells read its 'Text' and 'Score' columns.
review_data= pd.read_csv("Topic_Modeling.csv")

In [51]:
# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, because clean_text is applied
# row by row to the whole dataset.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Strip punctuation, drop numeric and short tokens, and lowercase a string.

    Punctuation characters are deleted (not replaced by a space), so words
    joined only by punctuation are merged: "great-tasting" -> "greattasting".

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces, in lowercase. A token
        survives when it is longer than three characters and is not made up
        solely of digits.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    kept_tokens = [
        token
        for token in without_punctuation.split()
        if len(token) > 3 and not token.isdigit()
    ]
    return ' '.join(kept_tokens).lower()

In [None]:
import nltk

# Fetch the NLTK stopword corpus only when it is not already available
# locally, so re-running the notebook does not contact the download server.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [53]:
# Tunable parameters for this cell.
MIN_WORDS_PER_REVIEW = 20      # inclusive lower bound on cleaned word count
MAX_WORDS_PER_REVIEW = 100     # exclusive upper bound on cleaned word count
REVIEWS_PER_SCORE = 20000      # rows sampled from each 'Score' group
SAMPLE_SEED = 100              # fixed seed so the sampled corpus is reproducible

# Drop every row that has at least one missing value. Kept in place so that
# the column assignments below write to the original frame object.
review_data.dropna(axis = 0, how ='any',inplace=True)

# Clean the review text, then count the whitespace-separated words that remain.
review_data['Text'] = review_data['Text'].apply(clean_text)
review_data['Num_words_text'] = review_data['Text'].apply(lambda x: len(str(x).split()))

print('-------Dataset --------')
print(review_data['Score'].value_counts())
print(len(review_data))
print('-------------------------')
# Word count of the longest cleaned review.
max_review_data_sentence_length = review_data['Num_words_text'].max()

# Keep only reviews whose cleaned length lies in [MIN_WORDS, MAX_WORDS).
mask = (
    (review_data['Num_words_text'] < MAX_WORDS_PER_REVIEW)
    & (review_data['Num_words_text'] >= MIN_WORDS_PER_REVIEW)
)
df_short_reviews = review_data[mask]

# Draw the same number of reviews from every score so the corpus is balanced.
# sample() raises ValueError if a score has fewer than REVIEWS_PER_SCORE rows.
df_sampled = (
    df_short_reviews
    .groupby('Score')
    .apply(lambda x: x.sample(n=REVIEWS_PER_SCORE, random_state=SAMPLE_SEED))
    .reset_index(drop = True)
)

print('No of Short reviews')
print(len(df_short_reviews))



#all_sentences = train_data['text'].tolist() + test_data['text'].tolist()


-------Dataset --------
5    363111
4     80655
1     52264
3     42638
2     29743
Name: Score, dtype: int64
568411
-------------------------
No of Short reviews
373281


***
Let us pre-process the data
***

In [54]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Set snapshot of the stopword list for constant-time membership tests.
# NOTE: taken once here; later changes to `stop_words` are not picked up
# unless this line is re-run.
_stop_word_set = set(stop_words)


# function to remove stopwords
def remove_stopwords(text):
    """Return `text` with every English stopword token removed.

    Args:
        text: A string whose tokens are separated by single spaces.

    Returns:
        The tokens that are not in the stopword set, joined by single spaces.
        Matching is exact and case-sensitive.
    """
    return " ".join(word for word in text.split(' ') if word not in _stop_word_set)


# remove stopwords from the text
df_sampled['Text'] = df_sampled['Text'].apply(remove_stopwords)



In [55]:
# Load spaCy's 'en_core_web_md' pipeline with the parser and NER components
# disabled; the lemmatization function below reads only token.pos_ and
# token.lemma_.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatize each text and keep only tokens with an allowed POS tag.

    Args:
        texts: Iterable of strings, one per document.
        allowed_postags: Container of spaCy `token.pos_` values to keep.
            Defaults to nouns and adjectives.

    Returns:
        A list with one entry per input text; each entry is the list of
        lemmas (`token.lemma_`) of the tokens whose `pos_` is allowed.
    """
    # nlp.pipe processes the texts in batches, which is much faster than
    # calling nlp() once per document and yields the docs in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]

In [56]:
 
# Collect the preprocessed review strings and show the second one as a sample.
text_list=df_sampled['Text'].tolist()
print(text_list[1])
# Lemmatize every review (default POS filter of `lemmatization`) and show the
# same sample again for a before/after comparison.
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

love stroopwafels tried sorts kind come shipper stroopie stroopwafels nice brand instead count advertised arrived fancy blue need even sticker says count either someone embezzling stroopwafels misleadingly advertised aware read count makes price point pretty different
['love', 'stroopwafel', 'sort', 'shipper', 'stroopie', 'nice', 'brand', 'fancy', 'blue', 'need', 'sticker', 'stroopwafel', 'aware', 'read', 'count', 'price', 'point', 'different']


***
Create vocabulary dictionary and document term matrix
***

In [57]:
# Build the gensim dictionary from the tokenized reviews.
dictionary = corpora.Dictionary(tokenized_reviews)
# Convert each review to its bag-of-words form using that dictionary.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_reviews))

In [58]:
# Short alias for gensim's LDA model class.
LDA = gensim.models.ldamodel.LdaModel

# Fit a 10-topic LDA model on the bag-of-words corpus; random_state is fixed
# so repeated runs on the same corpus use the same seed.
lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=10,
    random_state=100,
    chunksize=1000,
    passes=50,
    iterations=100,
)

In [59]:
# Display the fitted model's topics as weighted word lists (library defaults).
lda_model.print_topics()

[(0,
  '0.036*"flavor" + 0.033*"chip" + 0.028*"snack" + 0.028*"good" + 0.026*"great" + 0.024*"salt" + 0.017*"nice" + 0.016*"candy" + 0.015*"perfect" + 0.014*"little"'),
 (1,
  '0.078*"product" + 0.053*"free" + 0.025*"gluten" + 0.018*"yummy" + 0.017*"cracker" + 0.017*"snack" + 0.011*"package" + 0.010*"pretzel" + 0.010*"shipping" + 0.010*"banana"'),
 (2,
  '0.047*"sauce" + 0.035*"cereal" + 0.032*"good" + 0.027*"cheese" + 0.021*"pasta" + 0.017*"fresh" + 0.017*"seed" + 0.015*"beef" + 0.014*"texture" + 0.013*"taste"'),
 (3,
  '0.041*"product" + 0.039*"time" + 0.031*"order" + 0.028*"great" + 0.018*"good" + 0.016*"small" + 0.014*"package" + 0.013*"size" + 0.013*"bag" + 0.013*"soup"'),
 (4,
  '0.024*"bread" + 0.022*"time" + 0.018*"product" + 0.018*"year" + 0.014*"ginger" + 0.013*"clean" + 0.013*"pill" + 0.012*"stomach" + 0.011*"mild" + 0.011*"week"'),
 (5,
  '0.091*"food" + 0.030*"treat" + 0.024*"dog" + 0.020*"good" + 0.014*"great" + 0.014*"cat" + 0.014*"healthy" + 0.013*"meal" + 0.013*"little

In [63]:
# NOTE(review): compute_coherence_values is not defined in the visible cells of
# this notebook - confirm it is defined before this cell on a fresh kernel.
# Presumably it fits one model per topic count in range(start, limit, step) and
# returns the models together with their coherence scores - TODO confirm.
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=tokenized_reviews, start=2, limit=50, step=1)

In [66]:
# Select the model and print the topics
# NOTE(review): index 7 is hard-coded. If model_list is ordered like
# range(start=2, limit=50, step=1), this is the 9-topic model - TODO confirm
# against coherence_values rather than relying on the position.
optimal_model = model_list[7]
# Unformatted topic data (formatted=False) kept for later use.
model_topics = optimal_model.show_topics(formatted=False)
optimal_model.print_topics(num_words=10)

[(0,
  '0.080*"chocolate" + 0.050*"cookie" + 0.034*"butter" + 0.031*"peanut" + 0.020*"milk" + 0.019*"taste" + 0.018*"great" + 0.016*"flavor" + 0.016*"good" + 0.014*"oatmeal"'),
 (1,
  '0.036*"chip" + 0.025*"salt" + 0.025*"good" + 0.024*"flavor" + 0.018*"great" + 0.014*"product" + 0.013*"store" + 0.011*"love" + 0.011*"fresh" + 0.010*"time"'),
 (2,
  '0.039*"treat" + 0.025*"dog" + 0.021*"time" + 0.020*"small" + 0.018*"product" + 0.015*"great" + 0.014*"size" + 0.013*"little" + 0.011*"year" + 0.011*"large"'),
 (3,
  '0.129*"coffee" + 0.047*"flavor" + 0.031*"good" + 0.027*"strong" + 0.021*"taste" + 0.015*"vanilla" + 0.013*"bean" + 0.013*"blend" + 0.013*"great" + 0.012*"green"'),
 (4,
  '0.048*"water" + 0.027*"drink" + 0.021*"taste" + 0.020*"bottle" + 0.019*"bread" + 0.017*"flavor" + 0.015*"energy" + 0.012*"cold" + 0.012*"product" + 0.012*"good"'),
 (5,
  '0.036*"good" + 0.031*"sweet" + 0.031*"sugar" + 0.028*"flavor" + 0.024*"snack" + 0.023*"taste" + 0.018*"great" + 0.017*"fruit" + 0.017*"ca