## Step 1: Load the dataset



In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding,LSTM, Bidirectional
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
# Show the TensorFlow version so the runtime environment is recorded alongside the results.
print(tf.__version__)

2.4.1


In [103]:
# Read the reviews CSV; the file is parsed without a header row and the
# column is labelled 'reviews'.
task2_df = pd.read_csv(
    '/content/reviews.csv',
    names=['reviews'],
    header=None,
)
task2_df

Unnamed: 0,reviews
0,i did not enjoy the japanese restaurant on par...
1,batman v superman oh my god what an abysmal mo...
2,i went to a restaurant down the street and had...
3,the star wars the rise of skywalker 2 out of 5...
4,it was a very engaging book that was very easy...
...,...
1239,bad moms is a hyperbolic romantic comedy that ...
1240,i didn t understand the point of once upon a t...
1241,recently my wife and i shared a meal together ...
1242,it is not difficult to find pio pio in the lis...


## Step 2: Data Preprocessing ##

We will perform the following steps:

* **Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have 3 or fewer characters are removed.
* All **stopwords** are removed.
* Words are **lemmatized** - words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are optionally **stemmed** (controlled by the `stem` flag) - words are reduced to their root form.
* **Bigrams** are generated and **TF-IDF** weighting is applied.
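* **Frequency limits** are applied: tokens that appear in fewer than 5 documents or in more than 95% of documents are dropped, and at most the 1000 most frequent remaining tokens are kept.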


In [104]:

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim import corpora, models

# Fix NumPy's global random state so repeated runs start from the same seed.
np.random.seed(seed=400)

In [105]:
import nltk 
from nltk import word_tokenize 
from nltk.util import ngrams
# Fetch the NLTK resources used further down: 'punkt' backs
# nltk.word_tokenize and 'wordnet' backs WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Step 3: Perform Topic Modelling

In [140]:
def main(task2_df,num_classes,stem,random_state=None):
    """Fit an LDA topic model on the ``reviews`` column and print its topics.

    Each review is tokenized with gensim's ``simple_preprocess``; gensim
    stopwords and tokens of three or fewer characters are dropped, and the
    remaining tokens are verb-lemmatized (and Snowball-stemmed when ``stem``
    is truthy). A dictionary is built from the processed documents and pruned
    with ``filter_extremes``, the bag-of-words corpus is re-weighted with
    TF-IDF, and ``LdaMulticore`` is trained on the result.

    Args:
        task2_df: Object exposing a ``reviews`` attribute that iterates over
            the review texts (a pandas DataFrame in this notebook).
        num_classes: Number of topics to fit.
        stem: When truthy, stem each lemma with the English Snowball stemmer.
        random_state: Optional seed forwarded to ``LdaMulticore``. The
            default ``None`` keeps the previous behaviour.

    Returns:
        The trained ``gensim.models.LdaMulticore`` instance.
    """
    # Build the stemmer and lemmatizer once per call instead of once per token.
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()

    def lemmatize_stemming(text,stem):
        """Return the verb lemma of ``text``, stemmed as well when ``stem`` is truthy."""
        lemma = lemmatizer.lemmatize(text, pos='v')
        return stemmer.stem(lemma) if stem else lemma

    # Tokenize and lemmatize
    def preprocess(text,stem):
        """Turn one raw document into a list of normalized tokens."""
        return [
            lemmatize_stemming(token, stem)
            for token in gensim.utils.simple_preprocess(text)
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
        ]

    def get_ngrams(text):
        """Return a one-element list holding the space-joined bigrams of ``text``."""
        tokens = nltk.word_tokenize(text)
        return [[first + ' ' + second for first, second in ngrams(tokens, 2)]]

    processed_docs = [preprocess(doc, stem) for doc in task2_df.reviews]

    # The joined corpus and its bigrams used to be rebuilt inside the document
    # loop, which made the loop quadratic while only the last value survived.
    # They are now computed a single time.
    # NOTE(review): `bigram` is not consumed by the dictionary or the LDA
    # model below -- confirm whether bigrams were meant to be model features.
    corpus_text = ' '.join(word for doc in processed_docs for word in doc)
    bigram = get_ngrams(corpus_text)

    dictionary = gensim.corpora.Dictionary(processed_docs)
    # Drop tokens seen in fewer than 5 documents or in more than 95% of them,
    # then keep at most 1000 tokens.
    dictionary.filter_extremes(no_below=5, no_above=0.95, keep_n= 1000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lda_model =  gensim.models.LdaMulticore(corpus_tfidf,
                                    num_topics = num_classes,
                                    id2word = dictionary,
                                    passes = 10,
                                    workers = 2,
                                    random_state = random_state)

    for idx, topic in lda_model.print_topics(-1):
        print("Topic: {} \nWords: {}".format(idx, topic))
        print("\n")
    return lda_model

Case 1: 3 Topics

In [141]:
# Three topics, with stemming enabled.
lda_model = main(task2_df, num_classes=3, stem=True)

Topic: 0 
Words: 0.017*"book" + 0.009*"stori" + 0.009*"read" + 0.006*"human" + 0.005*"histori" + 0.005*"author" + 0.005*"world" + 0.005*"charact" + 0.005*"write" + 0.005*"season"


Topic: 1 
Words: 0.011*"restaur" + 0.011*"food" + 0.007*"experi" + 0.007*"servic" + 0.006*"great" + 0.006*"place" + 0.006*"good" + 0.005*"dish" + 0.005*"time" + 0.005*"order"


Topic: 2 
Words: 0.028*"movi" + 0.011*"watch" + 0.011*"film" + 0.009*"plot" + 0.008*"charact" + 0.007*"stori" + 0.007*"act" + 0.005*"action" + 0.005*"storylin" + 0.005*"comedi"




<gensim.models.ldamulticore.LdaMulticore at 0x7f20d9e98410>

Case 2: 4 Topics

In [143]:
# Four topics, with stemming enabled.
lda_model = main(task2_df, num_classes=4, stem=True)

Topic: 0 
Words: 0.015*"movi" + 0.010*"book" + 0.008*"charact" + 0.008*"stori" + 0.007*"film" + 0.006*"watch" + 0.006*"plot" + 0.005*"seri" + 0.005*"action" + 0.005*"think"


Topic: 1 
Words: 0.010*"movi" + 0.008*"ramen" + 0.006*"sapien" + 0.006*"month" + 0.005*"harari" + 0.005*"watch" + 0.005*"act" + 0.005*"time" + 0.005*"human" + 0.004*"save"


Topic: 2 
Words: 0.014*"restaur" + 0.014*"food" + 0.008*"servic" + 0.008*"experi" + 0.007*"dish" + 0.007*"place" + 0.007*"order" + 0.006*"great" + 0.006*"price" + 0.006*"go"


Topic: 3 
Words: 0.016*"movi" + 0.013*"book" + 0.010*"stori" + 0.008*"read" + 0.007*"film" + 0.007*"life" + 0.006*"watch" + 0.006*"charact" + 0.005*"plot" + 0.005*"think"




Case 3: 3 Topics and Without Stemming

In [146]:
# Three topics, lemmatization only (no stemming).
lda_model = main(task2_df, num_classes=3, stem=False)

Topic: 0 
Words: 0.017*"book" + 0.010*"read" + 0.006*"write" + 0.006*"story" + 0.006*"history" + 0.006*"life" + 0.005*"author" + 0.005*"season" + 0.005*"world" + 0.004*"work"


Topic: 1 
Words: 0.022*"movie" + 0.010*"story" + 0.009*"film" + 0.009*"watch" + 0.008*"book" + 0.008*"plot" + 0.008*"character" + 0.006*"movies" + 0.006*"life" + 0.006*"think"


Topic: 2 
Words: 0.013*"food" + 0.013*"restaurant" + 0.008*"service" + 0.007*"experience" + 0.007*"place" + 0.007*"dish" + 0.006*"great" + 0.006*"order" + 0.006*"good" + 0.006*"price"




## Step 4: Testing model on unseen document ##

In [144]:
# Free-form review text (includes emoji and an embedded newline) that is
# scored against the trained topic model in the next cell.
unseen_document = '1st half: excellent Interval: 👌 2nd half: just avg Climax twist:\n 👌👌👌 Cast: Nithin, sai chand, Rakul did excellent. Nithiin acting'
print(unseen_document)

1st half: excellent Interval: 👌 2nd half: just avg Climax twist:
 👌👌👌 Cast: Nithin, sai chand, Rakul did excellent. Nithiin acting


In [147]:
# Data preprocessing step for the unseen document.
# `preprocess` and `dictionary` are local to main(), so they do not exist at
# notebook scope after Restart & Run All. Rebuild the same token pipeline
# here and take the vocabulary from the trained model (`id2word` is the
# dictionary that was passed to LdaMulticore).
# This mirrors main(..., stem=False), the setting used for the most recent
# `lda_model`; apply SnowballStemmer as well if the model was trained with
# stem=True.
unseen_lemmatizer = WordNetLemmatizer()
unseen_tokens = [
    unseen_lemmatizer.lemmatize(token, pos='v')
    for token in gensim.utils.simple_preprocess(unseen_document)
    if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
]
# NOTE(review): the model was trained on a TF-IDF weighted corpus, while this
# vector holds raw counts -- confirm whether the TF-IDF transform should be
# applied here as well.
bow_vector = lda_model.id2word.doc2bow(unseen_tokens)

# Print the topics for this document, highest score first.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.7135390639305115	 Topic: 0.022*"movie" + 0.010*"story" + 0.009*"film" + 0.009*"watch" + 0.008*"book"
Score: 0.2437753677368164	 Topic: 0.013*"food" + 0.013*"restaurant" + 0.008*"service" + 0.007*"experience" + 0.007*"place"
Score: 0.042685553431510925	 Topic: 0.017*"book" + 0.010*"read" + 0.006*"write" + 0.006*"story" + 0.006*"history"
