# Topic Modelling in python - data source is a textfile

In [2]:
 #coding: utf-8 
#encoding=utf-8
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk 
import unicodedata
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
%matplotlib inline
import pyLDAvis
import pyLDAvis.sklearn

### Set working directory ( only if not using a vitual environment)

In [1]:

import os
os.chdir('F:/Library/Analytics Path/Python/Datasets/Topic Modelling')

In [9]:
df = pd.DataFrame()
f = open("rt-polarity-pos.txt", "r+", encoding="latin1")
txt = f.readlines()#reads each line as elements in list 
#f.read() will read whole doc as 1 string
type(txt)

list

In [10]:
imd = pd.DataFrame(txt, columns=["comments"])
imd["row"] = imd.index

In [11]:
imd.head()

Unnamed: 0,comments,row
0,the rock is destined to be the 21st century's ...,0
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic\n,2
3,if you sometimes like to go to the movies to h...,3
4,"emerges as something rare , an issue movie tha...",4


In [12]:
stop_words = nltk.corpus.stopwords.words('english')
extended_stopwords = ['\'ll','\'d','\'m','\'re','\'s','\'ve','ca n\'t','r','n\'t','ca','see','get','movies','movie','go','say','come','many','another','could','would','made','really','want','even','odd','films','plot','ever','actually','also','movie','film']
stops = stop_words + extended_stopwords

#### Check with stemming and lemmatization to clean the data

In [1]:
text = " this is a test for stemmer and stemming "

from nltk.tokenize import word_tokenize
tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
stm = PorterStemmer()
lemm = WordNetLemmatizer()

#tokens = [stm.stem(w) for w in tokens]
tokens = [lemm.lemmatize(w) for w in tokens]
#tokens = lemm.lemmatize(tokens)
print(tokens)

NameError: name 'nltk' is not defined

#### Define function to tokenize and lemmatize the data

In [16]:
def tokenize(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    stm = PorterStemmer()
    lemm = WordNetLemmatizer()
    #tokens = [stm.stem(w) for w in tokens]
    tokens = [lemm.lemmatize(w) for w in tokens]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    #import pdb;pdb.set_trace()
    return filtered_tokens

##### Texblob sentiment analysis

In [17]:
comm = " This is a beautiful phone, good performance" 

comm_blob = TextBlob(comm)

comm_blob.sentiment.polarity


0.7749999999999999

#### Setiment Analysis with textblob 

In [23]:
imd['polarity'] = imd.comments.apply(lambda s: TextBlob(s).sentiment.polarity)

In [24]:
#imd.head()
imd.tail()

Unnamed: 0,comments,row,polarity
5326,both exuberantly romantic and serenely melanch...,5326,0.0
5327,mazel tov to a film about a family's joyous li...,5327,0.0
5328,standing in the shadows of motown is the best ...,5328,0.465
5329,it's nice to see piscopo again after all these...,5329,0.8
5330,"provides a porthole into that noble , tremblin...",5330,0.6


In [32]:
imd['comments'].head()

0    the rock is destined to be the 21st century's ...
1    the gorgeously elaborate continuation of " the...
2                     effective but too-tepid biopic\n
3    if you sometimes like to go to the movies to h...
4    emerges as something rare , an issue movie tha...
Name: comments, dtype: object

#### Document term matrix with TF-IDF values 

In [25]:
term_idf_vectorizer       = TfidfVectorizer(max_df=0.99, max_features=2000,min_df=0.005, stop_words=stops, use_idf=True, tokenizer=tokenize, ngram_range=(1,1))
%time term_idf_matrix     = term_idf_vectorizer.fit_transform(imd.comments) 
term_idf_feature_names    = term_idf_vectorizer.get_feature_names()
term_idf_matrix.shape


Wall time: 2.49 s


(5331, 300)

In [27]:
term_idf_feature_names

['acting',
 'action',
 'actor',
 'adult',
 'adventure',
 'age',
 'almost',
 'although',
 'always',
 'american',
 'amusing',
 'anyone',
 'anything',
 'around',
 'art',
 'artist',
 'audience',
 'away',
 'back',
 'bad',
 'beautiful',
 'beautifully',
 'beauty',
 'become',
 'best',
 'better',
 'big',
 'bit',
 'book',
 'boy',
 'brilliant',
 'capture',
 'care',
 'cast',
 'character',
 'charm',
 'charming',
 'child',
 'cinema',
 'cinematic',
 'classic',
 'clever',
 'comedy',
 'comic',
 'coming-of-age',
 'compelling',
 'complex',
 'culture',
 'dark',
 'day',
 'de',
 'debut',
 'deeply',
 'delivers',
 'despite',
 'dialogue',
 'different',
 'direction',
 'director',
 'documentary',
 'doe',
 'done',
 'drama',
 'e',
 'easy',
 'effect',
 'effort',
 'emotion',
 'emotional',
 'end',
 'energy',
 'engaging',
 'engrossing',
 'enjoy',
 'enjoyable',
 'enough',
 'entertaining',
 'entertainment',
 'epic',
 'especially',
 'event',
 'every',
 'everyone',
 'experience',
 'eye',
 'face',
 'familiar',
 'family',
 

In [33]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" , ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

#### Topic Modelling using LDA 

In [34]:
lda = LatentDirichletAllocation(n_topics=5, max_iter=10,learning_method='online',learning_offset=10.,random_state=1)
#max_itr - to prevent infinite loop- in each iteration words are assigned to topics,learning_offset = 10,
#random_state = 1  starts assinging each word with a same topic every new time 
%time lda.fit(term_idf_matrix)
print("\nTopics using Latent Dirichlet Allocation model with Term frequencies: \n")
print_top_words(lda, term_idf_feature_names, 10)



Wall time: 8.39 s

Topics using Latent Dirichlet Allocation model with Term frequencies: 

Topic #0:
life documentary year world one character tale humor often moment
Topic #1:
funny good ha look heart fun take de little moving
Topic #2:
comedy never give cast time worth american romantic performance great
Topic #3:
one like doe entertaining work best audience something drama people
Topic #4:
love way director story performance without thriller picture well much



#### Avoid noise in unigrams, TFIDF matrix on bigrams and trigrams

In [35]:
term_idf_vectorizer       = TfidfVectorizer(max_df=0.99, max_features=2000,min_df=0.0005, stop_words=stops, use_idf=True, tokenizer=tokenize, ngram_range=(2,3))
%time term_idf_matrix     = term_idf_vectorizer.fit_transform(imd.comments) 
term_idf_feature_names    = term_idf_vectorizer.get_feature_names()
term_idf_matrix.shape

Wall time: 3.83 s


(5331, 730)

#### Topic Modelling using LDA ( with bigrams and trigams)

In [36]:
lda = LatentDirichletAllocation(n_topics=5, max_iter=10,learning_method='online',learning_offset=10.,random_state=1)
%time lda.fit(term_idf_matrix)
print("\nTopics using Latent Dirichlet Allocation model with Term frequencies: \n")
print_top_words(lda, term_idf_feature_names, 10)



Wall time: 4.02 s

Topics using Latent Dirichlet Allocation model with Term frequencies: 

Topic #0:
subject matter feel like worth seeing best year young woman worth look tell story one man birthday girl hong kong
Topic #1:
character study make u piece work big screen sense humor look like never let hard resist blue crush often funny
Topic #2:
good time special effect de niro one thing give u motion picture work well ha something still manages ensemble cast
Topic #3:
romantic comedy one best new york lead performance austin power one year feature debut pleasant enough edge seat mostly martha
Topic #4:
love story ha created ha enough recent memory psychological drama break heart like mike red dragon guilty pleasure doe give



In [37]:
pyLDAvis.enable_notebook()

In [38]:
pyLDAvis.sklearn.prepare(lda, term_idf_matrix, term_idf_vectorizer)

#### Topic modeling using NMF 

In [55]:
# Fit the NMF model
%time nmf = NMF(n_components=5, random_state=1,alpha=.1, l1_ratio=.5).fit(term_idf_matrix)
#alpha = learning rate
#l1_ratio = 
print("\nFitting the Non-negative Matrix Factorization model with tf-idf features: \n")
print_top_words(nmf, term_idf_feature_names, 10)

CPU times: user 243 ms, sys: 288 ms, total: 531 ms
Wall time: 200 ms

Fitting the Non-negative Matrix Factorization model with tf-idf features: 

Topic #0:
romantic comedy sandra bullock hugh grant interesting character new york much fun start finish point view might well one greatest
Topic #1:
feel like still feel doe feel make feel one feel like one two hour spy kid tv series three hour
Topic #2:
play like like one like bad big screen young woman seem like whole thing point view never seen after-school special
Topic #3:
love story lan yu story one birthday girl edge seat one best like bad good intention good job good thing
Topic #4:
special effect action sequence jackie chan minority report hollywood ending queen damned time machine funny moment after-school special harry potter

