In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [2]:
filepath_dict = {
    'yelp': './Dataset/yelp_labelled.txt',
    'imdb': './Dataset/imdb_labelled.txt',
    'amzn': './Dataset/amazon_cells_labelled.txt'
}

In [3]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

In [4]:
df = pd.concat(df_list)
del df_list

In [5]:
df.shape

(2748, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2748 entries, 0 to 999
Data columns (total 3 columns):
sentence    2748 non-null object
label       2748 non-null int64
source      2748 non-null object
dtypes: int64(1), object(2)
memory usage: 85.9+ KB


In [7]:
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
5,Now I am getting angry and I want my damn pho.,0,yelp
6,Honeslty it didn't taste THAT fresh.),0,yelp
7,The potatoes were like rubber and you could te...,0,yelp
8,The fries were great too.,1,yelp
9,A great touch.,1,yelp


## Data Preprocessing
We will perform the following steps:

> **Tokenization**: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.

> Words that have fewer than 3 characters are removed.

> All **stopwords** are removed.

> Words are **lemmatized** — words in third person are changed to first person and verbs in past and future tenses are changed into present.

> Words are **stemmed** — words are reduced to their root form.

#### Loading gensim and nltk

In [9]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/info/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
## function to perform lemmatize and stem preprocessing steps on the data set.

stemmer = PorterStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [58]:
### Test on sample

doc_sample = df['sentence'].sample().values[0]
print(doc_sample)
print(preprocess(doc_sample))

I great reception all the time.
['great', 'recept', 'time']


In [66]:
processed_docs = df['sentence'].map(preprocess)

In [67]:
processed_docs.head()

0                                        [love, place]
1                                        [crust, good]
2                               [tasti, textur, nasti]
3    [stop, late, bank, holiday, rick, steve, recom...
4                         [select, menu, great, price]
Name: sentence, dtype: object

## Bag of words on the dataset

In [68]:
dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

219 glad
124 waiter
1182 neglig
2870 raver
375 strip
473 overcook
1318 carb
1388 budget
129 moist
440 avoid
2647 mark


### Gensim filter_extremes

Filter out tokens that appear in

> less than 15 documents (absolute number) or

> more than 0.5 documents (fraction of total corpus size, not absolute number).

> after the above two steps, keep only the first 100000 most frequent tokens.


In [69]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Gensim doc2bow

In [70]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [76]:
## Preview Bag Of Words for our sample preprocessed document

bow_doc_1234 = bow_corpus[1234]
for i in range(len(bow_doc_1234)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1234[i][0], 
                                               dictionary[bow_doc_1234[i][0]], 
bow_doc_1234[i][1]))

Word 3 ("recommend") appears 1 time.
Word 13 ("time") appears 1 time.
Word 71 ("wast") appears 1 time.
Word 157 ("film") appears 1 time.


### TF-IDF

In [79]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [80]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.7436404725978057), (1, 0.6685797241275809)]


### Running LDA using Bag of words

In [81]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [82]:
## For each topic, we will explore the words occuring in that topic and its relative weight

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.091*"film" + 0.062*"phone" + 0.052*"best" + 0.040*"love" + 0.035*"time" + 0.025*"amaz" + 0.021*"wast" + 0.021*"movi" + 0.021*"hear" + 0.021*"qualiti"
Topic: 1 
Words: 0.068*"disappoint" + 0.066*"movi" + 0.059*"pretti" + 0.037*"film" + 0.036*"great" + 0.034*"servic" + 0.026*"time" + 0.024*"best" + 0.022*"total" + 0.021*"real"
Topic: 2 
Words: 0.118*"good" + 0.099*"place" + 0.045*"food" + 0.030*"time" + 0.030*"great" + 0.029*"better" + 0.025*"work" + 0.024*"get" + 0.022*"charger" + 0.022*"money"
Topic: 3 
Words: 0.053*"qualiti" + 0.053*"think" + 0.051*"sound" + 0.050*"know" + 0.043*"come" + 0.042*"wonder" + 0.035*"piec" + 0.024*"hold" + 0.021*"charg" + 0.021*"work"
Topic: 4 
Words: 0.057*"price" + 0.056*"good" + 0.047*"movi" + 0.034*"time" + 0.033*"long" + 0.032*"place" + 0.031*"recommend" + 0.028*"film" + 0.022*"suck" + 0.022*"right"
Topic: 5 
Words: 0.075*"go" + 0.063*"food" + 0.046*"good" + 0.038*"movi" + 0.036*"excel" + 0.035*"want" + 0.033*"order" + 0.032*"watch" 

### Running LDA using tf-idf

In [83]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [84]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.055*"place" + 0.051*"recommend" + 0.051*"disappoint" + 0.048*"film" + 0.041*"delici" + 0.031*"thing" + 0.028*"easi" + 0.026*"servic" + 0.025*"dish" + 0.023*"phone"
Topic: 1 Word: 0.056*"time" + 0.051*"product" + 0.038*"good" + 0.029*"piec" + 0.029*"charact" + 0.027*"item" + 0.027*"break" + 0.021*"film" + 0.020*"receiv" + 0.020*"pictur"
Topic: 2 Word: 0.051*"movi" + 0.044*"pretti" + 0.039*"better" + 0.031*"wait" + 0.028*"worth" + 0.027*"cool" + 0.027*"film" + 0.025*"good" + 0.024*"end" + 0.023*"interest"
Topic: 3 Word: 0.119*"phone" + 0.033*"case" + 0.031*"tri" + 0.031*"go" + 0.030*"highli" + 0.030*"great" + 0.030*"want" + 0.027*"work" + 0.025*"enjoy" + 0.024*"perform"
Topic: 4 Word: 0.064*"place" + 0.060*"food" + 0.046*"money" + 0.042*"order" + 0.041*"purchas" + 0.032*"problem" + 0.027*"wast" + 0.026*"total" + 0.025*"avoid" + 0.023*"funni"
Topic: 5 Word: 0.109*"good" + 0.046*"best" + 0.041*"great" + 0.037*"star" + 0.034*"work" + 0.034*"come" + 0.033*"year" + 0.029*"rig

### Performance Evaluation by classifying using LDA bag of words model

In [85]:
for index, score in sorted(lda_model[bow_corpus[1234]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4707123637199402	 
Topic: 0.091*"film" + 0.062*"phone" + 0.052*"best" + 0.040*"love" + 0.035*"time" + 0.025*"amaz" + 0.021*"wast" + 0.021*"movi" + 0.021*"hear" + 0.021*"qualiti"

Score: 0.3692642152309418	 
Topic: 0.057*"price" + 0.056*"good" + 0.047*"movi" + 0.034*"time" + 0.033*"long" + 0.032*"place" + 0.031*"recommend" + 0.028*"film" + 0.022*"suck" + 0.022*"right"

Score: 0.02000550925731659	 
Topic: 0.045*"good" + 0.042*"act" + 0.041*"thing" + 0.038*"problem" + 0.030*"movi" + 0.029*"recommend" + 0.025*"film" + 0.023*"littl" + 0.023*"think" + 0.022*"love"

Score: 0.020003627985715866	 
Topic: 0.104*"like" + 0.051*"film" + 0.037*"movi" + 0.036*"charact" + 0.026*"look" + 0.020*"nice" + 0.020*"perform" + 0.019*"feel" + 0.018*"time" + 0.018*"play"

Score: 0.02000354416668415	 
Topic: 0.075*"go" + 0.063*"food" + 0.046*"good" + 0.038*"movi" + 0.036*"excel" + 0.035*"want" + 0.033*"order" + 0.032*"watch" + 0.027*"time" + 0.025*"delici"

Score: 0.020003531128168106	 
Topic: 0.118*"

### Performance Evaluation by classifying using LDA TD-IDF model

In [86]:
for index, score in sorted(lda_model_tfidf[bow_corpus[1234]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4767487645149231	 
Topic: 0.055*"place" + 0.051*"recommend" + 0.051*"disappoint" + 0.048*"film" + 0.041*"delici" + 0.031*"thing" + 0.028*"easi" + 0.026*"servic" + 0.025*"dish" + 0.023*"phone"

Score: 0.3632095158100128	 
Topic: 0.064*"place" + 0.060*"food" + 0.046*"money" + 0.042*"order" + 0.041*"purchas" + 0.032*"problem" + 0.027*"wast" + 0.026*"total" + 0.025*"avoid" + 0.023*"funni"

Score: 0.02001214399933815	 
Topic: 0.056*"time" + 0.051*"product" + 0.038*"good" + 0.029*"piec" + 0.029*"charact" + 0.027*"item" + 0.027*"break" + 0.021*"film" + 0.020*"receiv" + 0.020*"pictur"

Score: 0.020008111372590065	 
Topic: 0.077*"love" + 0.040*"servic" + 0.037*"worst" + 0.034*"look" + 0.029*"terribl" + 0.029*"think" + 0.027*"time" + 0.027*"good" + 0.025*"return" + 0.022*"sound"

Score: 0.020006146281957626	 
Topic: 0.066*"like" + 0.060*"film" + 0.043*"nice" + 0.034*"get" + 0.034*"phone" + 0.031*"amaz" + 0.029*"batteri" + 0.024*"flavor" + 0.024*"price" + 0.024*"work"

Score: 0.02000513