In [1]:
# In this notebook:
# I practice determining movie sentiment with NLP
# AKA predict if a movie comment is positive / negative
# I clean and pre-process text for ML
# I conduct TFIDF and count vectorization
# I conduct logistic regression and Naive Bayes algorithms

In [2]:
import numpy as np
import pandas as pd

In [3]:
from nltk.corpus import stopwords
from textblob import Word

In [4]:
# Expand the displayed column width to 70 characters so long phrases stay
# readable in DataFrame previews.
pd.set_option('display.max_colwidth', 70)

In [5]:
# Load the tab-separated training file and preview the first five rows.
morig = pd.read_csv('train.tsv', delimiter='\t')

morig.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage that what is good fo...,1
1,2,1,A series of escapades demonstrating the adage that what is good fo...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
# Print column dtypes and non-null counts for the loaded frame.
morig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [7]:
# Number of distinct SentenceId values (dropna=False mirrors len(unique())).
morig['SentenceId'].nunique(dropna=False)

8529

In [8]:
# Most rows are sub-phrases (n-grams) of a sentence; keep only the first row
# for each SentenceId ahead of the later vectorization.
morig = morig.drop_duplicates(subset=['SentenceId'])

print(morig.shape)

morig.head()

(8529, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage that what is good fo...,1
63,64,2,"This quiet , introspective and entertaining independent is worth s...",4
81,82,3,"Even fans of Ismail Merchant 's work , I suspect , would have a ha...",1
116,117,4,A positively thrilling combination of ethnography and all the intr...,3
156,157,5,Aggressive self-glorification and a manipulative whitewash .,1


In [9]:
# Distinct values found in the Sentiment label column.
morig['Sentiment'].unique()

array([1, 4, 3, 2, 0], dtype=int64)

## Cleaning up phrase text

In [10]:
# Work on an independent (deep) copy so the cleaning steps leave morig as-is.
medit = morig.copy(deep=True)

medit.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage that what is good fo...,1
63,64,2,"This quiet , introspective and entertaining independent is worth s...",4
81,82,3,"Even fans of Ismail Merchant 's work , I suspect , would have a ha...",1
116,117,4,A positively thrilling combination of ethnography and all the intr...,3
156,157,5,Aggressive self-glorification and a manipulative whitewash .,1


In [11]:
# Need to lowercase, remove low occurrence words, stop words, and perform lemmatization

In [12]:
# upon review, 4 looks to be best (liked), 0 looks to be worst (disliked)

# Class balance: number of rows per sentiment label.
medit['Sentiment'].value_counts()

3    2321
1    2200
2    1655
4    1281
0    1072
Name: Sentiment, dtype: int64

In [13]:
# Lowercase every whitespace-separated token and re-join with single spaces
# (splitting and re-joining also collapses runs of whitespace).
medit['Clean'] = medit['Phrase'].apply(
    lambda phrase: " ".join(token.lower() for token in str(phrase).split())
)
medit['Clean'].head(10)

0      a series of escapades demonstrating the adage that what is good fo...
63     this quiet , introspective and entertaining independent is worth s...
81     even fans of ismail merchant 's work , i suspect , would have a ha...
116    a positively thrilling combination of ethnography and all the intr...
156             aggressive self-glorification and a manipulative whitewash .
166    a comedy-drama of nearly epic proportions rooted in a sincere perf...
198                     narratively , trouble every day is a plodding mess .
213    the importance of being earnest , so thick with wit it plays like ...
247                                    but it does n't leave you with much .
259                                  you could hate it for the same reason .
Name: Clean, dtype: object

In [14]:
# Replace every character that is neither a word character nor whitespace
# (i.e. punctuation) with a single space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave punctuation in place.
# The raw string avoids invalid-escape warnings for \w and \s.
medit['Clean'] = medit['Clean'].str.replace(r'[^\w\s]', ' ', regex=True)

medit['Clean'].head(10)

0      a series of escapades demonstrating the adage that what is good fo...
63     this quiet   introspective and entertaining independent is worth s...
81     even fans of ismail merchant  s work   i suspect   would have a ha...
116    a positively thrilling combination of ethnography and all the intr...
156             aggressive self glorification and a manipulative whitewash  
166    a comedy drama of nearly epic proportions rooted in a sincere perf...
198                     narratively   trouble every day is a plodding mess  
213    the importance of being earnest   so thick with wit it plays like ...
247                                    but it does n t leave you with much  
259                                  you could hate it for the same reason  
Name: Clean, dtype: object

In [15]:
# NLTK's English stopword list; `stop` stays a list for any other users of it.
stop = stopwords.words('english')
# Set copy for O(1) membership tests: the check below runs once per token,
# and scanning the list each time is linear in its length.
stop_lookup = set(stop)

# Drop stopwords token by token and re-join with single spaces.
# NOTE(review): punctuation is stripped before this step, so "n't" arrives
# as the separate tokens "n" and "t"; judging by the previews, "t" is
# removed here while "n" survives as a token - confirm that is intended.
medit['Clean'] = medit['Clean'].apply(
    lambda text: " ".join(tok for tok in str(text).split()
                          if tok not in stop_lookup)
)

medit['Clean'].head(10)

0      series escapades demonstrating adage good goose also good gander o...
63                quiet introspective entertaining independent worth seeking
81        even fans ismail merchant work suspect would hard time sitting one
116    positively thrilling combination ethnography intrigue betrayal dec...
156                     aggressive self glorification manipulative whitewash
166    comedy drama nearly epic proportions rooted sincere performance ti...
198                              narratively trouble every day plodding mess
213    importance earnest thick wit plays like reading bartlett familiar ...
247                                                             n leave much
259                                                        could hate reason
Name: Clean, dtype: object

In [16]:
# Lemmatize every token as a verb (pos='v') via TextBlob's Word wrapper,
# then re-join the tokens with single spaces.
medit['Clean'] = medit['Clean'].apply(
    lambda text: " ".join(Word(token).lemmatize(pos='v')
                          for token in text.split())
)

medit['Clean'].head(10)

0      series escapades demonstrate adage good goose also good gander occ...
63                      quiet introspective entertain independent worth seek
81             even fan ismail merchant work suspect would hard time sit one
116    positively thrill combination ethnography intrigue betrayal deceit...
156                     aggressive self glorification manipulative whitewash
166    comedy drama nearly epic proportion root sincere performance title...
198                                  narratively trouble every day plod mess
213    importance earnest thick wit play like read bartlett familiar quot...
247                                                             n leave much
259                                                        could hate reason
Name: Clean, dtype: object

In [17]:
# Flatten the cleaned corpus into one token list, then count occurrences
# of each token (most frequent first).
all_tokens = ' '.join(medit['Clean']).split()
freq_phrase = pd.Series(all_tokens).value_counts()

freq_phrase.head(20)

film         1294
movie        1008
n             687
make          609
one           576
like          527
rrb           352
lrb           351
story         350
character     348
time          330
good          294
work          289
even          265
much          264
comedy        264
feel          259
see           249
get           241
well          236
dtype: int64

In [18]:
print(f'Current phrase count is: {len(freq_phrase)}')

# Tokens that occur exactly once or twice across the cleaned corpus.
is_rare = freq_phrase.isin([1, 2])
rare_phrase = freq_phrase[is_rare]

print(f'Rare phrase count is: {len(rare_phrase)}')

Current phrase count is: 12486
Rare phrase count is: 7513


In [19]:
# `in` on a Series tests its index labels, which here are the rare tokens;
# materialise those labels as a set so the lookup is explicit.
rare_tokens = set(rare_phrase.index)

# Remove the rare tokens and re-join what is left with single spaces.
medit['Clean'] = medit['Clean'].apply(
    lambda text: " ".join(tok for tok in str(text).split()
                          if tok not in rare_tokens)
)

medit['Clean'].head(10)

0      series demonstrate adage good goose also good occasionally amuse n...
63                      quiet introspective entertain independent worth seek
81                    even fan merchant work suspect would hard time sit one
116    positively thrill combination intrigue betrayal murder shakespeare...
156                                             aggressive self manipulative
166    comedy drama nearly epic proportion root sincere performance title...
198                                  narratively trouble every day plod mess
213            importance earnest thick wit play like read bartlett familiar
247                                                             n leave much
259                                                        could hate reason
Name: Clean, dtype: object

## Conduct ML

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [21]:
# TF-IDF over single words (unigrams), capped at 1000 features, with
# scikit-learn's built-in English stop word list applied on top of the
# earlier cleaning.
# NOTE(review): the vectorizer is fitted on all of medit['Clean'] before any
# train/test split, so the vocabulary and IDF weights also see rows that
# later end up in the test set.
tfv = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
)

tfvec = tfv.fit_transform(medit['Clean'])

tfvec

<8529x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 40911 stored elements in Compressed Sparse Row format>

In [22]:
# brief look at the transformed vocab with feature indices

# First 20 (term, column index) pairs in the vocabulary's insertion order.
first20vocab = dict(list(tfv.vocabulary_.items())[:20])

first20vocab

{'series': 762,
 'demonstrate': 210,
 'good': 373,
 'occasionally': 588,
 'amuse': 32,
 'story': 832,
 'quiet': 679,
 'entertain': 268,
 'worth': 988,
 'seek': 754,
 'fan': 302,
 'work': 984,
 'hard': 394,
 'time': 893,
 'sit': 785,
 'thrill': 890,
 'intrigue': 459,
 'murder': 570,
 'tragedy': 900,
 'soap': 796}

In [23]:
# brief look at the transformed vocab with IDF values

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfv, 'get_feature_names_out'):
    feature_names = tfv.get_feature_names_out()
else:
    feature_names = tfv.get_feature_names()

# Pair the first 20 terms (in feature-index order) with their IDF weights
# directly, instead of rebuilding the full term -> IDF dict for every entry.
first20idf = dict(zip(feature_names[:20], tfv.idf_[:20]))

first20idf

{'10': 7.106905661319284,
 '20': 7.343294439383515,
 '2002': 7.16097288258956,
 '90': 6.793248102464243,
 'ability': 7.106905661319284,
 'able': 6.832468815617524,
 'absolutely': 7.16097288258956,
 'absorb': 7.006822202762302,
 'accomplish': 7.343294439383515,
 'achieve': 7.106905661319284,
 'act': 5.047398334540266,
 'action': 5.07461089806515,
 'actor': 6.524984115869564,
 'actors': 5.733856526949414,
 'actress': 7.055612366931734,
 'actually': 6.0259929497505755,
 'adaptation': 6.9158504245565755,
 'add': 6.495996578996311,
 'adults': 6.873290810137779,
 'adventure': 6.387782994356078}

In [24]:
# Raw term counts using CountVectorizer defaults (no max_features cap and no
# stop_words argument, unlike the TF-IDF vectorizer configured earlier).
cv = CountVectorizer()

cvec = cv.fit_transform(medit['Clean'])

cvec

<8529x4947 sparse matrix of type '<class 'numpy.int64'>'
	with 71005 stored elements in Compressed Sparse Row format>

In [25]:
# brief look at the transformed vocab for count vectorizer

# First 20 (term, column index) pairs in the vocabulary's insertion order.
first20cvocab = dict(list(cv.vocabulary_.items())[:20])

first20cvocab

{'series': 3864,
 'demonstrate': 1140,
 'adage': 83,
 'good': 1895,
 'goose': 1902,
 'also': 165,
 'occasionally': 3020,
 'amuse': 191,
 'none': 2973,
 'amount': 190,
 'much': 2865,
 'story': 4196,
 'quiet': 3461,
 'introspective': 2324,
 'entertain': 1458,
 'independent': 2221,
 'worth': 4904,
 'seek': 3833,
 'even': 1502,
 'fan': 1621}

In [26]:
# Hold out 25% of the TF-IDF rows for evaluation.
# random_state pins the shuffle so the split, and every accuracy computed
# from it, is reproducible across kernel restarts.
(tx_train, tx_test, ty_train, ty_test) = train_test_split(
    tfvec, medit['Sentiment'], test_size=0.25, random_state=42)

print(f'tx_train: {tx_train.shape}\nty_train: {ty_train.shape}')
print(f'tx_test: {tx_test.shape}\nty_test: {ty_test.shape}')

tx_train: (6396, 1000)
ty_train: (6396,)
tx_test: (2133, 1000)
ty_test: (2133,)


In [27]:
# Hold out 25% of the count-vector rows for evaluation.
# random_state pins the shuffle so the split, and every accuracy computed
# from it, is reproducible across kernel restarts.
(cx_train, cx_test, cy_train, cy_test) = train_test_split(
    cvec, medit['Sentiment'], test_size=0.25, random_state=42)

print(f'cx_train: {cx_train.shape}\ncy_train: {cy_train.shape}')
print(f'cx_test: {cx_test.shape}\ncy_test: {cy_test.shape}')

cx_train: (6396, 4947)
cy_train: (6396,)
cx_test: (2133, 4947)
cy_test: (2133,)


In [28]:
# Sweep the inverse regularisation strength C for logistic regression on the
# TF-IDF features and report accuracy on the held-out split for each value.
for c in (0.01, 0.05, 0.25, 0.5, 1):
    tlreg = LogisticRegression(C=c).fit(tx_train, ty_train)
    print(f'TFIDF Vectorization Accuracy: C={c} {accuracy_score(ty_test, tlreg.predict(tx_test))}')

TFIDF Vectorization Accuracy: C=0.01 0.2873886544772621
TFIDF Vectorization Accuracy: C=0.05 0.33567744960150026
TFIDF Vectorization Accuracy: C=0.25 0.34458509142053445
TFIDF Vectorization Accuracy: C=0.5 0.3534927332395687
TFIDF Vectorization Accuracy: C=1 0.35114861697140176


In [29]:
# Sweep the inverse regularisation strength C for logistic regression on the
# count features and report accuracy on the held-out split for each value.
for c in (0.01, 0.05, 0.25, 0.5, 1):
    clreg = LogisticRegression(C=c).fit(cx_train, cy_train)
    print(f'Count Vectorization Accuracy: C={c} {accuracy_score(cy_test, clreg.predict(cx_test))}')

Count Vectorization Accuracy: C=0.01 0.3488045007032349
Count Vectorization Accuracy: C=0.05 0.36146272855133615
Count Vectorization Accuracy: C=0.25 0.37787154242850446
Count Vectorization Accuracy: C=0.5 0.3722456633849039
Count Vectorization Accuracy: C=1 0.3628691983122363


In [30]:
# Naive Bayes model

# Multinomial Naive Bayes fitted on the TF-IDF training split; fit() returns
# the estimator itself, so both names refer to the same fitted model.
mnbmodel = mnb = MultinomialNB().fit(tx_train, ty_train)

print(f'Naive Bayes Accuracy: {accuracy_score(ty_test, mnbmodel.predict(tx_test))}')

Naive Bayes Accuracy: 0.36099390529770276


In [31]:
# Hand-written phrases used as a quick sanity check of the fitted models.
test_phrase = ['it was a pretty good movie', 'the flick is mediocre', 'Jake was terrible in the second act',
               'Life Itself is full of cliche, but fits a certain artsy crowd', 'will watch again!',
               'Entertaining movie, shallow story, good action, I would rate 4 out of 5', 'damn good movie!']

test_c_vec = cv.transform(test_phrase)
# test_c_vec yields type mismatch with ML algos; will reinvestigate
# NOTE(review): the models used for prediction in this notebook are fitted on
# the TF-IDF split (tx_train, 1000 features) while cv produces 4947 features,
# so the failure is presumably a feature-count mismatch rather than a dtype
# problem - confirm.

# NOTE(review): the phrases are vectorized raw; they do not pass through the
# punctuation / stopword / lemmatization steps applied to medit['Clean'].
test_t_vec = tfv.transform(test_phrase)

In [37]:
# Predict a sentiment label for each hand-written phrase with the fitted
# Naive Bayes model and print label and phrase side by side.
test_mnb_pred = mnbmodel.predict(test_t_vec)

for name, pred in zip(test_phrase, test_mnb_pred):
    print(f'Naive Bayes prediction: {pred}   Phrase: {name}')

Naive Bayes prediction: 3   Phrase: it was a pretty good movie
Naive Bayes prediction: 1   Phrase: the flick is mediocre
Naive Bayes prediction: 0   Phrase: Jake was terrible in the second act
Naive Bayes prediction: 2   Phrase: Life Itself is full of cliche, but fits a certain artsy crowd
Naive Bayes prediction: 3   Phrase: will watch again!
Naive Bayes prediction: 1   Phrase: Entertaining movie, shallow story, good action, I would rate 4 out of 5
Naive Bayes prediction: 3   Phrase: damn good movie!


In [38]:
# Refit logistic regression (C=1) on the TF-IDF training split; fit() returns
# the estimator itself, so both names refer to the same fitted model.
logmodel = logreg = LogisticRegression(C=1).fit(tx_train, ty_train)

# Predict a sentiment label for each hand-written phrase.
test_log_pred = logmodel.predict(test_t_vec)

for name, pred in zip(test_phrase, test_log_pred):
    print(f'Logistic regression prediction: {pred}   Phrase: {name}')

Logistic regression prediction: 3   Phrase: it was a pretty good movie
Logistic regression prediction: 1   Phrase: the flick is mediocre
Logistic regression prediction: 0   Phrase: Jake was terrible in the second act
Logistic regression prediction: 2   Phrase: Life Itself is full of cliche, but fits a certain artsy crowd
Logistic regression prediction: 0   Phrase: will watch again!
Logistic regression prediction: 1   Phrase: Entertaining movie, shallow story, good action, I would rate 4 out of 5
Logistic regression prediction: 3   Phrase: damn good movie!


In [34]:
# Closing thoughts for pt 1:
# we see pretty good predictions with TFIDF vectorization + ML
# however overall accuracy can be improved
# this will likely require better preprocessing, or ensemble algorithms