In [1]:
#movie review data (imdb data source) sentiment analysis
#based on Python Machine Learning Book code/Ch8

In [2]:
#import movie review data

import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df = df.rename(columns={"0": "review", "1": "sentiment"})

df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [3]:
#Some Sentiment Analysis Concepts / Tutorial

In [4]:
#bag of words concepts
#transform sentences into bag of words vocabulary and sparse vectors
#call fit_transform on the count vectorizer

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)

#this is the vocabulary learned from the data set
#the key is the word and the val is the indices in the array
#so 'is' is the 2nd col index 1
print(count.vocabulary_)

#this is what the bag of words sparse array looks like
#Those values in the feature vectors are also called the raw term frequencies: 
#tf (t,d)—the number of times a term t occurs in a document d.
#so'is' tf(t,d) = 3 in document 3
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [5]:
#then we apply the tfidf (term frequency-inverse document frequency)
#to the output from the CountVectorizer

#transformer TfidfTransformer takes the raw term frequencies
#from CountVectorizer as input and transforms them into tf-idfs:

#idf(t,d) = log( (1+n_d)/(1+df(d,t)) )
#for 'is' idf = log((1+3)/(1+3) ) = log(1) = 0
#where n_d total docs
#df(d,t) num of docs d cntain term t

#then the tf-idf(t,d) = tf(t,d) x (idf(t,d)+1)
#for 'is' tf-idf(t,d) = (3) x (0+1) = 3 (not normalized)
#L2-normalization, which returns a vector of 
#length 1 by dividing an un-normalized feature vector v by its L2-norm
#v_norm = v_notnormalized / (sum of squares of not normalized values)^0.5
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
#which is the same as applying it to 'bag' from above
print(tfidf.fit_transform(bag).toarray())

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]
[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


In [6]:
#Clean the Data from the Movie Reviews

In [7]:
#here is a sample review
df.loc[0, 'review']

'In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich famil

In [8]:
df.loc[1, 'review']

"OK... so... I really like Kris Kristofferson and his usual easy going delivery of lines in his movies. Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly. But, Disappearance is his misstep. Holy Moly, this was a bad movie! <br /><br />I must give kudos to the cinematography and and the actors, including Kris, for trying their darndest to make sense from this goofy, confusing story! None of it made sense and Kris probably didn't understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about! <br /><br />I don't care that everyone on this movie was doing out of love for the project, or some such nonsense... I've seen low budget movies that had a plot for goodness sake! This had none, zilcho, nada, zippo, empty of reason... a complete waste of good talent, scenery and celluloid! <br /><br />I rented this piece of garbage for a buck, and I want my money back! I want my 2 hou

In [9]:
#use regex to clean html and emoticons in data

#find the emoticons in the text 
#based on the syntax and append to the end of the text

import re
def preprocessor(text):
    #remove html by regex using '<[^>]*>' by replacing it with ''
    text = re.sub('<[^>]*>', '', text)
    #find the emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    #remove all non-word characters with regex [\W], 
    #convert to lower case and then join the emoticon at the end
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

In [10]:
#test the preprocessor to remove the html tags adn the nose of the emoticon
preprocessor("was a bad movie! <br /><br />I mus")

'was a bad movie i mus'

In [11]:
### Apply the preprocessor to the entire data set
df['review'] = df['review'].apply(preprocessor)


In [12]:
#Concept before applying the model - text to tokens

#converting text into 'tokens' is splitting the text into indiv. elements
#can be indiv words 1-gram, or 2 words 2-gram etc

#use porter stemmer from nltk
from nltk.stem.porter import PorterStemmer


porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [13]:
#check the tokenizer function
print(tokenizer('runners like running and thus they run'))
#check the tokenizer_porter function (just the stems)
print(tokenizer_porter('runners like running and thus they run'))

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']


In [14]:
#Concept before applying the model - text to tokens
#using stopwords to remove irrelevant small words

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#use the engllish stop words from nltk
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alisonmichan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
#see how the stopwords works
print('before stopwords: a runner likes running and runs a lot')
print('after stopwords: ', [w for w in tokenizer_porter\
       ('a runner likes running and runs a lot')\
       if w not in stop])

before stopwords: a runner likes running and runs a lot
after stopwords:  ['runner', 'like', 'run', 'run', 'lot']


In [16]:
#split the data train /test 50/50 of the data set

#df.shape is 50k by 2
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [17]:
#import packages for ML model

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [18]:
#let's apply niavely a set of params without optimization

#tfid vectorizer def
#tfidf = TfidfVectorizer(strip_accents=None,
#                        lowercase=None,
#                        preprocessor=None)

#let's set the following for tfidf
#vect_ngram is 1 (1-word elements in the vectorizer)
#stop_words - stop
#tokenizer - tokenizer porter stemmer


#define stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#use the engllish stop words from nltk
stop = stopwords.words('english')

#use porter stemmer from nltk and define tokenizer_porter
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

#stop defined above, tokenizer_porter defined above
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=None,
                        tokenizer=tokenizer_porter,
                        stop_words=stop)                       
                        

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alisonmichan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
X_train.shape

(25001,)

In [20]:
#fit/transform the tfidf on the train data and apply to test data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

  % sorted(inconsistent)


In [21]:
#classifier is logistic regression with liblinear
clf = LogisticRegression(solver='liblinear', penalty='l2', C=1.0)
clf.fit(X_train_tfidf,y_train)

#prediction and accuracy
y_pred = clf.predict(X_test_tfidf)
test_accuracy = clf.score(X_test_tfidf,y_test)
print('Test Accuracy: ', f'{100*test_accuracy:.3f}','%')

Test Accuracy:  89.264 %


In [24]:
#let's use grid search to find the best hyperparams based on accuracy as a metric

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=None,
                        preprocessor=None)

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear'))])

small_param_grid = [{'vect__ngram_range': [(1, 1)],
                     'vect__stop_words': [None],
                     'vect__tokenizer': [tokenizer, tokenizer_porter],
                     'clf__penalty': ['l2'],
                     'clf__C': [1.0, 10.0]},
                    {'vect__ngram_range': [(1, 1)],
                     'vect__stop_words': [stop, None],
                     'vect__tokenizer': [tokenizer],
                     'vect__use_idf':[False],
                     'vect__norm':[None],
                     'clf__penalty': ['l2'],
                  'clf__C': [1.0, 10.0]},
              ]
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)



gs_lr_tfidf.fit(X_train, y_train)
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7fd967043320>}
CV Accuracy: 0.897
Test Accuracy: 0.899
