In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
import glob
import requests
import tarfile
import tqdm
import os

## From: Python Machine Learning by Raschka and Mirjalili

---

## Get the data

In [2]:
df = pd.read_csv('../data/acl_imdb_data.csv',encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [3]:
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


---

## Transform Data

In [4]:
df.loc[1,'review'][:100]

'OK... so... I really like Kris Kristofferson and his usual easy going delivery of lines in his movie'

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

countvect = CountVectorizer()
X = countvect.fit_transform(df.loc[:,'review'].iloc[-3:])

In [6]:
countvect.vocabulary_

{'don': 40,
 'even': 44,
 'know': 79,
 'where': 157,
 'to': 143,
 'begin': 18,
 'on': 102,
 'this': 142,
 'one': 103,
 'it': 74,
 'all': 8,
 'about': 0,
 'the': 139,
 'family': 50,
 'that': 138,
 'has': 59,
 'be': 15,
 'worst': 162,
 'line': 86,
 'of': 101,
 'dialogue': 34,
 'ever': 45,
 'heard': 62,
 'in': 71,
 'horror': 67,
 'movie': 96,
 'although': 10,
 'couldn': 30,
 'if': 70,
 'tried': 146,
 'ugh': 148,
 'and': 12,
 'owen': 106,
 'wilson': 161,
 'is': 73,
 'better': 19,
 'actor': 4,
 'he': 61,
 'needs': 98,
 'stop': 131,
 'playing': 108,
 'token': 144,
 'guy': 56,
 'who': 158,
 'dies': 36,
 'every': 46,
 'action': 3,
 'anaconda': 11,
 'armageddon': 14,
 'after': 6,
 'man': 91,
 'did': 35,
 'co': 28,
 'write': 163,
 'bottle': 21,
 'rocket': 116,
 'rushmore': 118,
 'does': 38,
 'have': 60,
 'some': 125,
 'talent': 135,
 'also': 9,
 'lily': 85,
 'taylor': 136,
 'should': 123,
 'stick': 129,
 'indie': 72,
 'films': 51,
 'she': 122,
 'no': 99,
 'place': 107,
 'here': 64,
 'finally': 5

In [7]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>','',text)                             # remove html tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)  # find emoticons, see https://regex101.com/
    # remove non-word chars, lowercase, add back in emoticons
    text = (re.sub('[\W]+',' ',text.lower()) + ' '.join(emoticons).replace('-','')) 
    return text

In [8]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [9]:
preprocessor(df.loc[1,'review'][:100])

'ok so i really like kris kristofferson and his usual easy going delivery of lines in his movie'

In [10]:
#df['review'] = df.review.apply(preprocessor)

In [11]:
def tokenizer(text):
    return text.split() # split on whitespace

In [12]:
tokenizer(preprocessor(df.loc[1,'review'][:52]))

['ok', 'so', 'i', 'really', 'like', 'kris', 'kristofferson', 'and', 'his']

In [13]:
# need to run: conda install -n eods-f20 nltk
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [14]:
tokenizer_porter(preprocessor(df.loc[1,'review'][:52]))

['ok', 'so', 'i', 'realli', 'like', 'kri', 'kristofferson', 'and', 'hi']

In [15]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')

[w for w in tokenizer_porter(preprocessor(df.loc[1,'review'][:52])) if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bgibson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['ok', 'realli', 'like', 'kri', 'kristofferson', 'hi']

---

## Train Model

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train,X_test,y_train,y_test = train_test_split(df.review,df.sentiment)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

lr_tfidf = Pipeline([('vect',tfidf),
                     ('clf',LogisticRegression(random_state=0,solver='liblinear'))])

In [19]:
# NOTE!! This step takes a long time: ~1 hour on 8-core i7 @ 1.8 GHz

# try with two different parameter grids, one using tfidf, the other only tf
param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop,None],                 # try with and without stopword removal
               'vect__tokenizer': [tokenizer,tokenizer_porter], # try with and without stemming
               'clf__penalty': ['l1','l2'],
               'clf__C':[1.0,10.0,100.0],},
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop,None],
               'vect__tokenizer': [tokenizer,tokenizer_porter],
               'vect__use_idf': [False],                        # try using term frequencies without tf-idf
               'vect__norm': [None],                            # turn of norming only when using tf
               'clf__penalty':['l1','l2'],
               'clf__C': [1.0,10.0,100.0]
              }
             ]

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=3, verbose=2, n_jobs=-1)
gs_lr_tfidf.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 58.4min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0,
                                                           solver='liblinear'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
 

In [20]:
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
print(f'CV Accuracy: {gs_lr_tfidf.best_score_}:0.3f')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each'

In [22]:
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test):0.3f}')

Test Accuracy: 0.897


In [23]:
(y_test == 1).sum() / len(y_test)

0.50328

In [32]:
gs_lr_tfidf.predict(['this was great']),np.round(gs_lr_tfidf.predict_proba(['this was great']))

(array([1]), array([[0., 1.]]))

In [31]:
gs_lr_tfidf.predict(['this was bad']),np.round(gs_lr_tfidf.predict_proba(['this was bad']),2)

(array([0]), array([[1., 0.]]))

In [36]:
gs_lr_tfidf.predict(['this was ok in parts']),np.round(gs_lr_tfidf.predict_proba(['this was ok in parts']))

(array([1]), array([[0., 1.]]))