### Import libraries & data 

In [13]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

from sklearn.model_selection import ParameterSampler

import re

import pickle



In [3]:
#definition constants
RANDOM_STATE = 11
MAX_ITER=4000
NUMBER_K_FOLD = 5
TARGET_METRIC = 'f1'
TEST_SIZE = 0.3
N_JOBS = 4

In [4]:
# set parameters for display dataframe
pd.set_option('precision', 4)
pd.set_option('max_colwidth', 300)

In [58]:
# import & display data
data = pd.read_csv('data/IMDB_Dataset.csv')
data['sentiment'] = data['sentiment'].replace({'positive' : 1, 'negative' : 0})
data

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO....",1
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only ""has got all the pola...",1
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may...",1
3,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going t...",0
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a...",1
...,...,...
49995,"I thought this movie did a down right good job. It wasn't as creative or original as the first, but who was expecting it to be. It was a whole lotta fun. the more i think about it the more i like it, and when it comes out on DVD I'm going to pay the money for it very proudly, every last cent. Sh...",1
49996,"Bad plot, bad dialogue, bad acting, idiotic directing, the annoying porn groove soundtrack that ran continually over the overacted script, and a crappy copy of the VHS cannot be redeemed by consuming liquor. Trust me, because I stuck this turkey out to the end. It was so pathetically bad all ove...",0
49997,"I am a Catholic taught in parochial elementary schools by nuns, taught by Jesuit priests in high school & college. I am still a practicing Catholic but would not be considered a ""good Catholic"" in the church's eyes because I don't believe certain things or act certain ways just because the churc...",0
49998,"I'm going to have to disagree with the previous comment and side with Maltin on this one. This is a second rate, excessively vicious Western that creaks and groans trying to put across its central theme of the Wild West being tamed and kicked aside by the steady march of time. It would like to b...",0


In [6]:
# function for preprocessing

def clean_html(data_: tuple):
    data_new = data_[0].copy()
    data_new = data_new.apply(lambda x: re.sub(r'<br.*?>', ' ', x))
    return (data_new, data_[1])

def remove_duplicates(data_: tuple):
    data_new = pd.DataFrame(columns=['X', 'y'])
    data_new.X, data_new.y = data_ 
    data_new = data_new.drop_duplicates()
    return (data_new.X, data_new.y)

def split_digit_letters(data_: tuple):
    data_new = data_[0].copy()
    data_new = data_new.apply(lambda x: re.sub(r'(\d+)', r' \1 ', x))
    data_new = data_new.apply(lambda x: re.sub(r'_+', r' ', x))
    return (data_new, data_[1])


def preprocessing(data_: tuple):
    return clean_html(remove_duplicates(split_digit_letters(data_)))

#### Prepocessing  data for training

In [7]:
data_for_train = preprocessing((data.review, data.sentiment))

In [8]:
X = data_for_train[0]
y = data_for_train[1]
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE, 
                                                    stratify = y)

In [9]:
def model_fit_and_evaluate(pipeline, params_grid):
    grid_cv = GridSearchCV(pipeline, cv=NUMBER_K_FOLD, n_jobs=N_JOBS, param_grid=params_grid, scoring=TARGET_METRIC, 
                            return_train_score=False, verbose=5)
    grid_cv.fit(X_train, y_train)
    model = grid_cv.best_estimator_
    model.fit(X_train, y_train)
    f1_cv = grid_cv.best_score_
    f1_test = f1_score(model.predict(X_test), y_test)
    result = {'grid_cv': grid_cv,
             'model': model,
             'f1_cv': f1_cv,
             'f1_test': f1_test}
    return result

##### Let's evaluate the perfomance different types of models:
    - Support Vector Machines
    - Naive Bayes
    - Decision Trees

### 1. Logistic regression 
Logistic regression is baseline model for this research. In the previous stage has been selected follow model:

In [42]:
pipeline_lr = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])


params_lr ={
    'vect__ngram_range': [(1,2)], 
    'clf__C':[10000], 
    'clf__max_iter': [MAX_ITER],
    'clf__random_state': [RANDOM_STATE],
           }

lr_result = model_fit_and_evaluate(pipeline_lr, params_lr)

lr_result

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  5.6min remaining:  8.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.2min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

### 2. Naive Bayes

In [49]:
pipeline_nb = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])


params_nb ={
    'vect__ngram_range': [(1,2)], 
    'clf__alpha': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07, 1e-08, 1e-09, 1.0e-10], 
           }

nb_result = model_fit_and_evaluate(pipeline_nb, params_nb)
nb_result

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  5.7min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

### 3. Decision Tree

In [50]:
pipeline_dt = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier()),
])


params_dt ={
    'vect__ngram_range': [(1,2)], 
    'clf__criterion':['gini', 'entropy'], 
    'clf__min_samples_split':[2, 5, 10, 20, 30, 40], 
    'clf__min_samples_leaf': [9,10,11,12],
    'clf__max_depth': [9,10,13,15,20,40,60,100],
    'clf__class_weight':['balanced', None],
    'clf__random_state': [RANDOM_STATE],
           }

dt_result = model_fit_and_evaluate(pipeline_dt, params_dt)
dt_result


Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 107.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 290.4min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 473.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 744.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 954.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 1276.3min
[Parallel(n_jobs=-1)]: Done 3840 out of 3840 | elapsed: 1599.1min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

### 4. SVM

In [53]:
pipeline_svm = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', svm.SVC()),
])


params_svm_lin ={
    'vect__ngram_range': [(1,2)],  
    'clf__kernel': ['linear'], 
    'clf__C': [0.0001,0.001,0.01,0.1,1,10,100, 1000],
    'clf__random_state': [RANDOM_STATE],
           }

svm_result_lin = model_fit_and_evaluate(pipeline_svm, params_svm_lin)
svm_result_lin


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 383.8min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

In [12]:
pipeline_svm = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', svm.SVC()),
])


params_svm_poly ={
    'vect__ngram_range': [(1,2)],  
    'clf__kernel': ['poly'], 
    'clf__degree': [2,3,4,5,6],
    'clf__random_state': [RANDOM_STATE],
           }

svm_result_poly = model_fit_and_evaluate(pipeline_svm, params_svm_poly)
svm_result_poly


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 147.9min
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed: 319.0min remaining: 179.4min
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed: 471.8min remaining: 64.3min
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 573.4min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

In [9]:
pipeline_svm = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', svm.SVC()),
])


params_svm_rbf ={
    'vect__ngram_range': [(1,2)],  
    'clf__kernel': ['rbf'], 
    'clf__C': [1, 10, 100, 1000],
    'clf__gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__random_state': [RANDOM_STATE],
           }

svm_result_rbf = model_fit_and_evaluate(pipeline_svm, params_svm_rbf)
svm_result_rbf

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 49.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 581.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 1776.6min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 2192.2min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

## Choose the best model

The best model is 
SVM(C=10, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3,
    gamma=0.1, kernel='rbf', max_iter=-1, probability=False,
    random_state=11, shrinking=True, tol=0.001,
    verbose=False)
                      
'f1_cv': 0.9162920983650459,
'f1_test': 0.913498035559699

In [10]:
pipeline_best = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', svm.SVC()),
])


params_best ={
    'vect__ngram_range': [(1,2)],  
    'clf__kernel': ['rbf'], 
    'clf__C': [10],
    'clf__gamma': [0.1],
    'clf__random_state': [RANDOM_STATE],
           }

result_best = model_fit_and_evaluate(pipeline_best, params_best)
result_best

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed: 90.6min remaining: 135.9min
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed: 168.4min finished


{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 

In [17]:
result_best

{'grid_cv': GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('vect',
                                         TfidfVectorizer(analyzer='word',
                                                         binary=False,
                                                         decode_error='strict',
                                                         dtype=<class 'numpy.float64'>,
                                                         encoding='utf-8',
                                                         input='content',
                                                         lowercase=True,
                                                         max_df=1.0,
                                                         max_features=None,
                                                         min_df=1,
                                                         ngram_range=(1, 1),
                                 