### 1. Problem Undestanding
Here we will look at a Data Science challenge within the IMDB space. For our model fitting choose the f1-score metric.

Results of exploring for model we will provided in table model_search_result

In [283]:
model_search_result = pd.DataFrame(columns=['Model','F1 test', 'F1 train'], )

### Import libraries & data 

In [228]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import SCORERS
from sklearn.metrics import f1_score

In [229]:
# import & display data
data = pd.read_csv('data/IMDB_Dataset.csv')

### The Data
Our training set has 50K movie reviews for natural language processing.  This is a dataser for binary sentiment classification. 
For more dataset information, please go through the following link,
https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [230]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Check statistics

In [231]:
data.count()

review       50000
sentiment    50000
dtype: int64

In [232]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [233]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


#### replace the categoric values from 'sentiment' to numeric

In [234]:
data['sentiment'] = data['sentiment'].replace({'positive' : 1, 'negative' : 0})

In [235]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


##### Splitting data for train and test sets

In [236]:
X = data['review']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42, stratify = y)

#### Applying CountVectorizer for convert a collection of text documents to a matrix of token counts

In [237]:
# Instantiate the vectorizer
count_vectorizer = CountVectorizer()
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [238]:
# fit and transform on it the training features
count_vectorizer.fit(X_train)
X_train_features = count_vectorizer.transform(X_train)

#transform the test features to sparse matrix
X_test_features = count_vectorizer.transform(X_test)


In [239]:
count_vectorizer.vocabulary_

{'if': 37842,
 'you': 86728,
 'want': 84104,
 'to': 78262,
 'see': 68347,
 'true': 79727,
 'thriller': 77741,
 'rent': 64046,
 'this': 77614,
 'it': 40310,
 'not': 53862,
 'from': 30393,
 'the': 77357,
 'director': 21664,
 'or': 55143,
 'screenwriter': 67998,
 'of': 54587,
 'scream': 67965,
 'doesn': 22601,
 'feature': 27992,
 'overacting': 55714,
 'overpaid': 55871,
 'tv': 80104,
 'actors': 1968,
 'passing': 56888,
 'off': 54591,
 'as': 5232,
 'stars': 73346,
 'and': 3844,
 'is': 40193,
 'run': 66244,
 'mill': 49934,
 'special': 72389,
 'effects': 24481,
 'bonanza': 9680,
 'instead': 39398,
 'll': 45620,
 'get': 31766,
 'top': 78545,
 'notch': 53871,
 'edgy': 24358,
 'very': 82974,
 'strong': 74270,
 'in': 38390,
 'violence': 83341,
 'yet': 86597,
 'thrilling': 77743,
 'nailbiter': 52371,
 'one': 54897,
 'first': 28778,
 'best': 8259,
 'columbos': 15800,
 'starring': 73342,
 'robert': 65419,
 'culp': 18656,
 'ray': 62750,
 'milland': 49937,
 'appeared': 4541,
 'on': 54880,
 'another':

### Classify
##### Run a Logistic regression


In [240]:
logreg_model = LogisticRegression(random_state=11, max_iter=10000)
logreg_model.fit(X_train_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [241]:
# Report of metrics
print('Metrics of training:\n'+       classification_report(y_train, logreg_model.predict(X_train_features), digits=4))
print('Metrics of testing:\n'+      classification_report(y_test, logreg_model.predict(X_test_features), digits=4))


Metrics of training:
              precision    recall  f1-score   support

           0     0.9974    0.9975    0.9975     17500
           1     0.9975    0.9974    0.9975     17500

    accuracy                         0.9975     35000
   macro avg     0.9975    0.9975    0.9975     35000
weighted avg     0.9975    0.9975    0.9975     35000

Metrics of testing:
              precision    recall  f1-score   support

           0     0.8908    0.8915    0.8911      7500
           1     0.8914    0.8907    0.8910      7500

    accuracy                         0.8911     15000
   macro avg     0.8911    0.8911    0.8911     15000
weighted avg     0.8911    0.8911    0.8911     15000



In [284]:
# add data to the result table
model_search_result = model_search_result.append({'Model' : logreg_model, 
                           'F1 test' : f1_score(y_train, logreg_model.predict(X_train_features)), 
                            'F1 train' : f1_score(y_test, logreg_model.predict(X_test_features)) 
                           }, ignore_index=True )

model_search_result


Unnamed: 0,Model,F1 test,F1 train
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.997486,0.891023


##### Grid search cross validation


In [246]:
hyperparams_log_reg = {
    'C': [0.001,0.01,0.1,1,10,100], 
}

logreg_model_cross_valid=LogisticRegression(random_state=11, max_iter=4000)
logreg_cv = GridSearchCV(logreg_model_cross_valid, hyperparams_log_reg, scoring ='f1', cv=5, return_train_score=True )
logreg_cv.fit(X_train_features, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=4000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=11, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='f1', verbose=0)

In [249]:
logreg_cv.best_score_

0.8910708045280765

#### Results Cross Validation

In [252]:

logreg_cv.cv_results_
pd.concat(
    [pd.DataFrame(logreg_cv.cv_results_["params"]),
     pd.DataFrame(logreg_cv.cv_results_["mean_test_score"],
                  columns=["f1"])]
    ,axis=1)


Unnamed: 0,C,f1
0,0.001,0.855083
1,0.01,0.886949
2,0.1,0.891071
3,1.0,0.882905
4,10.0,0.875659
5,100.0,0.871657


In [253]:
tuned_baseline_model = logreg_cv.best_estimator_
tuned_baseline_model

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

##### Second tuning hyperparam C

In [254]:
hyperparams_log_reg = {
    'C': [0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11], 
}

logreg_cv1 = GridSearchCV(logreg_model_cross_valid, hyperparams_log_reg, scoring ='f1', cv=5, return_train_score=True )
logreg_cv1.fit(X_train_features, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=4000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=11, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='f1', verbose=0)

In [267]:
tuned_baseline_model1 = logreg_cv1.best_estimator_
tuned_baseline_model1

LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [268]:
logreg_cv1.best_score_

0.8928416926159259

#####  Results second tuning

In [269]:

pd.concat(
    [pd.DataFrame(logreg_cv1.cv_results_["params"]),
     pd.DataFrame(logreg_cv1.cv_results_["mean_test_score"],
                  columns=["f1"])]
    ,axis=1)

Unnamed: 0,C,f1
0,0.04,0.892198
1,0.05,0.892645
2,0.06,0.892842
3,0.07,0.892374
4,0.08,0.891807
5,0.09,0.891254
6,0.1,0.891071
7,0.11,0.890678


### Result of Cross validation with 5 folds

In [270]:
result_cv = pd.DataFrame(columns = ['train C','F1-score(train)','test C', 'F1-score(test)'], index = range(0,5))
for splitN in range(5):
    result_cv['F1-score(train)'][splitN] = logreg_cv1.cv_results_['split'+str(splitN)+'_train_score'].max()
    result_cv['F1-score(test)'][splitN] = logreg_cv1.cv_results_['split'+str(splitN)+'_test_score'].max()
    result_cv['train C'][splitN] = logreg_cv1.cv_results_['params'][np.argmax(logreg_cv1.cv_results_['split'+str(splitN)+'_train_score'])]
    result_cv['test C'][splitN] = logreg_cv1.cv_results_['params'][np.argmax(logreg_cv1.cv_results_['split'+str(splitN)+'_test_score'])]


result_cv

Unnamed: 0,train C,F1-score(train),test C,F1-score(test)
0,{'C': 0.11},0.975653,{'C': 0.04},0.89031
1,{'C': 0.11},0.975154,{'C': 0.04},0.886357
2,{'C': 0.11},0.974642,{'C': 0.07},0.89562
3,{'C': 0.11},0.975021,{'C': 0.06},0.894863
4,{'C': 0.11},0.974876,{'C': 0.05},0.899196


In [271]:
logreg_cv1.cv_results_

{'mean_fit_time': array([ 8.24298935,  8.36240463,  9.32571292,  9.80029607, 10.35570083,
        11.20198812, 11.2698411 , 11.65905175]),
 'std_fit_time': array([0.93032574, 0.52249903, 0.83451141, 0.38445516, 0.89630058,
        0.38327872, 0.6599227 , 0.73432435]),
 'mean_score_time': array([0.00997415, 0.01056552, 0.01036944, 0.01017451, 0.01077175,
        0.01037259, 0.01117034, 0.01136932]),
 'std_score_time': array([0.00063151, 0.00185171, 0.00205605, 0.00116381, 0.00171568,
        0.00184995, 0.0009771 , 0.00223857]),
 'param_C': masked_array(data=[0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.04},
  {'C': 0.05},
  {'C': 0.06},
  {'C': 0.07},
  {'C': 0.08},
  {'C': 0.09},
  {'C': 0.1},
  {'C': 0.11}],
 'split0_test_score': array([0.89030975, 0.89008958, 0.89002703, 0.88901537, 0.88958452,
        0.88854093, 0.8883509 , 0.88781

### Choose the best model


In [277]:
baseline_model=LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [278]:
baseline_model.fit(X_train_features, y_train)

LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Report of metrics


In [281]:
# Report of metrics
print('Metrics of training:\n'+       classification_report(y_train, baseline_model.predict(X_train_features), digits=4))
print('Metrics of testing:\n'+      classification_report(y_test, baseline_model.predict(X_test_features), digits=4))


Metrics of training:
              precision    recall  f1-score   support

           0     0.9629    0.9569    0.9599     17500
           1     0.9571    0.9631    0.9601     17500

    accuracy                         0.9600     35000
   macro avg     0.9600    0.9600    0.9600     35000
weighted avg     0.9600    0.9600    0.9600     35000

Metrics of testing:
              precision    recall  f1-score   support

           0     0.9018    0.8941    0.8980      7500
           1     0.8950    0.9027    0.8988      7500

    accuracy                         0.8984     15000
   macro avg     0.8984    0.8984    0.8984     15000
weighted avg     0.8984    0.8984    0.8984     15000



In [285]:
# add data to the result table
model_search_result = model_search_result.append({'Model' : baseline_model, 
                           'F1 test' : f1_score(y_train, baseline_model.predict(X_train_features)), 
                            'F1 train' : f1_score(y_test, baseline_model.predict(X_test_features)) 
                           }, ignore_index=True )

model_search_result

Unnamed: 0,Model,F1 test,F1 train
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.997486,0.891023
1,"LogisticRegression(C=0.06, class_weight=None, ...",0.960125,0.898832
