### 1. Problem Understanding
Here we look at a data science challenge in the IMDB space. For model fitting and model selection we use the F1-score as the evaluation metric.

### Import libraries & data 

In [39]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import SCORERS
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

%matplotlib inline

In [58]:
# Running scoreboard: one row per candidate model with its F1 on the test and train splits.
result_columns = ['Model', 'F1 test', 'F1 train']
model_search_result = pd.DataFrame(columns=result_columns)

In [20]:
# Load the IMDB reviews CSV (the path is relative to the notebook's working directory).
imdb_csv_path = 'data/IMDB_Dataset.csv'
data = pd.read_csv(imdb_csv_path)

### The Data
Our dataset has 50K movie reviews for natural language processing. This is a dataset for binary sentiment classification; it is split into training and test sets further below.
For more dataset information, please go through the following link,
https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [21]:
# Preview the first rows (head() shows five by default) to check what was loaded.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Check statistics

In [22]:
# Count the non-null entries in each column.
data.count()

review       50000
sentiment    50000
dtype: int64

In [23]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [24]:
# Summary statistics per column; for these object (text) columns pandas reports
# count, number of unique values, the most frequent value and its frequency.
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


#### Replace the categorical values in 'sentiment' with numeric labels

In [25]:
# Encode the sentiment label as an integer: 1 = positive, 0 = negative.
# `Series.map` is used instead of `Series.replace` because `replace` depends on
# implicit object -> int64 downcasting, which pandas has deprecated; without that
# downcast the labels would stay dtype=object, which downstream estimators and
# metrics may not accept as a binary target.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run on labels
# that are already encoded. Any other value maps to NaN, so the explicit int64
# cast fails loudly instead of letting an unknown label through.
sentiment_to_label = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
data['sentiment'] = data['sentiment'].map(sentiment_to_label).astype('int64')

In [26]:
# Preview the first rows again to verify the sentiment column now holds 1/0.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


##### Splitting the data into train and test sets

In [27]:
# Features are the raw review texts; the target is the sentiment column.
X, y = data['review'], data['sentiment']

# Hold out 30% of the reviews for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

#### Applying CountVectorizer to convert the collection of text documents into a matrix of token counts

In [28]:
# Instantiate the bag-of-words vectorizer with its default settings;
# the last line echoes the object so the cell output lists those settings.
count_vectorizer = CountVectorizer()
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [29]:
# Learn the vocabulary from the training reviews only, then encode those same
# reviews as a sparse matrix of token counts (fit() returns the vectorizer itself).
X_train_features = count_vectorizer.fit(X_train).transform(X_train)

# Encode the test reviews with the vocabulary learned on the training set (no refit).
X_test_features = count_vectorizer.transform(X_test)


In [32]:
#count_vectorizer.vocabulary_


### Classify
##### Run a Logistic regression


In [33]:
# First classifier: logistic regression with the default regularisation strength.
# fit() returns the fitted estimator, so construction and training are chained;
# the trailing expression echoes the model exactly as the original cell did.
logreg_model = LogisticRegression(max_iter=4000, random_state=11).fit(X_train_features, y_train)
logreg_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
# F1 of the first model on the data it was fitted on versus the held-out reviews.
logreg_train_f1 = f1_score(y_train, logreg_model.predict(X_train_features))
logreg_test_f1 = f1_score(y_test, logreg_model.predict(X_test_features))

print('F1 train set: ' + str(logreg_train_f1))
print('F1 test set: ' + str(logreg_test_f1))

F1 train set: 0.9974855706040346
F1 test set: 0.8910230758970255


In [59]:
# Add the first logistic regression to the results table.
# `DataFrame.append` was removed in pandas 2.0, so the new entry is built as a
# one-row frame (aligned to the table's existing column order) and concatenated.
logreg_row = pd.DataFrame(
    [{
        'Model': logreg_model,
        'F1 train': f1_score(y_train, logreg_model.predict(X_train_features)),
        'F1 test': f1_score(y_test, logreg_model.predict(X_test_features)),
    }],
    columns=model_search_result.columns,
)

# Concatenating onto a still-empty table is skipped: pandas warns about (and is
# changing) how empty frames influence the dtypes of a concat result.
if model_search_result.empty:
    model_search_result = logreg_row
else:
    model_search_result = pd.concat([model_search_result, logreg_row], ignore_index=True)

model_search_result

Unnamed: 0,Model,F1 test,F1 train
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.891023,0.997486


### Testing the review data with different regularization values

In [43]:
# Sweep the inverse regularisation strength C over several orders of magnitude
# and record the train/test F1 for each value.
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100]

# One record per C value. The table is built once at the end, which gives the
# score columns a proper float dtype (writing cell by cell with `iloc` into
# pre-created empty columns left them as dtype=object) and removes the need
# for a hand-maintained row counter.
score_rows = []
for c_value in C_param_range:

    # Fit an L2-penalised logistic regression on the training data
    lr = LogisticRegression(penalty='l2', C=c_value, random_state=11, max_iter=4000)
    lr.fit(X_train_features, y_train)

    # Record the F1 score on both splits
    score_rows.append({
        'C_parameter': c_value,
        'F1_train': f1_score(y_train, lr.predict(X_train_features)),
        'F1_test': f1_score(y_test, lr.predict(X_test_features)),
    })

# The variable name is kept for compatibility; despite the name, it holds
# F1 scores for the review classifier.
sepal_acc_table = pd.DataFrame(score_rows, columns=['C_parameter', 'F1_train', 'F1_test'])
sepal_acc_table


Unnamed: 0,C_parameter,F1_train,F1_test
0,0.001,0.872818,0.866398
1,0.01,0.9224,0.892467
2,0.1,0.970523,0.899435
3,1.0,0.997486,0.891023
4,10.0,1.0,0.884025
5,100.0,1.0,0.878889


##### Grid search cross validation


In [44]:
# Candidate values for C.
hyperparams_log_reg = {'C': [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.13, 0.15]}

# 5-fold cross-validated grid search scored by F1; train scores are kept as well
# so that per-fold train and validation results can be compared afterwards.
logreg_model_cross_valid = LogisticRegression(random_state=11, max_iter=4000)
logreg_cv = GridSearchCV(
    logreg_model_cross_valid,
    hyperparams_log_reg,
    scoring='f1',
    cv=5,
    return_train_score=True,
)
logreg_cv.fit(X_train_features, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=4000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=11, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.13, 0.15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='f1', verbose=0)

In [45]:
# Mean cross-validated F1 (scoring='f1') of the best parameter setting found.
logreg_cv.best_score_

0.8926448650461385

#### Results Cross Validation

In [46]:
# Show each candidate C side by side with its mean F1 across the CV folds.
# (The original cell began with a bare `logreg_cv.cv_results_` expression that
# had no effect because it was not the cell's last statement; it is now a name.)
cv_results = logreg_cv.cv_results_
pd.concat(
    [
        pd.DataFrame(cv_results["params"]),
        pd.DataFrame(cv_results["mean_test_score"], columns=["f1"]),
    ],
    axis=1,
)


Unnamed: 0,C,f1
0,0.01,0.886949
1,0.03,0.891142
2,0.05,0.892645
3,0.07,0.892374
4,0.09,0.891254
5,0.1,0.891071
6,0.13,0.89025
7,0.15,0.889565


In [47]:
# Keep the estimator that the grid search selected as best; echoing it shows
# the chosen hyper-parameters in the cell output.
tuned_baseline_model = logreg_cv.best_estimator_
tuned_baseline_model

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Result of Cross validation with 5 folds

In [50]:
# For every CV fold, record the best F1 reached on that fold's training part and
# on its held-out ("test") part, together with the grid point that produced it.
#
# Rows are collected first and the frame is built once. The original
# `result_cv[col][row] = value` chained assignment is not guaranteed to write
# through to the frame, and never does under pandas Copy-on-Write.
cv_results = logreg_cv.cv_results_

fold_rows = []
# n_splits_ is the number of folds the search actually used (instead of a hard-coded 5).
for splitN in range(logreg_cv.n_splits_):
    train_scores = cv_results['split' + str(splitN) + '_train_score']
    test_scores = cv_results['split' + str(splitN) + '_test_score']
    fold_rows.append({
        'train C': cv_results['params'][np.argmax(train_scores)],
        'F1-score(train)': train_scores.max(),
        'test C': cv_results['params'][np.argmax(test_scores)],
        'F1-score(test)': test_scores.max(),
    })

result_cv = pd.DataFrame(
    fold_rows,
    columns=['train C', 'F1-score(train)', 'test C', 'F1-score(test)'],
)

result_cv

Unnamed: 0,train C,F1-score(train),test C,F1-score(test)
0,{'C': 0.15},0.980983,{'C': 0.03},0.890214
1,{'C': 0.15},0.981065,{'C': 0.03},0.885633
2,{'C': 0.15},0.980146,{'C': 0.07},0.89562
3,{'C': 0.15},0.980543,{'C': 0.05},0.894424
4,{'C': 0.15},0.98088,{'C': 0.05},0.899196


In [51]:
# logreg_cv.cv_results_

### Choose the best model


In [52]:
# Final model: the same logistic regression set-up as before, with a hand-picked C.
# NOTE(review): C=0.046 is not one of the values in the grid searched earlier
# (the cross-validated optimum there was C=0.05). Presumably it comes from a
# finer manual search; confirm.
#
# Only the non-default arguments are passed. The original cell pasted the full
# estimator repr, which pins version-specific defaults (e.g. `multi_class`,
# `l1_ratio=None`) that newer scikit-learn releases deprecate, so the cell
# would start warning or failing after an upgrade.
baseline_model = LogisticRegression(C=0.046, random_state=11, max_iter=4000)

In [54]:
# Fit the chosen model on the vectorised training reviews.
baseline_model.fit(X_train_features, y_train)

LogisticRegression(C=0.046, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Report of metrics


In [55]:
# Per-class precision / recall / F1 of the chosen model on both splits,
# printed to four decimal places.
train_report = classification_report(y_train, baseline_model.predict(X_train_features), digits=4)
test_report = classification_report(y_test, baseline_model.predict(X_test_features), digits=4)

print('Metrics of training:\n' + train_report)
print('Metrics of testing:\n' + test_report)


Metrics of training:
              precision    recall  f1-score   support

           0     0.9581    0.9510    0.9546     17500
           1     0.9514    0.9585    0.9549     17500

    accuracy                         0.9547     35000
   macro avg     0.9548    0.9547    0.9547     35000
weighted avg     0.9548    0.9547    0.9547     35000

Metrics of testing:
              precision    recall  f1-score   support

           0     0.9023    0.8939    0.8981      7500
           1     0.8948    0.9032    0.8990      7500

    accuracy                         0.8985     15000
   macro avg     0.8986    0.8985    0.8985     15000
weighted avg     0.8986    0.8985    0.8985     15000



In [60]:
# Add the chosen model to the results table.
# `DataFrame.append` was removed in pandas 2.0, so the new entry is built as a
# one-row frame (aligned to the table's existing column order) and concatenated.
baseline_model_row = pd.DataFrame(
    [{
        'Model': baseline_model,
        'F1 train': f1_score(y_train, baseline_model.predict(X_train_features)),
        'F1 test': f1_score(y_test, baseline_model.predict(X_test_features)),
    }],
    columns=model_search_result.columns,
)

# Concatenating onto a still-empty table is skipped: pandas warns about (and is
# changing) how empty frames influence the dtypes of a concat result.
if model_search_result.empty:
    model_search_result = baseline_model_row
else:
    model_search_result = pd.concat([model_search_result, baseline_model_row], ignore_index=True)

model_search_result

Unnamed: 0,Model,F1 test,F1 train
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.891023,0.997486
1,"LogisticRegression(C=0.046, class_weight=None,...",0.899005,0.95491
