### 1. Problem Undestanding
Here we will look at a Data Science challenge within the IMDB space. For our model fitting choose the f1-score metric.

### Import libraries & data 

In [132]:
import matplotlib.pyplot as plt

# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import SCORERS
from sklearn.metrics import f1_score


In [150]:
#definition constants
RANDOM_STATE = 11
MAX_ITER=4000
NUMBER_K_FOLD = 5
TARGET_METRIC = 'f1'
TEST_SIZE = 0.3

In [151]:
#Results of exploring for model we will provided in table model_search_result
models_df = pd.DataFrame(columns=['Model',
                                  'Preprocessing',
                                  'Vectorizer',
                                  'f1 cv',
                                  'f1 test', ], )
models_df

Unnamed: 0,Model,Preprocessing,Vectorizer,f1 cv,f1 test


In [152]:
# import & display data
data = pd.read_csv('data/IMDB_Dataset.csv')

### The Data
Our training set has 50K movie reviews for natural language processing.  This is a dataser for binary sentiment classification. 
For more dataset information, please go through the following link,
https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [153]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Check statistics

In [154]:
data.count()

review       50000
sentiment    50000
dtype: int64

In [155]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [156]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


The Dataset contains invalid non-unique values. In the next step research we should drop all data repetitions

In [157]:
data[data.duplicated()].head()

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive
6352,If you liked the Grinch movie... go watch that...,negative
6479,I want very much to believe that the above quo...,negative


#### replace the categoric values from 'sentiment' to numeric

In [158]:
data['sentiment'] = data['sentiment'].replace({'positive' : 1, 'negative' : 0})

In [159]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


### 2. Splitting data for train and test sets

In [160]:
X = data['review']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify = y)

In [161]:
print('Size of train set: ' + str(X_train.size) +
     '\nShare of positive samples in train set: ' + str(y_train[y_train==1].count()/y_train.size))
print('Size of test set: ' + str(X_test.size) +
     '\nShare of positive samples in test set: ' + str(y_test[y_test==1].count()/y_test.size))

Size of train set: 35000
Share of positive samples in train set: 0.5
Size of test set: 15000
Share of positive samples in test set: 0.5


#### Applying CountVectorizer for convert a collection of text documents to a matrix of token counts

In [162]:
# Instantiate the vectorizer
count_vectorizer = CountVectorizer()
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [163]:
# fit and transform on it the training features
count_vectorizer.fit(X_train)
X_train_features = count_vectorizer.transform(X_train)

#transform the test features to sparse matrix
X_test_features = count_vectorizer.transform(X_test)


#### Resulting the vocabulary dictionary 

In [164]:
list(count_vectorizer.vocabulary_.keys())[:20]

['if',
 'you',
 'want',
 'to',
 'see',
 'true',
 'thriller',
 'rent',
 'this',
 'it',
 'not',
 'from',
 'the',
 'director',
 'or',
 'screenwriter',
 'of',
 'scream',
 'doesn',
 'feature']

### 3. Classify

### Grid search cross validation


In [165]:
def grid_search_cv(train: tuple, parameters: list, model_fn):
    kf = KFold(n_splits=NUMBER_K_FOLD)
    
    gs_metrics = {}
    for parameter in parameters:
        cv_metrics = []
        
        folds = kf.split(train[0])
        for train_index, val_index in folds:
            fold_X_train, fold_X_val = train[0][train_index], train[0][val_index]
            fold_y_train, fold_y_val = train[1][train_index], train[1][val_index]
            
            vectorizer.fit(fold_X_train)
            fold_X_train_features = vectorizer.transform(fold_X_train)
            
            fold_X_val_features = vectorizer.transform(fold_X_val)
            
            model = model_fn(C=parameter, random_state=RANDOM_STATE, max_iter=MAX_ITER)
            model.fit(fold_X_train_features, fold_y_train)

            metric = f1_score(model.predict(fold_X_val_features), fold_y_val)
            cv_metrics.append(metric)
            
            cv_score = np.mean(cv_metrics)
            
            gs_metrics[parameter] = cv_score
    return sorted(gs_metrics.items(), key=lambda x: x[1], reverse=True)[0]

In [166]:

vectorizer = CountVectorizer()
hyperparams_C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

X = data.review
y = data.sentiment

result_grid_search = grid_search_cv(train=(X, y), parameters=hyperparams_C, model_fn=LogisticRegression )
result_grid_search

(0.1, 0.8975475069042311)

In [167]:
best_c = result_grid_search[0]

#### Results Cross Validation

### Choose the model after CV


In [168]:
baseline_model=LogisticRegression(C=best_c, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [169]:
baseline_model.fit(X_train_features, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Evaluate the model for the testing data

In [170]:
f1_test = f1_score(y_test, baseline_model.predict(X_test_features))
f1_test

0.8994350282485876

In [171]:
models_df.loc[len(models_df)] = [baseline_model, None, vectorizer, result_grid_search[1], f1_test  ]


In [172]:
models_df

Unnamed: 0,Model,Preprocessing,Vectorizer,f1 cv,f1 test
0,"LogisticRegression(C=0.1, class_weight=None, d...",,"CountVectorizer(analyzer='word', binary=False,...",0.897548,0.899435
