## Explore the best model

The best model was chosen in the previous notebook (model_selection.ipynb).

The **best** model is 
    
`SVM(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf', max_iter=-1, probability=False,    random_state=11, shrinking=True, tol=0.001, verbose=False)`
                      
`'f1_cv': 0.9162920983650459`
`'f1_test': 0.913498035559699`

In [15]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

from sklearn import svm
import datetime

import re

import pickle
import json
from datetime import date


In [2]:
import inspect  # no longer used in this cell; import kept so other cells relying on it still work
import os
import sys

# Make the project root (the parent of the notebook folder) importable so that
# the local `src` package can be found.
# The kernel's working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a notebook the frame's
# "file" is a synthetic cell name that, depending on the kernel version, may
# live in a temporary directory, which would make `parentdir` point to the
# wrong place.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
if parentdir not in sys.path:
    # Guard keeps the cell idempotent when it is re-run.
    sys.path.insert(0, parentdir)
from src import preprocessing

In [3]:
# Notebook-wide configuration constants.

# Reproducibility: seed passed to the train/test split, the SVC and sampling.
RANDOM_STATE = 11

# Data splitting: fraction of rows held out for the test set.
TEST_SIZE = 0.3

# Model-selection settings (folds, scoring metric, parallel workers).
TARGET_METRIC = 'f1'
NUMBER_K_FOLD = 5
N_JOBS = 4

In [4]:
# Load the reviews and encode the textual sentiment label as an integer
# (positive -> 1, negative -> 0), then preview the first rows.
sentiment_codes = {'positive' : 1, 'negative' : 0}
data = (
    pd.read_csv('../../data/IMDB_Dataset.csv')
    .pipe(lambda frame: frame.assign(sentiment=frame['sentiment'].replace(sentiment_codes)))
)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [5]:
# Drop rows that are exact duplicates across all columns (pandas keeps the
# first occurrence by default); the remaining rows keep their original index
# labels. The result is the frame used for the train/test split below.
data_for_train = data.drop_duplicates()
data_for_train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [6]:
# Features are the raw review texts; the target is the encoded sentiment.
X = data_for_train['review']
y = data_for_train['sentiment']

# Hold-out split: stratified on the target so both parts keep the same class
# balance, and seeded so the split is reproducible.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Text -> TF-IDF features over unigrams and bigrams; each raw review is first
# passed through the project's own preprocessing function.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=preprocessing.preprocessing_text,
    ngram_range=(1,2),
)
best_model_preprocessing = Pipeline(steps=[('vect', tfidf_vectorizer)])

# Fit on the training texts only (the test split is only ever transformed).
best_model_preprocessing.fit(X_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2',
        preprocessor=<function prepr...f=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None))])

In [8]:
# Vectorize the training texts with the already fitted preprocessing pipeline,
# then fit the RBF-kernel SVM with the hyper-parameters listed at the top of
# this notebook.
X_train_features = best_model_preprocessing.transform(X_train)

best_model = svm.SVC(
    C=10,
    gamma=0.1,
    kernel='rbf',
    random_state=RANDOM_STATE,
)
best_model.fit(X_train_features, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=11, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# F1 score of the fitted model on the hold-out test split.
# f1_score's signature is (y_true, y_pred); the arguments are passed by
# keyword so that the ground truth and the predictions cannot be swapped.
y_test_pred = best_model.predict(best_model_preprocessing.transform(X_test))
f1_score_test = f1_score(y_true=y_test, y_pred=y_test_pred)

In [12]:
# Display the F1 score obtained on the hold-out test split.
f1_score_test

0.913498035559699

### Serializing our best model

Let's save the trained SVM model, its preprocessing pipeline and the accompanying metadata to the destination folder: *`../service/model/svm/`*

In [11]:
# Destination folder for every artifact written below (metadata JSON, pickles,
# test samples). It is created up front so that the open(..., 'w') calls in
# the following cells do not fail when the folder does not exist yet.
folder_path = '../service/model/svm/'
os.makedirs(folder_path, exist_ok=True)

In [16]:
# Write a JSON file describing the serialized model next to the pickles.
# NOTE(review): the value stored under 'metrics_cross_validation' is the F1
# score computed on the hold-out test split (f1_score_test), not a
# cross-validation score. The key is left unchanged because readers of this
# JSON may depend on it - confirm before renaming.
metadata_file_name = folder_path + "svm_model.json"

fitted_vectorizer = best_model_preprocessing.steps[0][1]
test_f1_percent = "{:5.2f}%".format(100*f1_score_test)

metadata_to_model = {
    'vectorizer': str(fitted_vectorizer),
    'model_type': 'SVM with rbf kernel',
    'author': 'Tatsiana Drabysheuskaya',
    'data': str(date.today()),
    'trainig_data': 'https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews',
    'metrics_cross_validation': {'f1_score': test_f1_percent},
}

# json.dump streams the same text that json.dumps would return directly into
# the file; keys are sorted and indented for readable, stable output.
with open(metadata_file_name, 'w') as file:
    json.dump(
        metadata_to_model,
        file,
        default=lambda o: o.__dict__,
        sort_keys=True,
        indent=2,
    )

                       

In [17]:
# Persist the fitted classifier and the fitted preprocessing pipeline as two
# separate pickle files.
# NOTE: the pipeline references preprocessing.preprocessing_text; pickle
# stores functions by qualified name, so whatever loads the preprocessing
# pickle must be able to import the same module.
model_file_name = folder_path + "svm_final_model.pkl"
preproc_file_name = folder_path + "svm_final_model_preproc.pkl"

artifacts_to_save = (
    (model_file_name, best_model),
    (preproc_file_name, best_model_preprocessing),
)
for target_path, artifact in artifacts_to_save:
    with open(target_path, 'wb') as file:
        pickle.dump(artifact, file)

In [18]:
# Store a small, reproducible sample of the hold-out split (review text plus
# true label) alongside the model files.
test_samples_file_name = folder_path + "svm_modelTest_samples.csv"

test_df = pd.DataFrame({'review': X_test, 'sentiment': y_test})
sampled_rows = test_df.sample(n=8, random_state=RANDOM_STATE)
sampled_rows.to_csv(test_samples_file_name)