# Import Necessary Packages and Libraries

In [1]:
# IMAGE PREPROCESSING FUNCTIONS FOR USE IN MODEL DEVELOPMENT, EVALUATION, AND PRODUCTION
import numpy as np
import pandas as pd
import PIL as pil
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join
import tempfile
import pickle
import time
import gc
import skimage.filters
import cv2
import watermark
import joblib
import math
import sys
from skimage.measure import block_reduce
from image_preprocessing import standardize_image_dataset,resize_dataset,binarize_dataset,crop_dataset,process_dataset_blur,do_pooling_dataset
from pipeline import model_pipeline
from automate_optimal_model_dev import automate_optimal_model_dev
from eval_on_test import make_preds

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB,GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score
from sklearn.base import clone
from sklearn.metrics import confusion_matrix

  from pandas import MultiIndex, Int64Index


# Load in Optimal Models

In [2]:
%%capture
knn = pickle.load(open('KNN_auto_resize_results/knn_test_12.pickle','rb'))
knn.pop('features')
gbct = pickle.load(open('Partition Based Model Results/gbct1.pkl','rb'))
gbct.pop('features')
log_reg = pickle.load(open('Linear Model Results/logistic_regression.pkl','rb'))
gaussian_nb = pickle.load(open('nb_model_results/top_gaussian_model.pickle','rb'))
gaussian_nb.pop('features')

# Load in Training Labels

In [3]:
labels = pickle.load(open('Amit/Labeled Data/train_data.pkl','rb'))
labels = labels['label']

# Create Training Features Using Out of Sample Predictions of Optimal Models During 5 Fold Cross Validation Process
- These OOS predictions give an accurate idea of how out models make predictions and generalize to OOS Data. Training a model on these features and the original labels will allow a final layer model to learn how to balance predictions from previous models in order to deliver potentially more accurate predictions

In [4]:
features_1 = pd.DataFrame()
features_1['knn'] = knn['oos_probs']
features_1['gbct'] = gbct['oos_probs']
features_1['logreg'] = log_reg['oos_probs']
features_1['gauss_nb'] = gaussian_nb['oos_probs']

features_2 = pd.DataFrame()
features_2['knn'] = knn['oos_preds']
features_2['gbct'] = gbct['oos_preds']
features_2['logreg'] = log_reg['threshold_analysis']['best_preds']
features_2['gauss_nb'] = gaussian_nb['oos_preds']

# ID Optimal Model, Training on OOS Predicted Class 1 Probability and Original Class Labels for Test 1 and Predicted Class and Original Class Labels for Test 2

#### LogReg Test 1

In [5]:
logreg_test1 = model_pipeline().evaluate(features_1,labels,[],LogisticRegression(random_state=50),{'C':[0.000001,0.00001,0.0001,0.001,0.01,1,10]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
logreg_test1

{'best_estimator': LogisticRegression(C=1, random_state=50),
 'best_params': {'C': 1},
 'best_score': 0.9792872904951487,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.993523
 2602    0.022693
 3433    0.002749
 235     0.993543
 1806    0.989284
           ...   
 3330    0.003250
 70      0.976987
 132     0.971825
 2014    0.993234
 1931    0.993041
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.47000000000000003,
  'best_score': 0.9801509644953873,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [6]:
pickle.dump(logreg_test1,open('Stacked Model Results/logreg.pkl','wb'))

#### LogReg Test 2

In [7]:
logreg_test2 = model_pipeline().evaluate(features_2,labels,[],LogisticRegression(random_state=50),{'C':[0.000001,0.00001,0.0001,0.001,0.01,1,10]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
logreg_test2

{'best_estimator': LogisticRegression(C=1, random_state=50),
 'best_params': {'C': 1},
 'best_score': 0.9801529646943108,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.989000
 2602    0.086270
 3433    0.002189
 235     0.989000
 1806    0.989000
           ...   
 3330    0.002865
 70      0.988367
 132     0.988367
 2014    0.988367
 1931    0.988367
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9801509644953873,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### Decision Tree Test 1

In [8]:
tree_test1 = model_pipeline().evaluate(features_1,labels,[],DecisionTreeClassifier(),
                                       {'criterion':['gini','entropy'],'min_samples_leaf':[2,3,4,5,6,7,8]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
tree_test1

{'best_estimator': DecisionTreeClassifier(min_samples_leaf=8),
 'best_params': {'criterion': 'gini', 'min_samples_leaf': 8},
 'best_score': 0.9772514393035105,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.0
 2602    0.0
 3433    0.0
 235     1.0
 1806    1.0
        ... 
 3330    0.0
 70      1.0
 132     1.0
 2014    1.0
 1931    1.0
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9753501400560225,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### Decision Tree Test 2

In [9]:
tree_test2 = model_pipeline().evaluate(features_2,labels,[],DecisionTreeClassifier(),
                                       {'criterion':['gini','entropy'],'min_samples_leaf':[2,3,4,5,6,7,8]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
tree_test2

{'best_estimator': DecisionTreeClassifier(min_samples_leaf=5),
 'best_params': {'criterion': 'gini', 'min_samples_leaf': 5},
 'best_score': 0.9801638577709826,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.993157
 2602    0.113636
 3433    0.000000
 235     0.993157
 1806    0.993157
           ...   
 3330    0.000000
 70      0.990558
 132     0.990558
 2014    0.990558
 1931    0.990558
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9790794979079498,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### Random Forest Test 1

In [10]:
rf_test1 = model_pipeline().evaluate(features_1,labels,[],RandomForestClassifier(random_state=50),
                                       {'min_samples_leaf':[3,4,5,6,7,8],'n_estimators':[500]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
rf_test1

{'best_estimator': RandomForestClassifier(min_samples_leaf=4, n_estimators=500, random_state=50),
 'best_params': {'min_samples_leaf': 4, 'n_estimators': 500},
 'best_score': 0.9781527923991415,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.953846
 2602    0.078430
 3433    0.000000
 235     0.994800
 1806    1.000000
           ...   
 3330    0.000000
 70      0.988112
 132     0.967592
 2014    1.000000
 1931    0.999600
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.52,
  'best_score': 0.9784011220196352,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

#### Random Forest Test 2

In [11]:
rf_test2 = model_pipeline().evaluate(features_2,labels,[],RandomForestClassifier(random_state=50),
                                       {'min_samples_leaf':[3,4,5,6,7,8],'n_estimators':[500]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
rf_test2

{'best_estimator': RandomForestClassifier(min_samples_leaf=3, n_estimators=500, random_state=50),
 'best_params': {'min_samples_leaf': 3, 'n_estimators': 500},
 'best_score': 0.9801529646943108,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.993286
 2602    0.115221
 3433    0.000113
 235     0.993286
 1806    0.993286
           ...   
 3330    0.000047
 70      0.990655
 132     0.990655
 2014    0.990655
 1931    0.990655
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9801509644953873,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### Gradient Boosted Classification Tree Test 1

In [12]:
gbct_test1 = model_pipeline().evaluate(features_1,labels,[],GradientBoostingClassifier(max_features='sqrt'),
                                       {'max_depth':[2,3,4,5,6,7,8],'n_estimators':[500]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
gbct_test1

{'best_estimator': GradientBoostingClassifier(max_depth=6, max_features='sqrt', n_estimators=500),
 'best_params': {'max_depth': 6, 'n_estimators': 500},
 'best_score': 0.977014846938841,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      9.999922e-01
 2602    2.633503e-06
 3433    5.586505e-09
 235     1.000000e+00
 1806    1.000000e+00
             ...     
 3330    8.958564e-09
 70      9.999999e-01
 132     1.000000e+00
 2014    1.000000e+00
 1931    9.999999e-01
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.65,
  'best_score': 0.9770050476724621,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

#### Gradient Boosted Classification Tree Test 2

In [13]:
gbct_test2 = model_pipeline().evaluate(features_2,labels,[],GradientBoostingClassifier(max_features='sqrt'),
                                       {'max_depth':[2,3,4,5,6,7,8],'n_estimators':[500]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
gbct_test2

{'best_estimator': GradientBoostingClassifier(max_features='sqrt', n_estimators=500),
 'best_params': {'max_depth': 3, 'n_estimators': 500},
 'best_score': 0.9801529646943108,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      9.931613e-01
 2602    1.137338e-01
 3433    2.905696e-08
 235     9.931613e-01
 1806    9.931613e-01
             ...     
 3330    5.116693e-08
 70      9.905608e-01
 132     9.905608e-01
 2014    9.905608e-01
 1931    9.905608e-01
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9801509644953873,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### KNN Test 1

In [14]:
knn_test1 = model_pipeline().evaluate(features_1,labels,[],KNeighborsClassifier(),
                                       {'p':[1,2,3],'n_neighbors':[1,3,5,7,9,11,13,15,17,19]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
knn_test1

{'best_estimator': KNeighborsClassifier(n_neighbors=19, p=1),
 'best_params': {'n_neighbors': 19, 'p': 1},
 'best_score': 0.9784950149674693,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.000000
 2602    0.052632
 3433    0.000000
 235     1.000000
 1806    1.000000
           ...   
 3330    0.000000
 70      1.000000
 132     1.000000
 2014    1.000000
 1931    1.000000
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.43,
  'best_score': 0.9787828029034059,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

#### KNN Test 2

In [15]:
knn_test2 = model_pipeline().evaluate(features_2,labels,[],KNeighborsClassifier(),
                                       {'p':[1,2,3],'n_neighbors':[1,3,5,7,9,11,13,15,17,19]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
knn_test2

{'best_estimator': KNeighborsClassifier(n_neighbors=11, p=1),
 'best_params': {'n_neighbors': 11, 'p': 1},
 'best_score': 0.9796286046784791,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.000000
 2602    0.181818
 3433    0.000000
 235     1.000000
 1806    1.000000
           ...   
 3330    0.000000
 70      1.000000
 132     1.000000
 2014    1.000000
 1931    1.000000
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9796260117220206,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### Categorical NB Test 1

In [16]:
cnb_test1 = model_pipeline().evaluate(features_1,labels,[],CategoricalNB(),
                                       {'alpha':[0.001,0.01,0.1,1,5,10,15]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
cnb_test1

{'best_estimator': CategoricalNB(alpha=0.001),
 'best_params': {'alpha': 0.001},
 'best_score': 0.9742452944581022,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.967329
 2602    0.026944
 3433    0.026944
 235     0.967329
 1806    0.967329
           ...   
 3330    0.028339
 70      0.968552
 132     0.968552
 2014    0.968552
 1931    0.968552
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.9742441209406496,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    1
  Name: label, Length: 3220, dtype: uint8}}

#### Categorical NB Test 2

In [17]:
cnb_test2 = model_pipeline().evaluate(features_2,labels,[],CategoricalNB(),
                                       {'alpha':[0.001,0.01,0.1,1,5,10,15]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
cnb_test2

{'best_estimator': CategoricalNB(alpha=0.001),
 'best_params': {'alpha': 0.001},
 'best_score': 0.976926778562832,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.999904
 2602    0.011354
 3433    0.000007
 235     0.999904
 1806    0.999904
           ...   
 3330    0.000009
 70      0.999904
 132     0.999904
 2014    0.999904
 1931    0.999904
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.76,
  'best_score': 0.9801509644953873,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

#### Gaussian NB Test 1

In [18]:
gnb_test1 = model_pipeline().evaluate(features_1,labels,[],GaussianNB(),
                                       {'var_smoothing':[0.00000000001,0.0000000001,0.000000001,0.00000001,0.0000001,0.000001,
                                                         0.00001,0.0001,0.001,0.01,0.1,1,5,10,15]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
gnb_test1

{'best_estimator': GaussianNB(var_smoothing=0.1),
 'best_params': {'var_smoothing': 0.1},
 'best_score': 0.9790378625041903,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.000000e+00
 2602    3.569945e-10
 3433    1.688292e-12
 235     1.000000e+00
 1806    1.000000e+00
             ...     
 3330    5.055411e-12
 70      9.999999e-01
 132     9.999997e-01
 2014    1.000000e+00
 1931    1.000000e+00
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.26,
  'best_score': 0.9796032411288069,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [19]:
pickle.dump(gnb_test1,open('Stacked Model Results/gnb.pkl','wb'))

#### Gaussian NB Test 2

In [20]:
gnb_test2 = model_pipeline().evaluate(features_2,labels,[],GaussianNB(),
                                       {'var_smoothing':[0.00000000001,0.0000000001,0.000000001,0.00000001,0.0000001,0.000001,
                                                         0.00001,0.0001,0.001,0.01,0.1,1,5,10,15]},
                                         'f1',5,return_transformed_features=False,return_grid=False)
gnb_test2

{'best_estimator': GaussianNB(var_smoothing=0.1),
 'best_params': {'var_smoothing': 0.1},
 'best_score': 0.9785421765424596,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.000000e+00
 2602    3.717808e-07
 3433    1.255049e-13
 235     1.000000e+00
 1806    1.000000e+00
             ...     
 3330    4.394592e-13
 70      1.000000e+00
 132     1.000000e+00
 2014    1.000000e+00
 1931    1.000000e+00
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.68,
  'best_score': 0.9798882681564246,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}