# Import Necessary Packages and Libraries

In [1]:
!pip install watermark

!pip install xgboost



In [2]:
# Imports: general-purpose libraries, project preprocessing/pipeline helpers, and scikit-learn / XGBoost estimators
import numpy as np
import pandas as pd
import PIL as pil
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join
import tempfile
import pickle
import time
import gc
import skimage.filters
import cv2
import watermark
import joblib
import math
import sys
from skimage.measure import block_reduce
from image_preprocessing import standardize_image_dataset,resize_dataset,binarize_dataset,crop_dataset,process_dataset_blur,do_pooling_dataset
from pipeline import model_pipeline
from automate_optimal_model_dev import automate_optimal_model_dev
from eval_on_test import make_preds

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB,GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score
from sklearn.base import clone
from sklearn.metrics import confusion_matrix

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Read in Training Data

In [4]:
# Load the labelled training frame from disk. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('Amit/Labeled Data/train_data.pkl', 'rb') as train_file:
    training_data = pickle.load(train_file)

# Every column except the last is a feature; the last column is the label.
train_X, train_y = training_data.iloc[:, :-1], training_data.iloc[:, -1]

In [5]:
%%time
# Second read of the same pickle that the previous cell loaded into
# `training_data`; `train_x` (lower-case) mirrors `train_X` and `train_y` is rebound.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('Amit/Labeled Data/train_data.pkl', 'rb') as train_file:
    all_data = pickle.load(train_file)

# Every column except the last is a feature; the last column is the label.
train_x, train_y = all_data.iloc[:, :-1], all_data.iloc[:, -1]

# presumably (256,256) is the source image size and (16,16) the target size —
# TODO confirm against image_preprocessing.resize_dataset.
resized_16_16_train_x = resize_dataset(train_x, (256, 256), (16, 16))

# Drop the name for the full frame and run a collection pass; the cell output
# is gc.collect()'s return value.
del all_data
gc.collect()


CPU times: user 2min 37s, sys: 1.11 s, total: 2min 38s
Wall time: 4min 23s


29

### Logistic Regression Test Example

In [6]:
# Hyper-parameter grid shared by every logistic-regression run below:
# six values of C from 1e-5 up to 1, with max_iter held at 100.
param_grid = dict(
    C=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    max_iter=[100],
)

In [7]:
%%time

# Run 1: binarize (0.3) + crop + 'g' blur (5x5) + 2x2 max pooling, then a
# 5-split grid search over `param_grid` optimising F1.
# Each step is a (name, args) pair; the meaning of the args is defined by the
# project's pipeline module.
preprocessing_steps = [
    ('binarize', [True, 0.3]),
    ('crop', [(256, 256), (256, 256)]),
    ('blur', ['g', (256, 256), (5, 5), 0, 0]),
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_1 = model_pipeline()
test_model_ll_l2_1_result = test_model_ll_l2_1.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)
                        
                           

CPU times: user 5h 50min 49s, sys: 5h 2min 48s, total: 10h 53min 38s
Wall time: 1h 1min 41s


In [8]:
# Display the run-1 result dict (grid search, best estimator/params/score,
# out-of-sample predictions and probabilities, threshold analysis).
test_model_ll_l2_1_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.0001, solver='liblinear'),
 'best_params': {'C': 0.0001, 'max_iter': 100},
 'best_score': 0.8935082094906768,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.565620
 2602    0.002732
 3433    0.013336
 235     0.989061
 1806    0.950505
           ...   
 3330    0.001834
 70      0.589621
 132     0.977175
 2014    0.999782
 1931    0.997194
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8959251101321585,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      

In [9]:
# Persist the run-1 result dict. The context manager guarantees the file is
# flushed and closed even if pickling raises part-way through.
# NOTE(review): this file name lacks the 'log_reg_' prefix used by the dumps
# of the later runs in this notebook; path kept unchanged for existing readers.
with open('linear_model_results/test_model_ll_l2_1_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_1_result, result_file)

In [10]:
%%time

# Run 2: same as run 1 but with 0.35 in the binarize step.
# NOTE(review): the saved output of this run matches run 1 (binarize 0.3)
# digit for digit, so the 0.3 -> 0.35 change appears to have had no effect —
# verify the binarize step in the pipeline.
preprocessing_steps = [
    ('binarize', [True, 0.35]),
    ('crop', [(256, 256), (256, 256)]),
    ('blur', ['g', (256, 256), (5, 5), 0, 0]),
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_2 = model_pipeline()
test_model_ll_l2_2_result = test_model_ll_l2_2.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)
                        
                           

CPU times: user 5h 50min 19s, sys: 5h 1min 46s, total: 10h 52min 5s
Wall time: 58min 13s


In [11]:
# Display the run-2 result dict (grid search, best estimator/params/score,
# out-of-sample predictions and probabilities, threshold analysis).
test_model_ll_l2_2_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.0001, solver='liblinear'),
 'best_params': {'C': 0.0001, 'max_iter': 100},
 'best_score': 0.8935082094906768,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.565620
 2602    0.002732
 3433    0.013336
 235     0.989061
 1806    0.950505
           ...   
 3330    0.001834
 70      0.589621
 132     0.977175
 2014    0.999782
 1931    0.997194
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8959251101321585,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      

In [12]:
# Persist the run-2 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_2_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_2_result, result_file)

In [13]:
%%time

# Run 3: binarize (0.3) + crop + 'g' blur with a (3,3) kernel argument +
# 2x2 max pooling, then a 5-split grid search over `param_grid` optimising F1.
preprocessing_steps = [
    ('binarize', [True, 0.3]),
    ('crop', [(256, 256), (256, 256)]),
    ('blur', ['g', (256, 256), (3, 3), 0, 0]),
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_3 = model_pipeline()
test_model_ll_l2_3_result = test_model_ll_l2_3.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)


CPU times: user 5h 39min 22s, sys: 4h 53min 22s, total: 10h 32min 45s
Wall time: 54min 59s


In [14]:
# Persist the run-3 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_3_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_3_result, result_file)

In [15]:
# Display the run-3 result dict (grid search, best estimator/params/score,
# out-of-sample predictions and probabilities, threshold analysis).
test_model_ll_l2_3_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.0001, solver='liblinear'),
 'best_params': {'C': 0.0001, 'max_iter': 100},
 'best_score': 0.8977095072598795,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.572475
 2602    0.002628
 3433    0.011303
 235     0.991357
 1806    0.927139
           ...   
 3330    0.001347
 70      0.596257
 132     0.968054
 2014    0.999716
 1931    0.997140
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8990876416920099,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      

In [16]:
%%time

# Run 4: binarize (0.3) + crop + 'b' blur (5x5) + 2x2 max pooling, then a
# 5-split grid search over `param_grid` optimising F1.
preprocessing_steps = [
    ('binarize', [True, 0.3]),
    ('crop', [(256, 256), (256, 256)]),
    ('blur', ['b', (256, 256), (5, 5), 0, 0]),
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_4 = model_pipeline()
test_model_ll_l2_4_result = test_model_ll_l2_4.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 5h 55min 33s, sys: 5h 8min 19s, total: 11h 3min 52s
Wall time: 59min 38s


In [17]:
# Persist the run-4 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_4_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_4_result, result_file)

In [18]:
# Display the run-4 result dict (grid search, best estimator/params/score,
# out-of-sample predictions and probabilities, threshold analysis).
test_model_ll_l2_4_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.0001, solver='liblinear'),
 'best_params': {'C': 0.0001, 'max_iter': 100},
 'best_score': 0.883061015102316,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.553606
 2602    0.003800
 3433    0.019615
 235     0.985199
 1806    0.986881
           ...   
 3330    0.003126
 70      0.576528
 132     0.974321
 2014    0.998683
 1931    0.996366
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8852369213913998,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1

In [19]:
# Drop the name bound to the run-4 result dict (it was pickled to disk earlier
# in the notebook) and run a garbage-collection pass; the cell output is
# gc.collect()'s return value.
del test_model_ll_l2_4_result
gc.collect()

24

In [20]:
%%time

# Run 5: same as run 4 but with 0.35 in the binarize step.
# NOTE(review): the saved output of this run matches run 4 (binarize 0.3)
# digit for digit, so the 0.3 -> 0.35 change appears to have had no effect —
# verify the binarize step in the pipeline.
preprocessing_steps = [
    ('binarize', [True, 0.35]),
    ('crop', [(256, 256), (256, 256)]),
    ('blur', ['b', (256, 256), (5, 5), 0, 0]),
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_5 = model_pipeline()
test_model_ll_l2_5_result = test_model_ll_l2_5.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 5h 54min 17s, sys: 5h 7min 19s, total: 11h 1min 37s
Wall time: 58min 29s


In [21]:
# Persist the run-5 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_5_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_5_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_5_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.0001, solver='liblinear'),
 'best_params': {'C': 0.0001, 'max_iter': 100},
 'best_score': 0.883061015102316,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.553606
 2602    0.003800
 3433    0.019615
 235     0.985199
 1806    0.986881
           ...   
 3330    0.003126
 70      0.576528
 132     0.974321
 2014    0.998683
 1931    0.996366
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8852369213913998,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1

In [22]:
# Drop the name bound to the run-5 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_5_result
gc.collect()

24

In [23]:
%%time

# Run 6: binarize (0.3) + crop + 'b' blur with a (3,3) kernel argument +
# 2x2 max pooling, then a 5-split grid search over `param_grid` optimising F1.
preprocessing_steps = [
    ('binarize', [True, 0.3]),
    ('crop', [(256, 256), (256, 256)]),
    ('blur', ['b', (256, 256), (3, 3), 0, 0]),
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_6 = model_pipeline()
test_model_ll_l2_6_result = test_model_ll_l2_6.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 5h 28min 19s, sys: 4h 46min 32s, total: 10h 14min 52s
Wall time: 57min 20s


In [24]:
# Persist the run-6 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_6_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_6_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_6_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.0001, solver='liblinear'),
 'best_params': {'C': 0.0001, 'max_iter': 100},
 'best_score': 0.8906506267021607,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.572228
 2602    0.001978
 3433    0.011742
 235     0.990854
 1806    0.955766
           ...   
 3330    0.001423
 70      0.598650
 132     0.977953
 2014    0.999677
 1931    0.997673
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8932413793103449,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      

In [25]:
# Drop the name bound to the run-6 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_6_result
gc.collect()

24

In [26]:
%%time

# Run 7 (single-step ablation): 2x2 max pooling only, then a 5-split grid
# search over `param_grid` optimising F1.
preprocessing_steps = [
    ('pool', [(2, 2), np.max]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_7 = model_pipeline()
test_model_ll_l2_7_result = test_model_ll_l2_7.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 1h 17min 2s, sys: 1h 5min 8s, total: 2h 22min 11s
Wall time: 14min 18s


In [27]:
# Persist the run-7 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_7_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_7_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_7_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1e-05, solver='liblinear'),
 'best_params': {'C': 1e-05, 'max_iter': 100},
 'best_score': 0.9429641824012419,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.998566
 2602    0.026213
 3433    0.021751
 235     0.985738
 1806    0.988523
           ...   
 3330    0.005200
 70      0.999990
 132     0.999468
 2014    0.950810
 1931    0.991492
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.24000000000000002,
  'best_score': 0.9441680486456606,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [28]:
# Drop the name bound to the run-7 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_7_result
gc.collect()

24

In [29]:
%%time

# Run 8 (single-step ablation): binarize (0.3) only, then a 5-split grid
# search over `param_grid` optimising F1.
preprocessing_steps = [
    ('binarize', [True, 0.3]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_8 = model_pipeline()
test_model_ll_l2_8_result = test_model_ll_l2_8.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 56min 45s, sys: 40min 7s, total: 1h 36min 52s
Wall time: 19min 11s


In [30]:
# Persist the run-8 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_8_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_8_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_8_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.935460894129869,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.999998
 2602    0.003764
 3433    0.007116
 235     0.997606
 1806    0.998967
           ...   
 3330    0.000394
 70      1.000000
 132     0.999823
 2014    0.986030
 1931    0.999576
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.2,
  'best_score': 0.9445211150979851,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [31]:
# Drop the name bound to the run-8 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_8_result
gc.collect()

24

In [32]:
%%time

# Run 9 (single-step ablation): crop only, then a 5-split grid search over
# `param_grid` optimising F1.
preprocessing_steps = [
    ('crop', [(256, 256), (256, 256)]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_9 = model_pipeline()
test_model_ll_l2_9_result = test_model_ll_l2_9.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 3h 44min 55s, sys: 2h 15min 22s, total: 6h 18s
Wall time: 1h 35min 40s


In [33]:
# Persist the run-9 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_9_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_9_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_9_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1e-05, solver='liblinear'),
 'best_params': {'C': 1e-05, 'max_iter': 100},
 'best_score': 0.9123700333836805,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.997536
 2602    0.008165
 3433    0.001067
 235     0.995494
 1806    0.920955
           ...   
 3330    0.003498
 70      1.000000
 132     0.991579
 2014    0.862646
 1931    0.998249
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.51,
  'best_score': 0.9155807365439095,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [34]:
# Drop the name bound to the run-9 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_9_result
gc.collect()

24

In [35]:
%%time

# Run 10 (single-step ablation): 'b' blur with a (3,3) kernel argument only,
# then a 5-split grid search over `param_grid` optimising F1.
preprocessing_steps = [
    ('blur', ['b', (256, 256), (3, 3), 0, 0]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_10 = model_pipeline()
test_model_ll_l2_10_result = test_model_ll_l2_10.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 2h 16s, sys: 53min 5s, total: 2h 53min 22s
Wall time: 1h 10min 55s


In [36]:
# Persist the run-10 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_10_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_10_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_10_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1e-05, solver='liblinear'),
 'best_params': {'C': 1e-05, 'max_iter': 100},
 'best_score': 0.9363131612019986,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.999783
 2602    0.011256
 3433    0.012193
 235     0.996656
 1806    0.955667
           ...   
 3330    0.001088
 70      1.000000
 132     0.999927
 2014    0.968932
 1931    0.999903
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.16,
  'best_score': 0.9415333701047987,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [37]:
# Drop the name bound to the run-10 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_10_result
gc.collect()

24

In [38]:
%%time

# Run 11 (single-step ablation): 'g' blur with a (3,3) kernel argument only,
# then a 5-split grid search over `param_grid` optimising F1.
preprocessing_steps = [
    ('blur', ['g', (256, 256), (3, 3), 0, 0]),
]

# Ask for every output except the transformed feature matrix.
requested_outputs = dict(
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

test_model_ll_l2_11 = model_pipeline()
test_model_ll_l2_11_result = test_model_ll_l2_11.evaluate(
    train_X,
    train_y,
    preprocessing=preprocessing_steps,
    model=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    **requested_outputs,
)

CPU times: user 2h 1min 18s, sys: 1h 2min 6s, total: 3h 3min 25s
Wall time: 1h 2min 48s


In [39]:
# Persist the run-11 result dict; the context manager flushes and closes the
# file even if pickling raises part-way through.
with open('linear_model_results/log_reg_test_model_ll_l2_11_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_ll_l2_11_result, result_file)

# Display the result dict as the cell output.
test_model_ll_l2_11_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1e-05, solver='liblinear'),
 'best_params': {'C': 1e-05, 'max_iter': 100},
 'best_score': 0.9375051502195811,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.999748
 2602    0.010300
 3433    0.011400
 235     0.996747
 1806    0.966711
           ...   
 3330    0.001020
 70      1.000000
 132     0.999925
 2014    0.969586
 1931    0.999877
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.18000000000000002,
  'best_score': 0.9414688017669794,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [40]:
# Drop the name bound to the run-11 result dict (already pickled in the previous
# cell) and run a garbage-collection pass; the cell output is gc.collect()'s
# return value.
del test_model_ll_l2_11_result
gc.collect()

24