# Import Necessary Packages and Libraries

In [1]:
!pip install watermark

!pip install xgboost



In [2]:
# Imports: scientific/image libraries, project preprocessing and pipeline helpers, and candidate classifiers
import numpy as np
import pandas as pd
import PIL as pil
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join
import tempfile
import pickle
import time
import gc
import skimage.filters
import cv2
import watermark
import joblib
import math
import sys
from skimage.measure import block_reduce
from image_preprocessing import standardize_image_dataset,resize_dataset,binarize_dataset,crop_dataset,process_dataset_blur,do_pooling_dataset
from pipeline import model_pipeline
from automate_optimal_model_dev import automate_optimal_model_dev
from eval_on_test import make_preds

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB,GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score
from sklearn.base import clone
from sklearn.metrics import confusion_matrix

In [3]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' would also hide any solver/convergence warnings
# raised by the grid searches below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Read in Training Data

In [4]:
# Load the pickled training table; the last column is the label, all other columns are features.
# The context manager closes the file handle deterministically (the original never closed it).
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('Amit/Labeled Data/train_data.pkl', 'rb') as train_file:
    training_data = pickle.load(train_file)
train_X, train_y = training_data.iloc[:, :-1], training_data.iloc[:, -1]

In [5]:
%%time
# Reload the same pickled training table and derive a resized copy of the features.
# The context manager closes the file handle deterministically (the original never closed it).
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('Amit/Labeled Data/train_data.pkl', 'rb') as train_file:
    all_data = pickle.load(train_file)
# NOTE(review): this binds lowercase `train_x` (distinct from `train_X` in the previous cell)
# and rebinds `train_y` from the same file.
train_x, train_y = all_data.iloc[:, :-1], all_data.iloc[:, -1]
# presumably resizes each image from (256,256) to (16,16) -- confirm in image_preprocessing.resize_dataset
resized_16_16_train_x = resize_dataset(train_x, (256, 256), (16, 16))
# Drop the reference to the full table and run a garbage-collection pass.
del all_data
gc.collect()


CPU times: user 2min 37s, sys: 1.21 s, total: 2min 38s
Wall time: 4min 19s


29

### Logistic Regression Test Example

In [6]:
# Hyper-parameter grid shared by every logistic-regression run below:
# six values of C from 1e-05 to 1, with max_iter fixed at 100.
# NOTE(review): most recorded runs below select C=1, the upper edge of this grid --
# consider extending the range upwards.
param_grid = dict(
    C=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
    max_iter=[100],
)

In [7]:
%%time

# Run 1 of the L1 logistic-regression series:
# binarize [True, 0.3] -> crop -> 'g' blur with (5,5) -> pool (2,2) using np.max.
# How each step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_1 = model_pipeline()
test_model_ll_l1_1_result = test_model_ll_l1_1.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.3]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['g', (256, 256), (5, 5), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)
                        
                           

CPU times: user 52min 41s, sys: 33.3 s, total: 53min 15s
Wall time: 54min 2s


In [8]:
# Display the result dictionary returned by evaluate() for run 1.
test_model_ll_l1_1_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.8832124476630951,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.991443
 2602    0.000006
 3433    0.001428
 235     0.999577
 1806    0.999849
           ...   
 3330    0.000001
 70      0.996944
 132     0.996254
 2014    0.999999
 1931    0.997842
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.54,
  'best_score': 0.887923544743701,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [9]:
# Persist the run-1 results. The context manager flushes and closes the file even if
# pickling fails part-way (the original left the open handle to the garbage collector).
# NOTE(review): later cells save under a 'log_reg_' filename prefix; this path lacks it.
# Kept unchanged so anything already reading this file keeps working.
with open('linear_model_results/test_model_ll_l1_1_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_1_result, results_file)

In [10]:
%%time

# Run 2 of the L1 logistic-regression series:
# binarize [True, 0.35] -> crop -> 'g' blur with (5,5) -> pool (2,2) using np.max.
# How each step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_2 = model_pipeline()
test_model_ll_l1_2_result = test_model_ll_l1_2.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.35]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['g', (256, 256), (5, 5), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)
                        
                           

CPU times: user 1h 47s, sys: 35.4 s, total: 1h 1min 22s
Wall time: 1h 32s


In [11]:
# Display the result dictionary returned by evaluate() for run 2.
test_model_ll_l1_2_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.8826406893242613,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      9.896758e-01
 2602    1.607766e-05
 3433    1.791067e-03
 235     9.998592e-01
 1806    9.804495e-01
             ...     
 3330    3.666378e-09
 70      9.953596e-01
 132     9.943561e-01
 2014    9.999980e-01
 1931    9.983700e-01
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.54,
  'best_score': 0.8821665226159607,
  'best_preds': array([1, 0, 0,

In [12]:
# Persist the run-2 results. The context manager flushes and closes the file even if
# pickling fails part-way (the original left the open handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_2_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_2_result, results_file)

In [13]:
%%time

# Run 3 of the L1 logistic-regression series:
# binarize [True, 0.3] -> crop -> 'g' blur with (3,3) -> pool (2,2) using np.max.
# How each step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_3 = model_pipeline()
test_model_ll_l1_3_result = test_model_ll_l1_3.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.3]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['g', (256, 256), (3, 3), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)


CPU times: user 39min 48s, sys: 35.9 s, total: 40min 24s
Wall time: 39min 32s


In [14]:
# Persist the run-3 results. The context manager flushes and closes the file even if
# pickling fails part-way (the original left the open handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_3_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_3_result, results_file)

In [15]:
# Display the result dictionary returned by evaluate() for run 3.
test_model_ll_l1_3_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.884862204299646,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      9.863468e-01
 2602    2.079605e-05
 3433    2.002519e-03
 235     9.996002e-01
 1806    9.661654e-01
             ...     
 3330    9.868656e-08
 70      9.983477e-01
 132     9.071070e-01
 2014    1.000000e+00
 1931    9.989690e-01
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.55,
  'best_score': 0.8846265620459168,
  'best_preds': array([1, 0, 0, 

In [16]:
%%time

# Run 4 of the L1 logistic-regression series:
# binarize [True, 0.3] -> crop -> 'b' blur with (5,5) -> pool (2,2) using np.max.
# How each step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_4 = model_pipeline()
test_model_ll_l1_4_result = test_model_ll_l1_4.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.3]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['b', (256, 256), (5, 5), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 46min 32s, sys: 32.4 s, total: 47min 4s
Wall time: 46min 30s


In [17]:
# Persist the run-4 results. The context manager flushes and closes the file even if
# pickling fails part-way (the original left the open handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_4_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_4_result, results_file)

In [18]:
# Display the result dictionary returned by evaluate() for run 4.
test_model_ll_l1_4_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.8847431337185302,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.979225
 2602    0.000389
 3433    0.006117
 235     0.997536
 1806    0.995014
           ...   
 3330    0.000072
 70      0.992794
 132     0.994480
 2014    0.999965
 1931    0.997027
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8839561674627705,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
  

In [19]:
# Drop the reference to the run-4 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_4_result
gc.collect()

24

In [20]:
%%time

# Run 5 of the L1 logistic-regression series:
# binarize [True, 0.35] -> crop -> 'b' blur with (5,5) -> pool (2,2) using np.max.
# How each step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_5 = model_pipeline()
test_model_ll_l1_5_result = test_model_ll_l1_5.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.35]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['b', (256, 256), (5, 5), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 50min 12s, sys: 32.8 s, total: 50min 45s
Wall time: 50min 10s


In [21]:
# Persist the run-5 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_5_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_5_result, results_file)
test_model_ll_l1_5_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.8794214547905824,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.979794
 2602    0.000205
 3433    0.005411
 235     0.997933
 1806    0.997615
           ...   
 3330    0.000150
 70      0.993076
 132     0.996165
 2014    0.999983
 1931    0.997122
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8804042672655811,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
  

In [22]:
# Drop the reference to the run-5 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_5_result
gc.collect()

24

In [23]:
%%time

# Run 6 of the L1 logistic-regression series:
# binarize [True, 0.3] -> crop -> 'b' blur with (3,3) -> pool (2,2) using np.max.
# How each step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_6 = model_pipeline()
test_model_ll_l1_6_result = test_model_ll_l1_6.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.3]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['b', (256, 256), (3, 3), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 45min 39s, sys: 32.3 s, total: 46min 11s
Wall time: 45min 36s


In [24]:
# Persist the run-6 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_6_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_6_result, results_file)
test_model_ll_l1_6_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.880297923649114,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    0
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.947958
 2602    0.001494
 3433    0.006968
 235     0.999247
 1806    0.134095
           ...   
 3330    0.000001
 70      0.999276
 132     0.997166
 2014    1.000000
 1931    0.999057
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.51,
  'best_score': 0.8861971830985916,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [25]:
# Drop the reference to the run-6 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_6_result
gc.collect()

24

In [26]:
%%time

# Run 7 of the L1 logistic-regression series: a single preprocessing step,
# pool (2,2) using np.max.
# How the step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_7 = model_pipeline()
test_model_ll_l1_7_result = test_model_ll_l1_7.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 3min 37s, sys: 31.5 s, total: 4min 8s
Wall time: 3min 8s


In [27]:
# Persist the run-7 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_7_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_7_result, results_file)
test_model_ll_l1_7_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.9363235525265884,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.999997
 2602    0.000644
 3433    0.000667
 235     0.999945
 1806    0.999976
           ...   
 3330    0.000070
 70      1.000000
 132     0.999997
 2014    0.999575
 1931    0.999535
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.18000000000000002,
  'best_score': 0.9409470752089136,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [28]:
# Drop the reference to the run-7 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_7_result
gc.collect()

24

In [29]:
%%time

# Run 8 of the L1 logistic-regression series: a single preprocessing step,
# binarize [True, 0.3].
# How the step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_8 = model_pipeline()
test_model_ll_l1_8_result = test_model_ll_l1_8.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.3]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 9min 30s, sys: 1min 38s, total: 11min 8s
Wall time: 10min 29s


In [30]:
# Persist the run-8 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_8_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_8_result, results_file)
test_model_ll_l1_8_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.9262905158294231,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.989089
 2602    0.056376
 3433    0.103213
 235     0.971055
 1806    0.795412
           ...   
 3330    0.012682
 70      0.999993
 132     0.995695
 2014    0.598141
 1931    0.992058
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.36000000000000004,
  'best_score': 0.9309675630718048,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [31]:
# Drop the reference to the run-8 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_8_result
gc.collect()

24

In [32]:
%%time

# Run 9 of the L1 logistic-regression series: a single preprocessing step,
# crop [(256,256), (256,256)].
# How the step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_9 = model_pipeline()
test_model_ll_l1_9_result = test_model_ll_l1_9.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('crop', [(256, 256), (256, 256)]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 16min 2s, sys: 2min 33s, total: 18min 35s
Wall time: 17min 59s


In [33]:
# Persist the run-9 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_9_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_9_result, results_file)
test_model_ll_l1_9_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 0.1, 'max_iter': 100},
 'best_score': 0.9086007076004121,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      0.998949
 2602    0.002387
 3433    0.000116
 235     0.998492
 1806    0.984465
           ...   
 3330    0.000804
 70      1.000000
 132     0.989196
 2014    0.782617
 1931    0.997159
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.13,
  'best_score': 0.9093859886394374,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [34]:
# Drop the reference to the run-9 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_9_result
gc.collect()

24

In [35]:
%%time

# Run 10 of the L1 logistic-regression series: a single preprocessing step,
# 'b' blur with (3,3).
# How the step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_10 = model_pipeline()
test_model_ll_l1_10_result = test_model_ll_l1_10.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('blur', ['b', (256, 256), (3, 3), 0, 0]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 13min 43s, sys: 2min 8s, total: 15min 52s
Wall time: 17min 30s


In [36]:
# Persist the run-10 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_10_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_10_result, results_file)
test_model_ll_l1_10_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.93578960049757,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.000000
 2602    0.000679
 3433    0.001013
 235     0.999997
 1806    0.998485
           ...   
 3330    0.000027
 70      1.000000
 132     0.999995
 2014    0.998435
 1931    1.000000
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.22,
  'best_score': 0.938969764837626,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [37]:
# Drop the reference to the run-10 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_10_result
gc.collect()

24

In [38]:
%%time

# Run 11 of the L1 logistic-regression series: a single preprocessing step,
# 'g' blur with (3,3).
# How the step's argument list is interpreted is defined by model_pipeline (not shown here).
test_model_ll_l1_11 = model_pipeline()
test_model_ll_l1_11_result = test_model_ll_l1_11.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('blur', ['g', (256, 256), (3, 3), 0, 0]),
    ],
    model=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Request every optional result except the transformed feature matrix.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)

CPU times: user 12min 16s, sys: 2min, total: 14min 16s
Wall time: 13min 34s


In [39]:
# Persist the run-11 results, then display them. The context manager flushes and closes the
# file even if pickling fails part-way (the original left the handle to the garbage collector).
with open('linear_model_results/log_reg_test_model_ll_l1_11_result.pkl', 'wb') as results_file:
    pickle.dump(test_model_ll_l1_11_result, results_file)
test_model_ll_l1_11_result

{'grid_search': GridSearchCV(cv=5,
              estimator=LogisticRegression(penalty='l1', solver='liblinear'),
              param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                          'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=1, penalty='l1', solver='liblinear'),
 'best_params': {'C': 1, 'max_iter': 100},
 'best_score': 0.9340583420486072,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      1.000000
 2602    0.000621
 3433    0.000936
 235     0.999996
 1806    0.999345
           ...   
 3330    0.000026
 70      1.000000
 132     0.999994
 2014    0.997928
 1931    1.000000
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.12,
  'best_score': 0.9387868670005566,
  'best_preds': array([1, 0, 0, ..., 1, 1, 1])}}

In [40]:
# Drop the reference to the run-11 results and force a garbage-collection pass;
# the cell output is gc.collect()'s return value.
del test_model_ll_l1_11_result
gc.collect()

24