# Import Necessary Packages and Libraries

In [8]:
!pip install watermark

!pip install xgboost



In [1]:
# IMPORTS: IMAGE PREPROCESSING HELPERS, PIPELINE UTILITIES, AND ESTIMATORS FOR USE IN MODEL DEVELOPMENT, EVALUATION, AND PRODUCTION
import numpy as np
import pandas as pd
import PIL as pil
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join
import tempfile
import pickle
import time
import gc
import skimage.filters
import cv2
import watermark
import joblib
import math
import sys
from skimage.measure import block_reduce
from image_preprocessing import standardize_image_dataset,resize_dataset,binarize_dataset,crop_dataset,process_dataset_blur,do_pooling_dataset
from pipeline import model_pipeline
from automate_optimal_model_dev import automate_optimal_model_dev
from eval_on_test import make_preds

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB,GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score
from sklearn.base import clone
from sklearn.metrics import confusion_matrix

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Read in Training Data

In [3]:
# Load the pickled training frame; the context manager guarantees the file
# handle is closed once the object has been read.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('Amit/Labeled Data/train_data.pkl', 'rb') as train_file:
    training_data = pickle.load(train_file)

# Every column except the last is a feature; the last column holds the target.
train_X, train_y = training_data.iloc[:, :-1], training_data.iloc[:, -1]

In [4]:
%%time
# Re-read the training pickle under a context manager so the file handle is
# closed as soon as the frame is loaded.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('Amit/Labeled Data/train_data.pkl', 'rb') as train_file:
    all_data = pickle.load(train_file)

# Features are all columns but the last; train_y is rebound here from the same file.
train_x, train_y = all_data.iloc[:, :-1], all_data.iloc[:, -1]

# NOTE(review): argument semantics live in image_preprocessing.resize_dataset (not shown);
# presumably (256,256) is the source size and (16,16) the target size — confirm.
resized_16_16_train_x = resize_dataset(train_x, (256, 256), (16, 16))

# Drop the reference to the full frame and trigger a collection pass.
del all_data
gc.collect()


CPU times: user 3min 7s, sys: 1.25 s, total: 3min 8s
Wall time: 3min 14s


29

### Logistic Regression Test Example

In [5]:
# Hyper-parameter grid reused by the logistic-regression test runs below:
# three candidate values for C and a single max_iter setting.
param_grid = dict(
    C=[1e-05, 1e-04, 1e-03],
    max_iter=[100],
)

In [6]:
%%time

# First logistic-regression test run: a fresh model_pipeline evaluates the listed
# preprocessing steps and grid-searches param_grid, optimising F1.
# NOTE(review): the meaning of each step's argument list is defined by
# model_pipeline.evaluate (pipeline.py, not shown) — confirm before changing values.
test_model_1 = model_pipeline()
test_model_1_result = test_model_1.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.3]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['g', (256, 256), (5, 5), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Everything except the transformed features is requested in the result.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)
                        
                           

CPU times: user 34min 30s, sys: 24min 56s, total: 59min 27s
Wall time: 18min 54s


In [7]:
# Display the result dict returned by evaluate: grid search object, best
# estimator/params/score, out-of-sample predictions and probabilities, and
# the threshold analysis.
test_model_1_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(),
              param_grid={'C': [1e-05, 0.0001, 0.001], 'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.001),
 'best_params': {'C': 0.001, 'max_iter': 100},
 'best_score': 0.8828109699751993,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      5.834433e-01
 2602    9.182161e-10
 3433    2.946750e-03
 235     9.999996e-01
 1806    9.979477e-01
             ...     
 3330    1.563207e-05
 70      6.152530e-01
 132     9.999999e-01
 2014    9.999998e-01
 1931    1.000000e+00
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8864388092613009,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    

In [8]:
# Persist the first test run's results; the context manager closes the file
# handle so the pickle is fully written to disk.
with open('linear_model_results/log_reg_test_model_1_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_1_result, result_file)

In [9]:
%%time

# Second logistic-regression test run: same setup as the first run except for
# the second 'binarize' argument (0.35 here).
# NOTE(review): the meaning of each step's argument list is defined by
# model_pipeline.evaluate (pipeline.py, not shown) — confirm before changing values.
test_model_2 = model_pipeline()
test_model_2_result = test_model_2.evaluate(
    train_X,
    train_y,
    preprocessing=[
        ('binarize', [True, 0.35]),
        ('crop', [(256, 256), (256, 256)]),
        ('blur', ['g', (256, 256), (5, 5), 0, 0]),
        ('pool', [(2, 2), np.max]),
    ],
    model=LogisticRegression(),
    param_grid=param_grid,
    optimizing_metric='f1',
    n_splits=5,
    # Everything except the transformed features is requested in the result.
    return_transformed_features=False,
    return_grid=True,
    return_score=True,
    return_best_estimator=True,
    return_best_params=True,
    return_oos_pred=True,
    return_oos_prob=True,
    return_threshold_analysis=True,
)
                        
                           

CPU times: user 33min 43s, sys: 24min 8s, total: 57min 52s
Wall time: 18min 45s


In [10]:
# Display the second run's result dict.
# NOTE(review): the recorded output is identical to test_model_1_result's (same
# best_score and out-of-sample probabilities) although the 'binarize' argument
# changed from 0.3 to 0.35 — confirm that value actually affects the pipeline.
test_model_2_result

{'grid_search': GridSearchCV(cv=5, estimator=LogisticRegression(),
              param_grid={'C': [1e-05, 0.0001, 0.001], 'max_iter': [100]},
              scoring='f1'),
 'best_estimator': LogisticRegression(C=0.001),
 'best_params': {'C': 0.001, 'max_iter': 100},
 'best_score': 0.8828109699751993,
 'oos_preds': 54      1
 2602    0
 3433    0
 235     1
 1806    1
        ..
 3330    0
 70      1
 132     1
 2014    1
 1931    1
 Name: label, Length: 3220, dtype: uint8,
 'oos_probs': 54      5.834433e-01
 2602    9.182161e-10
 3433    2.946750e-03
 235     9.999996e-01
 1806    9.979477e-01
             ...     
 3330    1.563207e-05
 70      6.152530e-01
 132     9.999999e-01
 2014    9.999998e-01
 1931    1.000000e+00
 Name: label, Length: 3220, dtype: float64,
 'threshold_analysis': {'best_thresh': 0.5,
  'best_score': 0.8864388092613009,
  'best_preds': 54      1
  2602    0
  3433    0
  235     1
  1806    1
         ..
  3330    0
  70      1
  132     1
  2014    1
  1931    

In [11]:
# Persist the second test run's results under its own file name so it does not
# overwrite the pickle written for test_model_1_result; the context manager
# closes the file handle once the dump completes.
with open('linear_model_results/log_reg_test_model_2_result.pkl', 'wb') as result_file:
    pickle.dump(test_model_2_result, result_file)

In [None]:
%%time

# Automated model development for logistic regression, variant 1.
# NOTE(review): preprocessing_eval_order presumably sets the order in which the
# preprocessing stages are evaluated — confirm in automate_optimal_model_dev.py.
optimal_test_1_logreg = automate_optimal_model_dev(
    X=train_X,
    y=train_y,
    model=LogisticRegression(),
    param_grid={
        'C': [0.0001, 0.001, 0.01, 0.1, 1],
        'max_iter': [1000],
    },
    preprocessing_eval_order=['pool', 'blur', 'bin/crop'],
)



Better Model Identified by Resizing Images to (16, 16): 0.8087912087912089
Better Model Identified by Resizing Images to (32, 32): 0.9126807563959956
Better Model Identified by Resizing Images to (64, 64): 0.9338214587440157
Better Model Identified by Resizing Images to (128, 128): 0.940980881130507
Better Model Identified by Resizing Images to (256, 256): 0.9424460431654675
Better Model Identified W/ Pooling, Score = 0.9444597955236254


In [None]:
# Drop the 'features' entry before the results are pickled. The default makes
# the cell safe to re-run (no KeyError once the key is gone) and the trailing
# semicolon keeps the popped object out of the cell output.
optimal_test_1_logreg.pop('features', None);


In [None]:
# Persist the variant-1 results; the context manager closes the file handle so
# the pickle is fully written before the summary is printed.
with open('linear_model_results/optimal_test_1_logreg.pkl', 'wb') as result_file:
    pickle.dump(optimal_test_1_logreg, result_file)
print(optimal_test_1_logreg)

In [None]:
%%time

# Automated model development for logistic regression, variant 2: same model
# and grid as variant 1, with a different preprocessing_eval_order.
# NOTE(review): preprocessing_eval_order presumably sets the order in which the
# preprocessing stages are evaluated — confirm in automate_optimal_model_dev.py.
optimal_test_2_logreg = automate_optimal_model_dev(
    X=train_X,
    y=train_y,
    model=LogisticRegression(),
    param_grid={
        'C': [0.0001, 0.001, 0.01, 0.1, 1],
        'max_iter': [1000],
    },
    preprocessing_eval_order=['blur', 'bin/crop', 'pool'],
)



In [None]:
# Drop the 'features' entry before the results are pickled. The default makes
# the cell safe to re-run (no KeyError once the key is gone) and the trailing
# semicolon keeps the popped object out of the cell output.
optimal_test_2_logreg.pop('features', None);

In [None]:

# Persist the variant-2 results; the context manager closes the file handle so
# the pickle is fully written to disk.
with open('linear_model_results/optimal_test_2_logreg.pkl', 'wb') as result_file:
    pickle.dump(optimal_test_2_logreg, result_file)


In [None]:

# Print the variant-2 result object as plain text.
print(optimal_test_2_logreg)

In [None]:
%%time

# Automated model development for logistic regression, variant 3: same model
# and grid as the other variants, with a different preprocessing_eval_order.
# NOTE(review): preprocessing_eval_order presumably sets the order in which the
# preprocessing stages are evaluated — confirm in automate_optimal_model_dev.py.
optimal_test_3_logreg = automate_optimal_model_dev(
    X=train_X,
    y=train_y,
    model=LogisticRegression(),
    param_grid={
        'C': [0.0001, 0.001, 0.01, 0.1, 1],
        'max_iter': [1000],
    },
    preprocessing_eval_order=['bin/crop', 'pool', 'blur'],
)



In [None]:
# Drop the 'features' entry before the results are pickled. The default makes
# the cell safe to re-run (no KeyError once the key is gone) and the trailing
# semicolon keeps the popped object out of the cell output.
optimal_test_3_logreg.pop('features', None);


In [None]:
# Persist the variant-3 results; the context manager closes the file handle so
# the pickle is fully written to disk.
with open('linear_model_results/optimal_test_3_logreg.pkl', 'wb') as result_file:
    pickle.dump(optimal_test_3_logreg, result_file)


In [None]:
# Print the variant-3 result object as plain text.
print(optimal_test_3_logreg)