# Import Necessary Packages and Libraries

In [2]:
# IMAGE PREPROCESSING FUNCTIONS FOR USE IN MODEL DEVELOPMENT, EVALUATION, AND PRODUCTION
import numpy as np
import pandas as pd
import PIL as pil
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join
import tempfile
import pickle
import time
import gc
import skimage.filters
import cv2
import watermark
import joblib
import math
from skimage.measure import block_reduce
from image_preprocessing import standardize_image_dataset,resize_dataset,binarize_dataset,crop_dataset,process_dataset_blur,do_pooling_dataset
from pipeline import model_pipeline

from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.metrics import accuracy_score,f1_score
from sklearn.base import clone

# Read in Training Data

In [3]:
# Load the labelled training frame: every column but the last is a feature, the last column is the label.
# NOTE(review): unpickling executes arbitrary code embedded in the file -- only load pickles from a trusted source.
# The `with` block guarantees the file handle is closed (the previous bare open() call never closed it).
with open('Amit/Labeled Data/train_data.pkl','rb') as train_file:
    all_data = pickle.load(train_file)
train_x,train_y = all_data.iloc[:,:-1],all_data.iloc[:,-1]
# The combined frame is no longer needed once it has been split; drop it and reclaim the memory.
del all_data
gc.collect()

0

# Automated Model Development Pipeline

Functional wrapper around model_pipeline class allowing for the automated identification of an optimal model and hyperparameter settings in conjunction with a greedy approach for identifying optimal preprocessing steps. 

The greedy approach is achieved by first identifying an optimal image size, or an image size that yields the best performing model without being too large to the point where it dramatically slows down training time and potentially reduces optimal model performance. Next, a specific preprocessing methodology is incorporated across a variety of preprocessing settings. For each of these settings, our image vectors / features are preprocessed according to these settings and an optimal model is identified. At the completion of evaluating a specific preprocessing methodology, if the optimal model identified is better than the previously identified optimal model trained on resized image data, the optimal model parameters are replaced. In addition, the features this model was trained on according to associated preprocessing settings are permanently applied to the features (Greedy). This process continues with other preprocessing methodologies as defined by the user, where at each step the pipeline identifies whether any additional preprocessing steps yield an improved model given previously incorporated preprocessing steps.

In [4]:
# Step names accepted in `preprocessing_eval_order`.
_KNOWN_STEPS = ('bin/crop','blur','pool')
# Step name -> wording used in the progress messages printed when a better model is found.
_STEP_LABELS = {'bin/crop':'Binarization/Cropping','blur':'Blurring','pool':'Pooling'}
# Minimum F1 gain a larger image size must deliver for the resize search to keep going.
_MIN_RESIZE_GAIN = 0.001


def _evaluate_candidate(feats,y,preprocessing,model,param_grid):
    """Run one model_pipeline().evaluate search with the fixed options used throughout this function."""
    return model_pipeline().evaluate(feats,y,preprocessing=preprocessing,model=model,param_grid=param_grid,
                                     optimizing_metric='f1',n_splits=5,return_transformed_features=True,
                                     return_grid=True,return_score=True,return_best_estimator=True,
                                     return_best_params=True,return_oos_pred=True,return_oos_prob=True,
                                     return_threshold_analysis=True)


def _candidate_settings(step,image_size):
    """Build the preprocessing settings to try for `step` on images whose side length is `image_size`."""
    dims = (image_size,image_size)
    if step == 'bin/crop':
        # (automate threshold flag, threshold) pairs; each is paired with a crop whose two size arguments equal `dims`
        binarize_options = [(True,0.3),(False,0.05),(False,0.1),(False,0.15),(False,0.2),(False,0.3)]
        return [[('binarize',[automate,thresh]),('crop',[dims,dims])] for automate,thresh in binarize_options]
    if step == 'blur':
        # blur type ('g' / 'b') x kernel size x (sigma_x, sigma_y)
        sigmas = [(0,0),(1,0),(0,1),(1,1),(2,2)]
        return [[('blur',[kind,dims,kernel,sigma_x,sigma_y])]
                for kind in ('g','b')
                for kernel in ((3,3),(5,5))
                for sigma_x,sigma_y in sigmas]
    if step == 'pool':
        # power-of-two pool sizes strictly smaller than the image, each tried with max and mean pooling
        return [[('pool',[(2**num,2**num),pool_func])]
                for num in range(1,int(math.log2(image_size)))
                for pool_func in (np.max,np.mean)]
    raise ValueError('Unknown preprocessing step: ' + repr(step))


def _describe_setting(step,setting):
    """Return the human readable description of `setting` appended to the `best_preprocess` string."""
    if step == 'bin/crop':
        binarize_args = setting[0][1]
        crop_args = setting[1][1]
        # report BOTH crop arguments (the first one used to be printed twice)
        return ('(Binarization, Automate Threshold = ' + str(binarize_args[0]) + ', Threshold = ' + str(binarize_args[1])
                + ') (Crop, ' + str(crop_args[0]) + ', ' + str(crop_args[1]) + ')')
    if step == 'blur':
        blur_args = setting[0][1]
        return ('(Blurring, Type = ' + str(blur_args[0]) + ', Dimension = ' + str(blur_args[1])
                + ', Kernel = ' + str(blur_args[2]) + ', sigma_x = ' + str(blur_args[3])
                + ', sigma_y = ' + str(blur_args[4]) + ')')
    pool_args = setting[0][1]
    return '(Pool, pool_size = ' + str(pool_args[0]) + ', pooling_function = ' + str(pool_args[1]) + ')'


def automate_optimal_model_dev(X,y,model,param_grid,preprocessing_eval_order = ['bin/crop','blur','pool'],resize=True):
    """Greedily search for the best model together with the preprocessing steps that improve it.

    Optionally the image size is searched first (powers of two from 16 up to the current side
    length). Then, for every step in `preprocessing_eval_order`, each candidate setting is evaluated
    with `model_pipeline().evaluate` on the best features found so far. If any setting of a step
    beats the current best score, the features produced by the best setting of that step are kept
    for all later steps.

    Parameters
    ----------
    X : feature matrix exposing `.copy()` and `.shape`; the side length of the images is taken as
        int(sqrt(X.shape[1])) (assumes each row is a flattened square image -- TODO confirm).
    y : labels, passed through unchanged to `model_pipeline().evaluate`.
    model : estimator passed through to `model_pipeline().evaluate`.
    param_grid : hyperparameter grid passed through to `model_pipeline().evaluate`.
    preprocessing_eval_order : iterable of step names, each one of 'bin/crop', 'blur', 'pool',
        in the order in which they should be evaluated. The argument is not modified.
    resize : when truthy, run the image size search before the preprocessing steps.

    Returns
    -------
    dict with keys 'features', 'best_model', 'best_params', 'oos_probs', 'oos_preds',
    'best_thresh', 'best_score' and 'best_preprocess' (a string describing the retained steps).
    If nothing was evaluated or nothing scored above 0, the model related entries stay None.

    Raises
    ------
    ValueError
        If `preprocessing_eval_order` contains an unknown step name. This is checked before any
        model is trained so a typo does not surface only after a long search.
    """
    eval_order = list(preprocessing_eval_order)
    unknown_steps = [step for step in eval_order if step not in _KNOWN_STEPS]
    if unknown_steps:
        raise ValueError('Unknown preprocessing step(s) ' + repr(unknown_steps)
                         + '; expected values from ' + repr(list(_KNOWN_STEPS)))

    #Store global values to capture optimal model/parameters, key performance metrics, and optimal preprocessing steps
    original_side = int(np.sqrt(X.shape[1]))
    best_feats = X.copy()
    best_model = None
    best_params = None
    best_probs = None
    best_preds = None
    best_thresh = None
    best_score = 0
    best_preprocess = '(Initial Standardization/Resizing to ' + str((original_side,original_side)) + ')'

    # Initial Evaluation - Identify the optimal image size, measured by the score of the best model
    # obtained when training on images resized to each candidate size (16x16 up to the current size)
    if resize:
        largest_power = int(math.log2(np.sqrt(X.shape[1])))
        for img_size in [(2**num,2**num) for num in range(4,largest_power + 1)]:
            resize_results = _evaluate_candidate(resize_dataset(X,(original_side,original_side),img_size),
                                                 y,[],model,param_grid)
            #F1 score of the best performing model trained on images of the specified size
            score = resize_results['threshold_analysis']['best_score']

            if score - best_score > _MIN_RESIZE_GAIN:
                #Keep this size only if it beats the previously identified optimal model by more than 0.001 in F1
                best_feats = resize_results['features']
                best_model = resize_results['best_estimator']
                best_params = resize_results['best_params']
                best_probs = resize_results['oos_probs']
                best_preds = resize_results['threshold_analysis']['best_preds']
                best_thresh = resize_results['threshold_analysis']['best_thresh']
                best_score = score
                best_preprocess = '(Initial Standardization/Resizing to ' + str(img_size) + ')'
                print('Better Model Identified by Resizing Images to ' + str(img_size) + ': ' + str(score))
            else:
                #break early if increasing image size does not yield a significantly better performing optimal model
                break

    for step in eval_order:
        #Settings are rebuilt from the CURRENT image size: an earlier pooling step that was retained
        #shrinks the images, so the dimensions passed to binarization/cropping/blurring (and the valid
        #pool sizes) must follow the features actually being processed
        current_side = int(np.sqrt(best_feats.shape[1]))
        settings = _candidate_settings(step,current_side)

        best_setting = ''
        best_setting_feats = None

        #For each preprocessing setting, identify an optimal performing model trained on
        #transformed features according to specified preprocessing. Compare each model to currently identified
        #optimal model and replace if better model is found
        for setting in settings:
            setting_case = _evaluate_candidate(best_feats,y,setting,model,param_grid)
            score = setting_case['threshold_analysis']['best_score'] #F1 score of optimal model trained using preprocessed features
            if score > best_score: #if score is better than current best score, update key results and model
                best_model = setting_case['best_estimator']
                best_params = setting_case['best_params']
                best_probs = setting_case['oos_probs']
                best_preds = setting_case['threshold_analysis']['best_preds']
                best_thresh = setting_case['threshold_analysis']['best_thresh']
                best_score = score
                best_setting_feats = setting_case['features']
                best_setting = _describe_setting(step,setting)
                print('Better Model Identified W/ ' + _STEP_LABELS[step] + ', Score = ' + str(score))

        #Update features and preprocessing string if incorporating specific preprocessing as part of image preprocessing pipeline yielded
        #a better performing model. This ensures these steps do not need to be repeated when evaluating additional
        #preprocessing steps
        if best_setting != '':
            best_feats = best_setting_feats
            best_preprocess = best_preprocess + best_setting

    #return optimal model, threshold, out of sample predictions, features the model was trained on,
    #and optimal preprocessing steps identified via a greedy sequential decision process
    return {
        'features': best_feats,
        'best_model': best_model,
        'best_params': best_params,
        'oos_probs': best_probs,
        'oos_preds': best_preds,
        'best_thresh': best_thresh,
        'best_score': best_score,
        'best_preprocess': best_preprocess,
    }

# Identify Optimal KNN Alongside Optimized Preprocessing Steps

[1,2,3],[1,3,2],[2,1,3],[2,3,1],[3,1,2],[3,2,1]  
1 = bin/crop  
2 = blur  
3 = pool

### Test 1: Identify optimal KNN while sequentially identifying optimal settings for pooling, then blur, then bin/crop (order [3,2,1])

In [4]:
%%time
# Test 1 run: start from the full-size train_x (the resize search runs because `resize` keeps its
# default) and evaluate preprocessing in the order pool -> blur -> bin/crop, then persist the result.
knn_test_1 = automate_optimal_model_dev(
    X=train_x,
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['pool', 'blur', 'bin/crop'],
)
with open('KNN_auto_resize_results/knn_test_1.pickle', 'wb') as handle:
    pickle.dump(knn_test_1, handle, protocol=pickle.HIGHEST_PROTOCOL)

Better Model Identified by Resizing Images to (16, 16): 0.9518106643855146
Better Model Identified W/ Blurring, Score = 0.9531116794543905
Better Model Identified W/ Binarization/Cropping, Score = 0.9619473978735311
CPU times: user 7min 58s, sys: 13 s, total: 8min 11s
Wall time: 4min 16s


### Test 2: Identify optimal KNN while sequentially identifying optimal settings for pooling, then bin/crop, then blur (order [3,1,2]). Test 1 showed that a resize value of 16x16 is optimal, so the images are resized to 16x16 up front.

In [5]:
%%time
# Test 2 run: images are resized from (256,256) to (16,16) before the call and the pipeline's own
# resize search is switched off; evaluation order is pool -> bin/crop -> blur.
knn_test_2 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['pool', 'bin/crop', 'blur'],
    resize=False,
)
with open('KNN_auto_resize_results/knn_test_2.pickle', 'wb') as handle:
    pickle.dump(knn_test_2, handle, protocol=pickle.HIGHEST_PROTOCOL)

Better Model Identified W/ Pooling, Score = 0.9480929165471753
Better Model Identified W/ Binarization/Cropping, Score = 0.9630044843049327
Better Model Identified W/ Binarization/Cropping, Score = 0.9632744603308102
CPU times: user 5min 18s, sys: 8.38 s, total: 5min 27s
Wall time: 2min 36s


In [4]:
%%time
# Variant of knn_test_2 (pool -> bin/crop -> blur on pre-resized 16x16 images) with `resize` left at
# its default, so the pipeline's resize search also runs on the already resized input.
knn_test_2_1 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['pool', 'bin/crop', 'blur'],
)
with open('KNN_auto_resize_results/knn_test_2_1.pickle', 'wb') as handle:
    pickle.dump(knn_test_2_1, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 5min 57s, sys: 9.08 s, total: 6min 6s
Wall time: 2min 48s


### Test 3: Identify optimal KNN while sequentially identifying optimal settings for blur, then bin/crop, then pooling (order [2,1,3]). Test 1 showed that a resize value of 16x16 is optimal, so the images are resized to 16x16 up front.

In [6]:
%%time
# Test 3 run: pre-resized 16x16 images, pipeline resize search switched off;
# evaluation order is blur -> bin/crop -> pool.
knn_test_3 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['blur', 'bin/crop', 'pool'],
    resize=False,
)
with open('KNN_auto_resize_results/knn_test_3.pickle', 'wb') as handle:
    pickle.dump(knn_test_3, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified W/ Blurring, Score = 0.9531116794543905
Better Model Identified W/ Binarization/Cropping, Score = 0.9619473978735311
CPU times: user 6min 15s, sys: 10.7 s, total: 6min 26s
Wall time: 3min


In [5]:
%%time
# Variant of knn_test_3 (blur -> bin/crop -> pool on pre-resized 16x16 images) with `resize` left at
# its default, so the pipeline's resize search also runs on the already resized input.
knn_test_3_1 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['blur', 'bin/crop', 'pool'],
)
with open('KNN_auto_resize_results/knn_test_3_1.pickle', 'wb') as handle:
    pickle.dump(knn_test_3_1, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 6min 10s, sys: 11.2 s, total: 6min 22s
Wall time: 2min 59s


### Test 4: Identify optimal KNN while sequentially identifying optimal settings for bin/crop, then blur, then pooling (order [1,2,3]). Test 1 showed that a resize value of 16x16 is optimal, so the images are resized to 16x16 up front.

In [7]:
%%time
# Test 4 run: pre-resized 16x16 images, pipeline resize search switched off;
# evaluation order is bin/crop -> blur -> pool.
knn_test_4 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['bin/crop', 'blur', 'pool'],
    resize=False,
)
with open('KNN_auto_resize_results/knn_test_4.pickle', 'wb') as handle:
    pickle.dump(knn_test_4, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified W/ Binarization/Cropping, Score = 0.9468926553672317
Better Model Identified W/ Binarization/Cropping, Score = 0.9650153932269802
CPU times: user 6min 5s, sys: 10.1 s, total: 6min 15s
Wall time: 2min 55s


In [6]:
%%time
# Variant of knn_test_4 (bin/crop -> blur -> pool on pre-resized 16x16 images) with `resize` left at
# its default, so the pipeline's resize search also runs on the already resized input.
knn_test_4_1 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['bin/crop', 'blur', 'pool'],
)
with open('KNN_auto_resize_results/knn_test_4_1.pickle', 'wb') as handle:
    pickle.dump(knn_test_4_1, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 5min 59s, sys: 9.15 s, total: 6min 8s
Wall time: 2min 50s


### Test 5: Identify optimal KNN while sequentially identifying optimal settings for bin/crop, then pooling, then blur (order [1,3,2]). Test 1 showed that a resize value of 16x16 is optimal, so the images are resized to 16x16 up front.

In [8]:
%%time
# Test 5 run: pre-resized 16x16 images, pipeline resize search switched off;
# evaluation order is bin/crop -> pool -> blur.
knn_test_5 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['bin/crop', 'pool', 'blur'],
    resize=False,
)
with open('KNN_auto_resize_results/knn_test_5.pickle', 'wb') as handle:
    pickle.dump(knn_test_5, handle, protocol=pickle.HIGHEST_PROTOCOL)

Better Model Identified W/ Binarization/Cropping, Score = 0.9468926553672317
Better Model Identified W/ Binarization/Cropping, Score = 0.9650153932269802
CPU times: user 6min 5s, sys: 10 s, total: 6min 15s
Wall time: 2min 55s


In [7]:
%%time
# Variant of knn_test_5 (bin/crop -> pool -> blur on pre-resized 16x16 images) with `resize` left at
# its default, so the pipeline's resize search also runs on the already resized input.
knn_test_5_1 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['bin/crop', 'pool', 'blur'],
)
with open('KNN_auto_resize_results/knn_test_5_1.pickle', 'wb') as handle:
    pickle.dump(knn_test_5_1, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 5min 59s, sys: 9.18 s, total: 6min 8s
Wall time: 2min 49s


### Test 6: Identify optimal KNN while sequentially identifying optimal settings for blur, then pooling, then bin/crop (order [2,3,1]). Test 1 showed that a resize value of 16x16 is optimal, so the images are resized to 16x16 up front.

In [9]:
%%time
# Test 6 run: pre-resized 16x16 images, pipeline resize search switched off;
# evaluation order is blur -> pool -> bin/crop.
knn_test_6 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['blur', 'pool', 'bin/crop'],
    resize=False,
)
with open('KNN_auto_resize_results/knn_test_6.pickle', 'wb') as handle:
    pickle.dump(knn_test_6, handle, protocol=pickle.HIGHEST_PROTOCOL)

Better Model Identified W/ Blurring, Score = 0.9531116794543905
Better Model Identified W/ Binarization/Cropping, Score = 0.9619473978735311
CPU times: user 6min 14s, sys: 9.72 s, total: 6min 23s
Wall time: 2min 57s


In [8]:
%%time
# Variant of knn_test_6 (blur -> pool -> bin/crop on pre-resized 16x16 images) with `resize` left at
# its default, so the pipeline's resize search also runs on the already resized input.
knn_test_6_1 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=['blur', 'pool', 'bin/crop'],
)
with open('KNN_auto_resize_results/knn_test_6_1.pickle', 'wb') as handle:
    pickle.dump(knn_test_6_1, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 6min 1s, sys: 9.43 s, total: 6min 10s
Wall time: 2min 50s


### Without preprocessing

### Resizing twice yields the best F1

In [10]:
%%time
# No extra preprocessing steps (empty evaluation order): pre-resized 16x16 images are passed in and
# the pipeline's resize search (default `resize`) runs on top of them.
knn_test_7 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=[],
)
with open('KNN_auto_resize_results/knn_test_7.pickle', 'wb') as handle:
    pickle.dump(knn_test_7, handle, protocol=pickle.HIGHEST_PROTOCOL)

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 1min 12s, sys: 2.58 s, total: 1min 14s
Wall time: 1min 9s


In [11]:
%%time
# No extra preprocessing steps (empty evaluation order): the full-size train_x is passed in, so only
# the pipeline's own resize search is applied.
knn_test_8 = automate_optimal_model_dev(
    X=train_x,
    y=train_y,
    model=KNeighborsClassifier(),
    param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13]},
    preprocessing_eval_order=[],
)
with open('KNN_auto_resize_results/knn_test_8.pickle', 'wb') as handle:
    pickle.dump(knn_test_8, handle, protocol=pickle.HIGHEST_PROTOCOL)

Better Model Identified by Resizing Images to (16, 16): 0.9518106643855146
CPU times: user 2min 27s, sys: 4.19 s, total: 2min 31s
Wall time: 2min 15s


### other metric

#### weights

In [7]:
%%time
# Distance-weighted KNN on pre-resized 16x16 images with a reduced n_neighbors grid;
# evaluation order is bin/crop -> blur -> pool. The result dict is displayed, not pickled.
knn_test_9 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(weights='distance'),
    param_grid={'n_neighbors': [1, 3, 5]},
    preprocessing_eval_order=['bin/crop', 'blur', 'pool'],
)
knn_test_9

Better Model Identified by Resizing Images to (16, 16): 0.971894322653176
CPU times: user 3min 38s, sys: 7.74 s, total: 3min 46s
Wall time: 2min 17s


{'features':       pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
 54         0       0       1       1       0       7      29      31      29   
 2602       1       1       3       3       1       3      32      52      42   
 3433       0       1       3       3       1       3       0       0       0   
 235        0      11      38      30      44      62      54      54      54   
 1806       0       4       4       4       4       4       4       4       4   
 ...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
 3330       0       0       0       0       0       0       0       0       0   
 70        25       0       2       3       2       1       4      11      11   
 132        0       0       0       0       0       8      32      62      78   
 2014       5       5       5       5       5       5       5       5       5   
 1931       1       1       1       0       0      79     176     210     217   
 
       pixel9 

#### Manhattan distance

In [10]:
%%time
# Manhattan-distance KNN (Minkowski metric with p=1) on pre-resized 16x16 images;
# evaluation order is bin/crop -> blur -> pool. The result dict is displayed here.
knn_test_10 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(p=1, metric='minkowski'),
    param_grid={'n_neighbors': [1, 3, 5]},
    preprocessing_eval_order=['bin/crop', 'blur', 'pool'],
)
knn_test_10

Better Model Identified by Resizing Images to (16, 16): 0.9742441209406496
CPU times: user 6min 25s, sys: 7.16 s, total: 6min 32s
Wall time: 6min 34s


{'features':       pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
 54         0       0       1       1       0       7      29      31      29   
 2602       1       1       3       3       1       3      32      52      42   
 3433       0       1       3       3       1       3       0       0       0   
 235        0      11      38      30      44      62      54      54      54   
 1806       0       4       4       4       4       4       4       4       4   
 ...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
 3330       0       0       0       0       0       0       0       0       0   
 70        25       0       2       3       2       1       4      11      11   
 132        0       0       0       0       0       8      32      62      78   
 2014       5       5       5       5       5       5       5       5       5   
 1931       1       1       1       0       0      79     176     210     217   
 
       pixel9 

In [11]:
# Persist the knn_test_10 result dict so it can be reloaded without re-running the search.
with open('KNN_auto_resize_results/knn_test_10.pickle', 'wb') as handle:
    pickle.dump(knn_test_10, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
%%time
# Chebyshev-distance KNN on pre-resized 16x16 images; evaluation order is
# bin/crop -> blur -> pool. The result dict is displayed, not pickled.
knn_test_11 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(metric='chebyshev'),
    param_grid={'n_neighbors': [1, 3, 5]},
    preprocessing_eval_order=['bin/crop', 'blur', 'pool'],
)
knn_test_11

Better Model Identified by Resizing Images to (16, 16): 0.9454648205707827
Better Model Identified W/ Blurring, Score = 0.9474282466609832
Better Model Identified W/ Blurring, Score = 0.9477272727272726
Better Model Identified W/ Blurring, Score = 0.950354609929078
CPU times: user 6min 46s, sys: 8.91 s, total: 6min 55s
Wall time: 6min 59s


{'features':         pixel0     pixel1     pixel2     pixel3     pixel4      pixel5  \
 54    0.000000   0.319168   0.477096   6.358317  22.491224   48.009315   
 2602  1.814945   2.100062   2.769137   3.584081  11.200088   26.067818   
 3433  0.230863   1.088305   2.157928   2.157928   1.434600    9.472224   
 235   6.614219  13.634724  25.104576  38.379742  54.566586   73.992744   
 1806  2.553342   2.723329   4.000000   4.000000   4.000000    4.000000   
 ...        ...        ...        ...        ...        ...         ...   
 3330  0.000000   0.000000   0.000000   0.000000   0.000000    0.000000   
 70    9.679947   9.255865   2.115432   3.380345  10.054217   20.747427   
 132   0.000000   0.000000   0.000000   3.463514  19.109657   46.367668   
 2014  6.276671   6.276671   6.276671   6.276671   5.869199    4.388792   
 1931  1.000000   0.796264   2.894802  36.094780  85.052208  133.828186   
 
           pixel6      pixel7      pixel8      pixel9  ...    pixel246  \
 54     62.9

In [13]:
%%time
# Joint grid over n_neighbors, the Minkowski power p (1 or 2) and the weighting scheme, on
# pre-resized 16x16 images; evaluation order is bin/crop -> blur -> pool.
knn_test_12 = automate_optimal_model_dev(
    X=resize_dataset(train_x, (256, 256), (16, 16)),
    y=train_y,
    model=KNeighborsClassifier(metric='minkowski'),
    param_grid={
        'n_neighbors': [1, 3, 5],
        'p': [1, 2],
        'weights': ['uniform', 'distance'],
    },
    preprocessing_eval_order=['bin/crop', 'blur', 'pool'],
)
knn_test_12

Better Model Identified by Resizing Images to (16, 16): 0.9742441209406496
CPU times: user 12min 3s, sys: 11.8 s, total: 12min 15s
Wall time: 9min 38s


{'features':       pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
 54         0       0       1       1       0       7      29      31      29   
 2602       1       1       3       3       1       3      32      52      42   
 3433       0       1       3       3       1       3       0       0       0   
 235        0      11      38      30      44      62      54      54      54   
 1806       0       4       4       4       4       4       4       4       4   
 ...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
 3330       0       0       0       0       0       0       0       0       0   
 70        25       0       2       3       2       1       4      11      11   
 132        0       0       0       0       0       8      32      62      78   
 2014       5       5       5       5       5       5       5       5       5   
 1931       1       1       1       0       0      79     176     210     217   
 
       pixel9 

In [14]:
# Persist the knn_test_12 result dict so it can be reloaded without re-running the search.
with open('KNN_auto_resize_results/knn_test_12.pickle', 'wb') as handle:
    pickle.dump(knn_test_12, handle, protocol=pickle.HIGHEST_PROTOCOL)