In [9]:
import cv2
import numpy as np
import os
import sys

from PIL import Image
from sklearn.model_selection import train_test_split
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
data_dir = './coins_dataset_combined' #data directory
IMG_WIDTH = 300 
IMG_HEIGHT = 300
NUM_CATEGORIES = 3
TEST_SIZE = 0.25
mapping_label = {'10c':0,
  '20c':1,
  '5c':2}

def load_data(dir):
    # initialise list of images
    images = []
    # initialise list of labels
    labels = []
    for root, dirs, files in os.walk(dir):
        
        for name in files:
            path = os.path.join(root, name)
            img = Image.open(path)
            img=img.rotate(4)
            area = (400, 250, 950, 800)
            img = img.crop(area)
            
            pil_image = img.convert('RGB') 
            open_cv_image = np.array(pil_image) 
            # Convert RGB to BGR 
            resized_image = open_cv_image[:, :, ::-1].copy() 

            # To pass these images into a neural network, the images will need to be the same size
            resized_image = cv2.resize(resized_image, (IMG_WIDTH, IMG_HEIGHT))
            
            # add resized image to images list
            images.append(resized_image)
            # add category to labels array
            path=os.path.dirname(path)
            labels.append(mapping_label[os.path.basename(path)])
    return images,labels

images, labels = load_data(data_dir)
print(images)
print(labels)

[array([[[13, 10,  9],
        [15, 11, 10],
        [16, 13, 12],
        ...,
        [12, 12,  5],
        [11, 12,  7],
        [12, 12,  6]],

       [[14, 11, 10],
        [16, 12, 11],
        [13, 10,  9],
        ...,
        [12, 13,  5],
        [13, 13,  7],
        [13, 13,  7]],

       [[16, 12, 11],
        [14, 10,  9],
        [17, 13, 12],
        ...,
        [11, 11,  5],
        [13, 12,  8],
        [12, 11,  6]],

       ...,

       [[14, 18,  9],
        [15, 18, 11],
        [15, 20, 11],
        ...,
        [12, 13,  3],
        [11, 13,  4],
        [10, 11,  2]],

       [[18, 20, 12],
        [16, 22, 12],
        [13, 18,  9],
        ...,
        [ 9, 11,  4],
        [10, 12,  5],
        [ 9, 12,  3]],

       [[17, 20, 13],
        [13, 18,  9],
        [13, 16, 10],
        ...,
        [10, 11,  4],
        [ 8, 10,  2],
        [ 8, 10,  3]]], dtype=uint8), array([[[26, 19, 16],
        [25, 18, 15],
        [22, 17, 14],
        ...,
        [23

In [3]:
labels = tf.keras.utils.to_categorical(labels)
x_train, x_test, y_train, y_test = train_test_split(
    np.array(images), np.array(labels), test_size=TEST_SIZE, random_state=3237
)
print(x_train.shape,y_train.shape, x_test.shape,y_test.shape)


(229, 300, 300, 3) (229, 3) (77, 300, 300, 3) (77, 3)


In [4]:
#sklearn expects i/p to be 2d array-model.fit(x_train,y_train)=>reshape to 2d array
nsamples, nx, ny, nrgb = x_train.shape
x_train2 = x_train.reshape((nsamples,nx*ny*nrgb))
x_train2.shape
# The above code reshapes train set images from (229,300,300,3) which is a 4D array to (229,270000),
# a 2D array. 270000 is obtained by multiplying the dimensions of the image(300x300x3=270000).

(229, 270000)

In [5]:
#so,eventually,model.predict() should also be a 2d input
nsamples, nx, ny, nrgb = x_test.shape
x_test2 = x_test.reshape((nsamples,nx*ny*nrgb))
x_test2.shape

(77, 270000)

In [None]:
# Instantiate class. Using random_state=2 for you to be able to reproduce the same result
rf = RandomForestClassifier(n_estimators=25, criterion='gini', max_depth=40, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=3237, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

In [10]:
rf=RandomForestClassifier(n_estimators=10, random_state=3237)

In [11]:
rf.fit(x_train2,y_train)
print(rf.score(x_test2, y_test))

0.7402597402597403


In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 1000, num = 15)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [1, 72, 143, 215, 286, 357, 429, 500, 571, 643, 714, 785, 857, 928, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state=3237)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=3237, n_jobs = -1)
# Fit the random search model
random_search = rf_random.fit(x_train2,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.8min finished


In [12]:
accuracy = random_search.best_score_ *100
accuracy

79.06128958760537

In [13]:
rf_random.best_params_
# {'n_estimators': 89,
#  'min_samples_split': 2,
#  'min_samples_leaf': 2,
#  'max_features': 'auto',
#  'max_depth': 60,
#  'bootstrap': False}

# {'n_estimators': 429,
#  'min_samples_split': 5,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': None,
#  'bootstrap': False}

{'n_estimators': 429,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

In [28]:
# rf=RandomForestClassifier(n_estimators= 89, 
#                           min_samples_split= 2,
#                           min_samples_leaf= 2,
#                           max_features= 'auto',
#                           max_depth= 60,
#                           bootstrap= False)

rf=RandomForestClassifier(n_estimators= 429, 
                          min_samples_split= 5,
                          min_samples_leaf= 1,
                          max_features= 'auto',
                          max_depth= None,
                          bootstrap= False)

In [29]:
rf.fit(x_train2,y_train)
print(rf.score(x_test2, y_test))

0.8961038961038961


In [37]:
import joblib
# now you can save it to a file
joblib.dump(rf, 'rf.pkl') 
# and later you can load it
# rf = joblib.load('rf.pkl')

['knn.pkl']