<a href="https://colab.research.google.com/github/svkrishnaveni/AdaBoost_Using_Numpy/blob/main/animals_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Mount folder from google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
 
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
import os

str_train_dir = "/content/drive/MyDrive/train_size_300"
str_valtest_dir = "/content/drive/MyDrive/valtest_size_300"
def get_data(path):
    class_names = os.listdir(path)
    image_paths = []
    labels = []
    for name in class_names:
        directory = os.path.join(path, name)
        class_path = [os.path.join(directory, f) for f in os.listdir(directory)]
        image_paths += class_path
        labels.extend([name] * len(class_path))

    return image_paths, labels


def get_training_data():
    return get_data(str_train_dir)


def get_test_data():
    return get_data(str_valtest_dir)

import pickle


def save_model(filepath, clf):
    pickle.dump(clf, open(filepath, "wb"))


def load_model(model_path: str):
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    return model

In [7]:
from typing import List

import cv2
import numpy as np
from scipy.cluster.vq import kmeans, vq


CLUSTER_SIZE = 200


def extract_feature(images: List[str]):
    n = len(images)
    descriptors = []
    orb = cv2.ORB_create()

    # extract features
    for image_path in images:
        img = cv2.imread(image_path)
        features = orb.detect(img, None)
        _, img_descriptor = orb.compute(img, features)
        descriptors.append((image_path, img_descriptor))

    # reformat training descriptors
    concat_descriptors = descriptors[0][1]
    for image_path, descriptor in descriptors[1:]:
        concat_descriptors = np.vstack((concat_descriptors, descriptor))
    concat_descriptors = concat_descriptors.astype(float)

    # k-means clustering
    codebook, _ = kmeans(concat_descriptors, CLUSTER_SIZE, 1)

    # create histogram of training images
    img_features = np.zeros((n, CLUSTER_SIZE), "float32")
    for i in range(n):
        words, distance = vq(descriptors[i][1], codebook)
        for word in words:
            img_features[i][word] += 1

    return img_features, codebook

In [11]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from scipy import stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 
from sklearn.svm import SVC as svc 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, roc_auc_score

def get_training_features():
    training_images, training_labels = get_training_data()
    img_features, codebook = extract_feature(training_images)
    return img_features, codebook, training_labels

## set search parameters for SVC classifier

# C: float, default=1.0
# Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.

# kernel {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’
# Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples).

# degreeint, default=3
# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

rand_list_svc = {"C": stats.uniform(2, 10),
             "gamma": stats.uniform(0.1, 1),
             "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
             }

## set search parameters for KNN classifier

# n_neighborsint, default=5
# Number of neighbors to use by default for kneighbors queries.

# weights {‘uniform’, ‘distance’} or callable, default=’uniform’
# Weight function used in prediction. Possible values:

# ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.

# ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

rand_list_knn = {
                    'n_neighbors':  [int(x) for x in np.linspace(start = 1, stop = 100, num = 50)],
                    'weights': ['uniform','distance'],
                    'metric': ['euclidean','manhattan','chebyshev','minkowski'] ,
                    }

## set search parameters for Random Forests classifier

# n_estimators int, default=100
# The number of trees in the forest.


# criterion{“gini”, “entropy”, “log_loss”}, default=”gini”
# The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “log_loss” and “entropy” both for the Shannon information gain, see Mathematical formulation. Note: This parameter is tree-specific.

# max_depth int, default=None
# The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

rand_list_rf = {'bootstrap': [True, False],
               'max_depth': [5, 10, 20, 30, 40, 50, None],
               'criterion': ['gini','entropy'],
               'n_estimators': [10, 30, 100, 200]}


def train_model(X,Y,model_str = 'rf'):
   
    if model_str == 'rf':
        mdl = RandomForestClassifier()
        mdl2 = RandomizedSearchCV(mdl, param_distributions = rand_list_rf, n_iter = 200, n_jobs = 4, cv = 3, random_state = 7, scoring = "accuracy") 
        mdl2.fit(X, Y)
        print("Best RF estimator found by Random search:")
        print(mdl2.best_estimator_)
        return mdl2
    elif model_str == 'svm':
        mdl = svc(probability = True)
        mdl2 = RandomizedSearchCV(mdl, param_distributions = rand_list_svc, n_iter = 200, n_jobs = 4, cv = 3, random_state = 7, scoring = "accuracy") 
        mdl2.fit(X, Y)
        print("Best svm estimator found by Random search:")
        print(mdl2.best_estimator_)
        return mdl2
    elif model_str == 'knn':
        mdl = KNeighborsClassifier()
        mdl2 = RandomizedSearchCV(mdl, param_distributions = rand_list_knn, n_iter = 200, n_jobs = 4, cv = 3, random_state = 7, scoring = "accuracy")
        mdl2.fit(X, Y) 
        print("Best knn estimator found by Random search:")
        print(mdl2.best_estimator_)
        return mdl2
    elif model_str == 'nbc':
        mdl2 = GaussianNB()
        mdl2.fit(X, Y)
        return mdl2

def evaluate_performance(mdl, x_test,y_test):
    y_pred = mdl.predict(x_test)
    return y_pred

# print(classification_report(y_test, y_pred, target_names=target_names))
# ConfusionMatrixDisplay.from_estimator(
#     clf, X_test_pca, y_test, display_labels=target_names, xticks_rotation="vertical"
# )
# plt.tight_layout()
# plt.show()


In [12]:
train_features, codebook, train_labels = get_training_features()
test_images, test_labels = get_test_data()


In [16]:
len(test_labels)
len(y_pred)

2

In [19]:
# save train features and codebook
save_model("/content/drive/MyDrive/ML_BOW_trainfeatures.pkl",train_features)
save_model("/content/drive/MyDrive/ML_BOW_codebook.pkl",codebook)

In [20]:
rf_mdl = train_model(train_features,np.array(train_labels),model_str = 'rf')



Best RF estimator found by Random search:
RandomForestClassifier(max_depth=40, n_estimators=200)


In [21]:
save_model("/content/drive/MyDrive/ML_BOW_RFmdl.pkl",rf_mdl)

In [22]:
knn_mdl = train_model(train_features,np.array(train_labels),model_str = 'knn')

Best knn estimator found by Random search:
KNeighborsClassifier(metric='euclidean', n_neighbors=37, weights='distance')


In [23]:
save_model("/content/drive/MyDrive/ML_BOW_KNNmdl.pkl",knn_mdl)

In [24]:
nbc_mdl = train_model(train_features,np.array(train_labels),model_str = 'nbc')

In [25]:
save_model("/content/drive/MyDrive/ML_BOW_NBCmdl.pkl",nbc_mdl)

In [26]:
svm_mdl = train_model(train_features,np.array(train_labels),model_str = 'svm')

Best svm estimator found by Random search:
SVC(C=11.782228970785825, gamma=0.5555849133282118, kernel='poly',
    probability=True)


In [27]:
save_model("/content/drive/MyDrive/ML_BOW_SVMmdl.pkl",svm_mdl)

In [28]:
import cv2
import numpy as np
from scipy.cluster.vq import vq

CLUSTER_SIZE = 200


def evaluate_model(estimator_path: str, codebook_path: str):
    estimator = load_model(estimator_path)
    codebook = load_model(codebook_path)

    orb = cv2.ORB_create()
    test_images, test_labels = get_test_data()
    n = len(test_images)

    test_descriptors = []
    # extract features
    for image_path in test_images:
        img = cv2.imread(image_path)
        features = orb.detect(img, None)
        _, img_descriptor = orb.compute(img, features)
        test_descriptors.append((image_path, img_descriptor))

    img_features = np.zeros((n, CLUSTER_SIZE), "float32")
    # create histogram of test images
    for i in range(n):
        words, distance = vq(test_descriptors[i][1], codebook)
        for word in words:
            img_features[i][word] += 1

    predictions = estimator.predict(img_features)
    correct = 0
    for i in range(n):
        if predictions[i] == test_labels[i]:
            correct += 1

    accuracy = correct / n
    return round(accuracy, 2), predictions

In [29]:
evaluate_model("/content/drive/MyDrive/ML_BOW_RFmdl.pkl", "/content/drive/MyDrive/ML_BOW_codebook.pkl")

(0.39,
 array(['hyena', 'tiger', 'hyena', 'cheetah', 'wolf', 'cheetah', 'cheetah',
        'lion', 'cheetah', 'cheetah', 'hyena', 'cheetah', 'cheetah', 'fox',
        'hyena', 'cheetah', 'wolf', 'cheetah', 'lion', 'cheetah',
        'cheetah', 'cheetah', 'cheetah', 'cheetah', 'tiger', 'cheetah',
        'cheetah', 'cheetah', 'cheetah', 'cheetah', 'lion', 'wolf',
        'cheetah', 'cheetah', 'tiger', 'cheetah', 'hyena', 'cheetah',
        'cheetah', 'hyena', 'tiger', 'cheetah', 'hyena', 'hyena',
        'cheetah', 'lion', 'lion', 'cheetah', 'cheetah', 'hyena',
        'cheetah', 'tiger', 'cheetah', 'hyena', 'cheetah', 'cheetah',
        'tiger', 'cheetah', 'cheetah', 'cheetah', 'cheetah', 'cheetah',
        'cheetah', 'cheetah', 'cheetah', 'tiger', 'lion', 'hyena', 'hyena',
        'lion', 'cheetah', 'fox', 'cheetah', 'tiger', 'cheetah', 'hyena',
        'cheetah', 'cheetah', 'tiger', 'hyena', 'tiger', 'tiger', 'tiger',
        'cheetah', 'lion', 'cheetah', 'lion', 'cheetah', 'cheetah'

In [30]:
evaluate_model("/content/drive/MyDrive/ML_BOW_KNNmdl.pkl", "/content/drive/MyDrive/ML_BOW_codebook.pkl")

(0.41, array(['tiger', 'tiger', 'cheetah', 'cheetah', 'wolf', 'cheetah',
        'cheetah', 'wolf', 'cheetah', 'fox', 'cheetah', 'cheetah',
        'cheetah', 'lion', 'lion', 'cheetah', 'cheetah', 'cheetah', 'lion',
        'cheetah', 'cheetah', 'cheetah', 'cheetah', 'cheetah', 'tiger',
        'tiger', 'tiger', 'cheetah', 'cheetah', 'tiger', 'lion', 'cheetah',
        'cheetah', 'cheetah', 'tiger', 'tiger', 'lion', 'cheetah',
        'cheetah', 'tiger', 'tiger', 'cheetah', 'tiger', 'lion', 'cheetah',
        'cheetah', 'hyena', 'cheetah', 'cheetah', 'hyena', 'cheetah',
        'tiger', 'cheetah', 'hyena', 'cheetah', 'cheetah', 'tiger',
        'tiger', 'cheetah', 'cheetah', 'cheetah', 'cheetah', 'cheetah',
        'cheetah', 'cheetah', 'tiger', 'cheetah', 'hyena', 'hyena',
        'hyena', 'cheetah', 'hyena', 'hyena', 'tiger', 'cheetah', 'hyena',
        'tiger', 'cheetah', 'tiger', 'wolf', 'tiger', 'tiger', 'tiger',
        'cheetah', 'lion', 'tiger', 'lion', 'cheetah', 'cheetah',
  

In [31]:
evaluate_model("/content/drive/MyDrive/ML_BOW_NBCmdl.pkl", "/content/drive/MyDrive/ML_BOW_codebook.pkl")

(0.38, array(['cheetah', 'tiger', 'cheetah', 'cheetah', 'wolf', 'cheetah',
        'cheetah', 'tiger', 'cheetah', 'tiger', 'cheetah', 'cheetah',
        'tiger', 'fox', 'hyena', 'cheetah', 'cheetah', 'cheetah', 'lion',
        'cheetah', 'cheetah', 'tiger', 'cheetah', 'cheetah', 'tiger',
        'tiger', 'tiger', 'cheetah', 'cheetah', 'lion', 'hyena', 'hyena',
        'cheetah', 'cheetah', 'tiger', 'tiger', 'hyena', 'hyena',
        'cheetah', 'hyena', 'tiger', 'cheetah', 'tiger', 'lion', 'cheetah',
        'cheetah', 'wolf', 'cheetah', 'cheetah', 'hyena', 'tiger',
        'cheetah', 'cheetah', 'hyena', 'cheetah', 'cheetah', 'tiger',
        'tiger', 'cheetah', 'cheetah', 'cheetah', 'cheetah', 'cheetah',
        'hyena', 'cheetah', 'tiger', 'lion', 'hyena', 'hyena', 'lion',
        'cheetah', 'fox', 'hyena', 'tiger', 'cheetah', 'tiger', 'cheetah',
        'cheetah', 'tiger', 'hyena', 'tiger', 'tiger', 'tiger', 'cheetah',
        'lion', 'tiger', 'fox', 'tiger', 'wolf', 'cheetah', 'chee

In [32]:
evaluate_model("/content/drive/MyDrive/ML_BOW_SVMmdl.pkl", "/content/drive/MyDrive/ML_BOW_codebook.pkl")

(0.41,
 array(['fox', 'cheetah', 'hyena', 'cheetah', 'wolf', 'cheetah', 'cheetah',
        'tiger', 'cheetah', 'cheetah', 'tiger', 'cheetah', 'cheetah',
        'hyena', 'hyena', 'cheetah', 'tiger', 'cheetah', 'lion', 'lion',
        'cheetah', 'cheetah', 'cheetah', 'cheetah', 'cheetah', 'cheetah',
        'cheetah', 'cheetah', 'cheetah', 'cheetah', 'hyena', 'cheetah',
        'cheetah', 'fox', 'tiger', 'tiger', 'hyena', 'hyena', 'cheetah',
        'fox', 'cheetah', 'cheetah', 'wolf', 'lion', 'wolf', 'cheetah',
        'cheetah', 'cheetah', 'cheetah', 'hyena', 'cheetah', 'tiger',
        'cheetah', 'hyena', 'cheetah', 'cheetah', 'cheetah', 'tiger',
        'cheetah', 'cheetah', 'cheetah', 'cheetah', 'hyena', 'cheetah',
        'cheetah', 'tiger', 'cheetah', 'hyena', 'hyena', 'lion', 'cheetah',
        'hyena', 'cheetah', 'lion', 'cheetah', 'hyena', 'lion', 'cheetah',
        'fox', 'hyena', 'tiger', 'lion', 'tiger', 'cheetah', 'lion',
        'tiger', 'fox', 'cheetah', 'cheetah', 'chee

In [33]:
from time import time
from sklearn.metrics import ConfusionMatrixDisplay

print("Predicting Image labels on the test set")
target_names = ['cheetah','fox','hyena','lion','tiger','wolf']
t0 = time()
acc ,y_pred = evaluate_model("/content/drive/MyDrive/ML_BOW_NBCmdl.pkl", "/content/drive/MyDrive/ML_BOW_codebook.pkl")
print("done in %0.3fs" % (time() - t0))

print(classification_report(test_labels, y_pred, target_names=target_names))
ConfusionMatrixDisplay.from_estimator(
    nbc_mdl, test_images, test_labels, display_labels=target_names, xticks_rotation="vertical"
)
plt.tight_layout()
plt.show()

Predicting Image labels on the test set
done in 8.588s
              precision    recall  f1-score   support

     cheetah       0.66      0.43      0.52       104
         fox       0.31      0.28      0.29        75
       hyena       0.30      0.33      0.31        92
        lion       0.37      0.30      0.33        88
       tiger       0.39      0.72      0.50        81
        wolf       0.29      0.23      0.25        79

    accuracy                           0.38       519
   macro avg       0.39      0.38      0.37       519
weighted avg       0.40      0.38      0.38       519



ValueError: ignored