In [None]:
import os
import cv2
import numpy as np

# Accumulators for the training set; the loop further down appends one entry
# per class folder to each of them:
#   rawImages - flattened pixel vectors
#   features  - flattened HSV colour histograms
#   labels    - class labels (the folder names)
rawImages, features, labels = [], [], []

# Flatten an image into a one-dimensional vector.
def flatten_img(image):
    """Return a 1-D copy of the values in ``image``.

    Args:
        image: numpy array, e.g. the result of ``cv2.imread``.

    Returns:
        A new 1-D array with the elements of ``image`` in row-major order.

    Raises:
        ValueError: if ``image`` is None. ``cv2.imread`` returns None instead
            of raising when a file cannot be read, so the check turns an
            opaque AttributeError into an explicit message.
    """
    if image is None:
        raise ValueError("flatten_img received None; the image could not be read")
    return image.flatten()

# Colour-histogram feature computed with OpenCV.
def calc_histogram(image, bins=(8, 8, 8)):
    """Return a flattened, normalised 3-D HSV histogram of a BGR image.

    Args:
        image: image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2HSV``).
        bins: number of histogram bins for the H, S and V channels.

    Returns:
        1-D numpy array with ``bins[0] * bins[1] * bins[2]`` entries
        (512 for the default bins).
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Value ranges passed to calcHist: H in [0, 180), S and V in [0, 256).
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], [0, 1, 2], None, bins, channel_ranges)
    # Normalise in place using cv2.normalize's default settings.
    cv2.normalize(histogram, histogram)
    return histogram.flatten()


# Walk the training folder: every sub-directory is one class and its name is the label.
# NOTE: this is a machine-specific absolute path; point it at your local copy of the dataset.
folder_url = "/home/yaadava_kishore/Desktop/Data_Analytics_1_b/Fruit-Images-Dataset/Training"
files_list = {}
# os.listdir returns entries in arbitrary order; sorting keeps the class order
# (and therefore the row order of the arrays built below) reproducible.
for class_name in sorted(os.listdir(folder_url)):
    class_dir = os.path.join(folder_url, class_name)
    # Stray top-level files are not classes, and os.listdir on them would raise.
    if os.path.isdir(class_dir):
        files_list[class_dir] = sorted(os.listdir(class_dir))

# Build ONE sample per class: the mean flattened image and the mean histogram
# over all readable images in the class folder.
for dir_path, files in files_list.items():
    label = os.path.basename(dir_path)
    # 30000 presumably corresponds to 100 x 100 x 3 images - TODO confirm;
    # an image with a different number of values will fail to broadcast here.
    final_bare_img = np.zeros(30000)
    # 512 = 8 * 8 * 8, the length of calc_histogram's output with its default bins.
    final_hist_img = np.zeros(512)
    total_imgs = 0
    for file in files:
        # cv2.imread returns None (it does not raise) when a file cannot be read.
        img = cv2.imread(os.path.join(dir_path, file))
        if img is None:
            continue
        final_bare_img += flatten_img(img)
        final_hist_img += calc_histogram(img)
        total_imgs += 1
    if total_imgs == 0:
        # Dividing by zero would store an all-NaN vector and poison every distance.
        print(label, " ", "skipped (no readable images)")
        continue
    rawImages.append(final_bare_img / total_imgs)
    features.append(final_hist_img / total_imgs)
    labels.append(label)
    print(label, " ", total_imgs)

rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)


In [None]:
# Accumulators for the test set (one entry per class folder).
test_raw_images = []
test_features = []
test_labels = []

# Walk the test folder: every sub-directory is one class and its name is the label.
# NOTE: this is a machine-specific absolute path; point it at your local copy of the dataset.
folder_url = "/home/yaadava_kishore/Desktop/Data_Analytics_1_b/Fruit-Images-Dataset/Test"
files_list = {}
# os.listdir returns entries in arbitrary order; sorting keeps the class order
# (and therefore the row order of the arrays built below) reproducible.
for class_name in sorted(os.listdir(folder_url)):
    class_dir = os.path.join(folder_url, class_name)
    # Stray top-level files are not classes, and os.listdir on them would raise.
    if os.path.isdir(class_dir):
        files_list[class_dir] = sorted(os.listdir(class_dir))

# Build ONE sample per class: the mean flattened image and the mean histogram
# over all readable images in the class folder.
# NOTE(review): the evaluation therefore scores one averaged vector per class,
# not individual test images.
for dir_path, files in files_list.items():
    label = os.path.basename(dir_path)
    # 30000 presumably corresponds to 100 x 100 x 3 images - TODO confirm;
    # an image with a different number of values will fail to broadcast here.
    final_bare_img = np.zeros(30000)
    # 512 = 8 * 8 * 8, the length of calc_histogram's output with its default bins.
    final_hist_img = np.zeros(512)
    total_imgs = 0
    for file in files:
        # cv2.imread returns None (it does not raise) when a file cannot be read.
        img = cv2.imread(os.path.join(dir_path, file))
        if img is None:
            continue
        final_bare_img += flatten_img(img)
        final_hist_img += calc_histogram(img)
        total_imgs += 1
    if total_imgs == 0:
        # Dividing by zero would store an all-NaN vector and poison every distance.
        print(label, " ", "skipped (no readable images)")
        continue
    test_raw_images.append(final_bare_img / total_imgs)
    test_features.append(final_hist_img / total_imgs)
    test_labels.append(label)
    print(label, " ", total_imgs)

test_raw_images = np.array(test_raw_images)
test_features = np.array(test_features)
test_labels = np.array(test_labels)

In [22]:
#### KNN classifier class ####
# Structured, sklearn-style, class-based KNN written from scratch; the code is taken from:
# https://github.com/python-engineer/MLfromscratch/blob/master/mlfromscratch/knn.py

from collections import Counter
import numpy as np

# def manhattan_distance(x1, x2):
#     return sum(abs(x1-x2))

# from scipy.spatial.distance import euclidean
# dist = euclidean_distance(x1, x2)
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two equally shaped vectors.

    Args:
        x1: array-like of numbers.
        x2: array-like of numbers, broadcastable against ``x1``.

    Returns:
        The distance as a numpy float64 scalar.
    """
    # Cast to float64 before subtracting: with unsigned integer inputs
    # (e.g. uint8 pixel values) the difference and its square wrap around
    # and silently give a wrong distance.
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))


class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=3):
        """Store ``k``, the number of neighbours that take part in the vote."""
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``; no model is built."""
        self.X_train, self.y_train = X, y

    def predict(self, X):
        """Return a numpy array with one predicted label per sample in ``X``."""
        return np.array(list(map(self._predict, X)))

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        # Distance from x to every stored training sample.
        gaps = [euclidean_distance(x, sample) for sample in self.X_train]
        # Positions of the k closest training samples, nearest first.
        nearest = np.argsort(gaps)[: self.k]
        # Tally the labels of those neighbours, in nearest-first order.
        votes = Counter(self.y_train[idx] for idx in nearest)
        # The label with the highest count wins.
        return votes.most_common(1)[0][0]


def accuracy(y_true, y_pred):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Args:
        y_true: array-like of true labels.
        y_pred: array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Share of matching positions, multiplied by 100.

    Raises:
        ValueError: if the inputs differ in shape or are empty.
    """
    # Convert first: `==` on two plain lists compares them as a whole and
    # yields a single bool, which would silently report 0 or a bogus score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}")
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of zero samples")
    return np.sum(y_true == y_pred) / y_true.size * 100


In [23]:
# Fit a KNN classifier on the raw-pixel class vectors.
# NOTE(review): `labels` gets one entry per class folder, so the k neighbours
# all carry different labels and the vote is a tie; Counter.most_common then
# returns the first label it saw, i.e. the nearest neighbour's. k = 10
# therefore appears to behave like k = 1 here - confirm this is intended.
k = 10
X, y = rawImages, labels
clf_raw = KNN(k=k)
clf_raw.fit(X, y)

In [24]:
# Classify the raw-pixel test vectors and report the accuracy in percent.
predictions = clf_raw.predict(test_raw_images)
print(
    "KNN classification accuracy of raw Images",
    accuracy(y_true=test_labels, y_pred=predictions),
)

KNN classification accuracy of raw Images 73.2824427480916


In [25]:
# Fit a second KNN classifier, this time on the colour-histogram class vectors.
# NOTE(review): `labels` gets one entry per class folder, so the k neighbours
# all carry different labels and the vote is a tie; Counter.most_common then
# returns the first label it saw, i.e. the nearest neighbour's. k = 10
# therefore appears to behave like k = 1 here - confirm this is intended.
k = 10
X, y = features, labels
clf_hist = KNN(k=k)
clf_hist.fit(X, y)

In [26]:
# Classify the histogram test vectors and report the accuracy in percent.
predictions = clf_hist.predict(test_features)
# The message names the histogram features: this cell scores clf_hist, not the raw-pixel model.
print("KNN classification accuracy of histogram features", accuracy(test_labels, predictions))

KNN classification accuracy of raw Images 94.65648854961832
