In [None]:
!pip install imutils

In [None]:
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import pandas as pd
import numpy as np 
import imutils
import cv2
import os

### Next, we are going to define two methods to take an input image and convert it to a feature vector, or a list of numbers that quantify the contents of an image. The first method can be seen below:

In [None]:
def image_to_feature_vector(image, size=(32, 32)):
	"""Turn an image into a flat vector of raw pixel intensities.

	The image is resized with ``cv2.resize`` using ``size`` as the target
	size, and the resized result is flattened into a one-dimensional array.
	"""
	resized = cv2.resize(image, size)
	return resized.flatten()

### We then define our second method, this one called extract_color_histogram:

In [None]:
def extract_color_histogram(image, bins=(8, 8, 8)):
	"""Describe an image by a flattened, normalized 3D HSV color histogram.

	The image is converted from BGR to HSV, a histogram over all three
	channels is computed using the supplied number of `bins` per channel,
	the histogram is normalized, and it is returned flattened as a
	one-dimensional feature vector.
	"""
	hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
	channels = [0, 1, 2]
	# (low, high) range pairs handed to calcHist, one pair per channel
	ranges = [0, 180, 0, 256, 0, 256]
	histogram = cv2.calcHist([hsv_image], channels, None, bins, ranges)

	if not imutils.is_cv2():
		# any other OpenCV version: normalize "in place" by passing the
		# histogram as both source and destination
		cv2.normalize(histogram, histogram)
	else:
		# OpenCV 2.4.X: normalize returns the normalized histogram
		histogram = cv2.normalize(histogram)

	# the flattened histogram is the feature vector
	return histogram.flatten()

In [None]:
# number of nearest neighbors (k) handed to the k-NN classifiers
neighbors = 2
# directory that is scanned for the training images
dataset = "/kaggle/working/train"

### We are now ready to prepare our images for feature extraction:

In [None]:
# grab the list of images that we'll be describing; the paths are sorted
# because directory listing order depends on the filesystem, and an
# unstable order would make the seeded train/test split differ between
# runs and machines
print("[INFO] describing images...")
imagePaths = sorted(paths.list_images(dataset))
print("[INFO] dataset has {} images".format(len(imagePaths)))

In [None]:
# start with empty accumulators for the raw pixel intensity vectors, the
# color-histogram feature vectors, and the class labels
rawImages, features, labels = [], [], []

### Let’s move on to extracting features from our dataset:

In [None]:
# loop over the input images; the accumulators are reset first so that
# re-running this cell does not append every image a second time
rawImages = []
features = []
labels = []
for (i, imagePath) in enumerate(imagePaths):
	# load the image and extract the class label (assuming that our
	# path has the format: /path/to/dataset/{class}.{image_num}.jpg)
	image = cv2.imread(imagePath)
	# cv2.imread returns None instead of raising when a file cannot be
	# read; skip such files rather than crash inside cv2.resize
	if image is None:
		print("[WARN] could not read {}, skipping".format(imagePath))
		continue
	label = imagePath.split(os.path.sep)[-1].split(".")[0]

	# extract raw pixel intensity "features", followed by a color
	# histogram to characterize the color distribution of the pixels
	# in the image
	pixels = image_to_feature_vector(image)
	hist = extract_color_histogram(image)

	# update the raw images, features, and labels lists together so the
	# three stay index-aligned
	rawImages.append(pixels)
	features.append(hist)
	labels.append(label)

	# show an update every 1,000 images
	if i > 0 and i % 1000 == 0:
		print("[INFO] processed {}/{}".format(i, len(imagePaths)))

### You might be curious how much memory our rawImages and features matrices take up — the following code block will tell us when executed:

In [None]:
# convert the accumulated lists to NumPy arrays and show how much memory
# the raw images matrix and the features matrix consume
rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)
# sizes are reported in binary megabytes (1 MB = 1024 * 1024 bytes)
print("[INFO] pixels matrix: {:.2f}MB".format(
	rawImages.nbytes / (1024 * 1024.0)))
print("[INFO] features matrix: {:.2f}MB".format(
	features.nbytes / (1024 * 1024.0)))

### Next, we need to take our data and partition it into two splits — one for training and another for testing:

In [None]:
# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing; a single
# call splits the pixel matrix, the histogram matrix and the labels along
# the same seeded shuffle
(trainRI, testRI, trainFeat, testFeat, trainRL, testRL) = train_test_split(
	rawImages, features, labels, test_size=0.25, random_state=42)
# both feature sets share the one label split
(trainLabels, testLabels) = (trainRL, testRL)

### Let’s apply the k-NN classifier to the raw pixel intensities:

In [None]:
# fit a k-NN classifier on the raw pixel intensities and report its
# accuracy on the held-out split
print("[INFO] evaluating raw pixel accuracy...")
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=neighbors)
model.fit(trainRI, trainRL)
acc = model.score(testRI, testRL)
print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100))

In [None]:
# train and evaluate a k-NN classifier on the color histogram features;
# this rebinds `model`, so later cells use the histogram-based classifier
print("[INFO] evaluating histogram accuracy...")
model = KNeighborsClassifier(n_neighbors=neighbors,
	n_jobs=-1)
model.fit(trainFeat, trainLabels)
acc = model.score(testFeat, testLabels)
print("[INFO] histogram accuracy: {:.2f}%".format(acc * 100))

As the figure above demonstrates, by utilizing raw pixel intensities we were able to reach 53.55% accuracy. On the other hand, applying k-NN to color histograms achieved a slightly better 58.27% accuracy.

In both cases, we were able to obtain > 50% accuracy, demonstrating there is an underlying pattern to the images for both raw pixel intensities and color histograms.

However, that 58% accuracy leaves much to be desired.

And as you might imagine, color histograms aren’t the best way to distinguish between a dog and a cat:

There are brown dogs. And there are brown cats.
There are black dogs. And there are black cats.
And certainly a dog and cat could appear in the same environment (such as a house, park, beach, etc.) where the background color distributions are similar.
Because of this, utilizing strictly color is not a great choice for characterizing the difference between dogs and cats — but that’s okay. The purpose of this blog post was simply to introduce the concept of image classification using the k-NN algorithm.


In [None]:
# directory that is scanned for the test images; this is a relative path,
# so it resolves against the notebook's current working directory
# NOTE(review): the training path is absolute ("/kaggle/working/train") —
# confirm both resolve under the same parent directory
testDataset = "../working/test1"

In [None]:
# collect the test image paths (sorted, because directory listing order
# depends on the filesystem and the predictions follow this order) and
# initialize the raw pixel intensities and histogram features lists
testPaths = sorted(paths.list_images(testDataset))
testImages = []
testFeatures = []

In [None]:
# loop over the test images; the accumulators are reset first so that
# re-running this cell does not append every image a second time
testImages = []
testFeatures = []
for (i, imagePath) in enumerate(testPaths):
	# load the image (no class label is extracted for the test images)
	image = cv2.imread(imagePath)
	# cv2.imread returns None instead of raising when a file cannot be
	# read; fail with a clear message, because silently skipping the file
	# would leave testFeatures misaligned with testPaths
	if image is None:
		raise IOError("could not read image: {}".format(imagePath))

	# extract raw pixel intensity "features", followed by a color
	# histogram to characterize the color distribution of the pixels
	# in the image
	pixels = image_to_feature_vector(image)
	hist = extract_color_histogram(image)

	# update the raw images and features lists, respectively
	testImages.append(pixels)
	testFeatures.append(hist)

	# show an update every 1,000 images
	if i > 0 and i % 1000 == 0:
		print("[INFO] processed {}/{}".format(i, len(testPaths)))

In [None]:
# classify the test histograms with the most recently fitted `model`
pred = model.predict(testFeatures)
# encode the predicted labels as integers: "dog" becomes 0, anything else 1
# NOTE(review): confirm this 0/1 assignment matches the label encoding the
# consumer of `pred` expects
pred = np.array([1 if name != "dog" else 0 for name in pred])

In [None]:
# display the encoded predictions (0 for "dog", 1 for any other predicted label)
pred