In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import os
import socket
import cv2
import scipy
from sklearn.model_selection import GridSearchCV

import PIL
from PIL import Image

# get data from folder
def get_data(folder):
    """
    Load the data and labels from the given folder.
    """
    size = 80, 80, 3
    X = []
    y = []
    for wbc_type in os.listdir(folder):
        if not wbc_type.startswith('.'):
            if wbc_type in ['NEUTROPHIL']:
                label = 0
            elif wbc_type in ['EOSINOPHIL']:
                label = 1
            elif wbc_type in ['MONOCYTE']:
                label = 2
            elif wbc_type in ['LYMPHOCYTE']:
                label = 3

            for image_filename in os.listdir(folder + wbc_type):
                img_file = Image.open(folder + wbc_type + '/' + image_filename)
                if img_file is not None:
                    img_file.thumbnail(size)
                    img_arr = np.asarray(img_file)
                    X.append(img_arr)
                    y.append(label)

    X = np.asarray(X)
    print(X.shape)
    y = np.asarray(y)
    nb_classes = 4
    targets = y.reshape(-1)
    yy = np.eye(nb_classes)[targets]
    return X, yy,y

X_train, y_train_nn,y_train = get_data('train//')
X_test, y_test_nn,y_test = get_data('test//')
X_val, y_val_nn,y_val = get_data('validation//')

#Make dataset inputs into correct dimensions form.
def convert(x):
    nsamples, nx, ny,dim = x.shape
    return (x.reshape((nsamples,nx*ny*dim)),x.reshape((nsamples,nx*ny,dim)))

X_train_new=convert(X_train)[0]
X_val_new=convert(X_val)[0]
X_test_new=convert(X_test)[0]

y_trainHot_new=y_train
y_valHot_new=y_val
y_testHot_new=y_test

(1000, 60, 80, 3)
(55, 60, 80, 3)
(55, 60, 80, 3)


In [7]:
#Implement KNN and use GridSearchCV to find the best parameters.
# define parameter
knn = KNeighborsClassifier()

#Define the range of neighbours' number, leaf range of training stop, distance and weights of neighbourss.
k_range = list(range(3,6))
leaf_range = list(range(2,5))
weight_options = ['uniform']
algorithm_options = ['auto']

param_gridknn = dict(n_neighbors = k_range,weights = weight_options,algorithm=algorithm_options,leaf_size=leaf_range)

# grid search for the best hyperparameters
gridKNN = GridSearchCV(knn,param_gridknn,cv=10,scoring='accuracy',verbose=1)
gridKNN.fit(X_train_new,y_trainHot_new)

print('best score is:',str(gridKNN.best_score_))
print('best params are:',str(gridKNN.best_params_))

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 19.9min finished


best score is: 0.539
best params are: {'algorithm': 'auto', 'leaf_size': 2, 'n_neighbors': 3, 'weights': 'uniform'}


In [8]:
# use the best hyperparameters to predict testing dataset
from sklearn import metrics

knn_pre=KNeighborsClassifier(n_neighbors=3,leaf_size=2,weights='uniform',algorithm='auto')
knn_pre.fit(X_train_new,y_trainHot_new)
y_pred=knn_pre.predict(X_test_new)

print(metrics.accuracy_score(y_testHot_new,y_pred))

0.5272727272727272
