In [1]:
"""
All cell images resized to 128 x 128
Images used for test are completely different that the ones used for training.
136 images for testing, each parasitized and uninfected (136 x 2)
104 images for training, each parasitized and uninfected (104 x 2)

Cannot import lots of data to Github, so uploaded 10 images of each.
Download full dataset from: ftp://lhcftp.nlm.nih.gov/Open-Access-Datasets/Malaria/cell_images.zip
"""


import cv2
import numpy as np
import os


# Get the training classes names and store them in a list
#Here we use folder names for class names

#train_path = 'dataset/train'  # Names are Aeroplane, Bicycle, Car
train_path = 'images/cell_images/train'  # Folder Names are Parasitized and Uninfected
training_names = os.listdir(train_path)

# Get path to all images and save them in a list
# image_paths and the corresponding label in image_paths
image_paths = []
image_classes = []
class_id = 0

#To make it easy to list all file names in a directory let us define a function
#
def imglist(path):    
    return [os.path.join(path, f) for f in os.listdir(path)]

#Fill the placeholder empty lists with image path, classes, and add class ID number
#
    
for training_name in training_names:
    dir = os.path.join(train_path, training_name)
    class_path = imglist(dir)
    image_paths+=class_path
    image_classes+=[class_id]*len(class_path)
    class_id+=1

In [26]:
print(image_classes)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [2]:
# Create feature extraction and keypoint detector objects
    #SIFT is not available anymore in openCV    
# Create List where all the descriptors will be stored
des_list = []

#BRISK is a good replacement to SIFT. ORB also works but didn;t work well for this example
brisk = cv2.BRISK_create(30)

for image_path in image_paths:
    im = cv2.imread(image_path)
    kpts, des = brisk.detectAndCompute(im, None)
    des_list.append((image_path, des)) 

   
    


In [6]:
print(des_list[1:])

[('images/cell_images/train\\Parasitized\\C37BP2_thinF_IMG_20150620_133111a_cell_88.png', array([[240, 255, 239, 243, 225,   0,   0,   0,   0,   0,  28, 255, 241,
        231,  31, 119, 156, 113, 228,  63,  12, 195,   0,   0,   0,   0,
          0,   0,   0, 100,  60, 227, 153,  39,  20,   0,   4,  16, 238,
        123, 198,  16,   3,   0,   0,   0,   0,   0,   0, 229, 114, 127,
          9,   0,   0, 200,  38, 219, 237,  37,   0,   0,   0,   0],
       [240, 255, 231, 255, 255, 192,   0,   0,   0, 134,  31, 255, 255,
        195,  14,  50, 200,  96, 135, 191, 252, 195,  48,  12,   0,   0,
          0, 128,  25, 103,  60, 251, 200,   1,   0,  64,   0,  99, 140,
        115, 198,  16,   3,   0,   0,   0,   0,   0,   0, 228,  50,  61,
         12, 203, 199,  68,  96, 195, 250,  37,  16,   0,   0,   0],
       [176, 255, 239, 251, 225,   0,   0,   0,   0,   0,  28, 239, 241,
        199,  31, 119, 156, 113, 215,  63,  12, 195,   0,   0,   0,   0,
          0,   0,   0, 108, 124, 227, 217,

In [40]:
# Stack all the descriptors vertically in a numpy array
descriptors = des_list[0][1]
for image_path, descriptor in des_list[1:]:
    descriptors = np.vstack((descriptors, descriptor))  

#kmeans works only on float, so convert integers to float
descriptors_float = descriptors.astype(float)  

# Perform k-means clustering and vector quantization
from scipy.cluster.vq import kmeans, vq

k = 200  #k means with 100 clusters gives lower accuracy for the aeroplane example
voc, variance = kmeans(descriptors_float, k, 1) 

# Calculate the histogram of features and represent them as vector
#vq Assigns codes from a code book to observations.
im_features = np.zeros((len(image_paths), k), "float32")
for i in range(len(image_paths)):
    words, distance = vq(des_list[i][1],voc)
    for w in words:
        im_features[i][w] += 1

# Perform Tf-Idf vectorization
nbr_occurences = np.sum( (im_features > 0) * 1, axis = 0)
idf = np.array(np.log((1.0*len(image_paths)+1) / (1.0*nbr_occurences + 1)), 'float32')

# Scaling the words
#Standardize features by removing the mean and scaling to unit variance
#In a way normalization
from sklearn.preprocessing import StandardScaler
stdSlr = StandardScaler().fit(im_features)
im_features = stdSlr.transform(im_features)

#Train an algorithm to discriminate vectors corresponding to positive and negative training images
# Train the Linear SVM
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter=10000)  #Default of 100 is not converging
clf.fit(im_features, np.array(image_classes))

#Train Random forest to compare how it does against SVM
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(n_estimators = 100, random_state=30)
clf2.fit(im_features, np.array(image_classes))




RandomForestClassifier(random_state=30)

In [41]:
test_path = 'images/cell_images/test'  # Folder Names are Parasitized and Uninfected
test_names = os.listdir(train_path)


test_image_paths = []
test_image_classes = []
test_class_id = 0

for test_name in test_names:
    dir = os.path.join(test_path, test_name)
    test_class_path = imglist(dir)
    test_image_paths+=test_class_path
    test_image_classes+=[test_class_id]*len(test_class_path)
    test_class_id+=1

In [42]:
print(test_image_classes)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [43]:
# Create feature extraction and keypoint detector objects
    #SIFT is not available anymore in openCV    
# Create List where all the descriptors will be stored
test_des_list = []

for test_image_path in test_image_paths:
    im = cv2.imread(test_image_path)
    kpts, des = brisk.detectAndCompute(im, None)
    test_des_list.append((test_image_path, des)) 

# Stack all the descriptors vertically in a numpy array
test_descriptors = test_des_list[0][1]
for test_image_path, test_descriptor in test_des_list[1:]:
    test_descriptors = np.vstack((test_descriptors, test_descriptor))  

#kmeans works only on float, so convert integers to float
test_descriptors_float = test_descriptors.astype(float)  




In [44]:
t_k = 200  #k means with 100 clusters gives lower accuracy for the aeroplane example
t_voc, t_variance = kmeans(test_descriptors_float, t_k, 1) 

test_im_features = np.zeros((len(test_image_paths), t_k), "float32")
for i in range(len(test_image_paths)):
    words, distance = vq(test_des_list[i][1],t_voc)
    for w in words:
        test_im_features[i][w] += 1

test_nbr_occurences = np.sum( (test_im_features > 0) * 1, axis = 0)
test_idf = np.array(np.log((1.0*len(test_image_paths)+1) / (1.0*test_nbr_occurences + 1)), 'float32')

test_stdSlr = StandardScaler().fit(test_im_features)
test_im_features = test_stdSlr.transform(test_im_features)


In [45]:
predictions = clf.predict(test_im_features)
predictions2 = clf2.predict(test_im_features)
print(predictions)
from sklearn.metrics import accuracy_score
print("LinearSVD:",accuracy_score(test_image_classes, predictions),end='\n')
print("RandomForest:",accuracy_score(test_image_classes, predictions2),end='\n')

[0 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 0 1 0 1]
LinearSVD: 0.7
RandomForest: 0.65


In [47]:
predictions_r = clf.predict(im_features)
predictions_r2 = clf2.predict(im_features)
from sklearn.metrics import accuracy_score
print("LinearSVDt:",accuracy_score(image_classes, predictions_r),end='\n')
print("RandomForestt:",accuracy_score(image_classes, predictions_r2),end='\n')

LinearSVDt: 1.0
RandomForestt: 1.0


In [32]:
print(test_image_classes)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Save the SVM
#Joblib dumps Python object into one file
import joblib
joblib.dump((clf, training_names, stdSlr, k, voc), "bovw.pkl", compress=3)   