## 1.Import libraries
The required libraries for this notebook are pandas, sklearn, numpy and matplotlib.

In [125]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog
from skimage.color import rgb2gray
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics

## 2.1 Edit label.csv data
Add column for binary label: 0 for no tumor, 1 for tumor

In [126]:
df = pd.read_csv('./dataset/label.csv')
df.head()

Unnamed: 0,file_name,label
0,IMAGE_0000.jpg,meningioma_tumor
1,IMAGE_0001.jpg,no_tumor
2,IMAGE_0002.jpg,meningioma_tumor
3,IMAGE_0003.jpg,glioma_tumor
4,IMAGE_0004.jpg,meningioma_tumor


In [127]:
if os.path.exists('./dataset/label_edited.pkl'):
    print('The pickle file with binary label already exists')
else:
    binary = []
    for label in df['label']:
        if 'no_tumor' in label:
            binary.append(0)
        else:
            binary.append(1)

    df['binary_label'] = binary

The pickle file with binary label already exists


## 2.2 Add image data
Add image data in the form of an array to the label data and save it in a new pickle file so it does not need to run everytime

In [128]:
if os.path.exists('./dataset/label_edited.pkl'):
    print('The pickle file with image data already exists')
else:
    data_path = './dataset/image'
    # create an empty column to store image data
    # df['data'] = df['data'].astype(object)
    hog_features = []
    for filename in df['file_name']:
        # use imread to load image from specified file name
        im = imread(os.path.join(data_path, filename))
        # use resize the image to a 128 by 64 pixel image
        resized_im = resize(im, (128*4, 64*4))
        # Extract Histogram of Oriented Gradients (HOG) for the image
        fd = hog(resized_im, orientations = 9, pixels_per_cell = (8, 8),
                            cells_per_block = (2, 2), visualize = False, block_norm='L2-Hys')
        # find index/row number of image file name
        # idx = df.loc[df['file_name'] == filename].index[0]
        # save HOG information in 'data' column
        # df.loc[idx, 'data'] = [fd] 
        # this resulted in an error: Must have equal len keys and value when setting with an ndarray
        hog_features.append(fd)
        
    df['data'] = hog_features
    df.to_pickle("./dataset/label_edited.pkl")

The pickle file with image data already exists


In [129]:
df_edited = pd.read_pickle("./dataset/label_edited.pkl")
# df_edited.head()

In [130]:
# # use np.unique to get all unique values in the list of labels
# labels = np.unique(df_edited['label'])

# # set up the matplotlib figure and axes, based on the number of labels
# fig, axes = plt.subplots(1, len(labels))
# fig.set_size_inches(15,4)
# fig.tight_layout()
 
# # make a plot for every label (equipment) type. The index method returns the 
# # index of the first item corresponding to its search string, label in this case
# for ax, label in zip(axes, labels):
#     idx = df_edited.loc[df_edited['label'] == label].index[0]
#     ax.imshow(df_edited.loc[idx, 'data'])
#     ax.axis('off')
#     ax.set_title(label)

In [131]:
X = df_edited['data']
Y = df_edited['binary_label']

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 3) 
#test_size= should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split
#everytime you run it without specifying random_state, you will get a different result, this is expected behavior
#print (len(X_test), len(y_test))

print('train set: {}  | test set: {}'.format(round(((len(y_train)*1.0)/len(X)),3),
                                                       round((len(y_test)*1.0)/len(X),3)))

train set: 0.7  | test set: 0.3


In [134]:
X_train

2543    [0.0, 0.0, 0.024435171795246135, 0.10347832772...
1046    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
287     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1317    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2311    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                              ...                        
2304    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
968     [0.0, 0.0, 0.0, 0.0, 0.10785785778655294, 0.0,...
1667    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1688    [0.7071067807446058, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1898    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: data, Length: 2100, dtype: object

In [133]:
# def KNNClassifier(X_train, y_train, X_test,k):

#     #Create KNN object with a K coefficient
#     neigh = KNeighborsClassifier(n_neighbors=k)
#     neigh.fit(X_train, y_train) # Fit KNN model


#     Y_pred = neigh.predict(X_test)
#     return Y_pred

# Y_pred = KNNClassifier(X_train, y_train, X_test, 4)
clf = svm.SVC()
clf.fit(X_train, y_train)
Y_pred = clf.predict(X_test)
# throws error: setting an array element with a sequence
# when getting X from dataframe, the resulting array is dtype object,
# but when getting X directly from hog_features, resulting array is dtype float??
score = metrics.accuracy_score(y_test, Y_pred)
print(score)

ValueError: setting an array element with a sequence.