# SVM Model for Tumour Classification
## A. Binary Task
### Build a classifier to identify whether there is a tumor in the MRI images.

Import the necessary libraries: numpy, matplotlib, scikit-learn, scipy and pandas.

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import classification_report,accuracy_score
from sklearn.decomposition import PCA
#from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import pandas as pd

Read label.csv; Define image data folder path

In [2]:
# Folder holding the image files; file names from label.csv are appended to this path
dir_image = "./dataset/image/"

# Label table; later cells read its `file_name` and `label` columns
csv_array = pd.read_csv("./dataset/label.csv")

PCA dimension reduction, step 1:
Find the number of principal components needed so that every image keeps more than 99% of its variance after dimension reduction. The result is that 102 components are needed.

In [3]:
rows = 3000                           # number of images to process

# pca_targets[i] stores, for image i, the position in the cumulative explained-variance
# curve at which the threshold is first exceeded
pca_targets = np.zeros(rows)
print(pca_targets.shape)

# Required cumulative explained variance, in percent (same value for every image)
threshold = 99

# For every image:
#   1. read it and keep only the first colour channel (used as the grey-scale image);
#   2. fit a PCA with 512 components on that single image;
#   3. record where the cumulative explained variance first exceeds the threshold.
# The largest value recorded over all images is then used as the common number of
# principal components for the whole data set.
for i in range(rows):
    # Read the image and drop every channel except the first
    img = mpimg.imread(dir_image + csv_array.file_name[i])[..., 0]

    # PCA with the full 512 components; fit() returns the fitted estimator
    pca_512 = PCA(n_components=512, random_state=2020).fit(img)

    # Cumulative explained variance in percent, one entry per component
    cumulative_variance = np.cumsum(pca_512.explained_variance_ratio_ * 100)

    # NOTE(review): this is a 0-based index, so the number of components that first
    # exceeds the threshold is this value + 1. It is left as-is because the next cell
    # sizes its arrays from pca_num while hard-coding 102 components.
    pca_targets[i] = np.flatnonzero(cumulative_variance > threshold)[0]

    if i % 500 == 0:     # progress indicator
        print(i)

# Largest value over all images, so that every image reaches the threshold
pca_num = pca_targets.max()
print("pca_num: ", pca_num)

#The code below plots the cumulative explained variance against the number of principal components.
#It is only needed to visualize the intermediate PCA result.
#It must be moved inside the loop above (it uses pca_512) before it can be executed.
    #print("Variance explained by all 512 principal components =", sum(pca_512.explained_variance_ratio_ * 100))
    #plt.plot(np.cumsum(pca_512.explained_variance_ratio_ * 100))
    #plt.xlabel('number of priciple components')
    #plt.ylabel('Explained variance')
    #plt.savefig('BrainTumorPCA512.png',dpi = 100)

(3000,)
0
500
1000
1500
2000
2500
pca_num:  102.0


Reduce each of the 3000 images from 512x512 to 512x102 with PCA. Each image is first reduced from RGB to grey scale (only the first channel is kept), then transformed, flattened into one row and stored in the "imgs" array.

In [5]:
# Number of principal components kept per image. It is taken from pca_num (computed in
# the previous step) instead of being hard-coded, so the PCA output width always matches
# the width of the `imgs` rows allocated below.
n_components = int(pca_num)

# One row per image: 512 image rows x n_components values, flattened.
# Assumes every image is 512 pixels high -- TODO confirm for new data sets.
imgs = np.zeros((rows, 512 * n_components))

for i in range(rows):
    # Read the image and keep only its first colour channel (used as grey scale)
    img = mpimg.imread(dir_image + csv_array.file_name[i])[..., 0]

    # A separate PCA is fitted on each individual image, then applied to that same image.
    # NOTE(review): the projection basis therefore differs from image to image -- confirm
    # this is intended before comparing features across images.
    pca_reduced = PCA(n_components=n_components, random_state=2020)
    pca_reduced.fit(img)
    img_reduced = pca_reduced.transform(img)

    # Store the reduced image as a single feature row
    imgs[i] = img_reduced.flatten()
    if i % 500 == 0:     # progress indicator
        print(i)

0
500
1000
1500
2000
2500


Modify csv_array.label for binary classification: every label other than "no_tumor" is replaced by "has_tumor", so only "no_tumor" and "has_tumor" remain.

In [6]:
# Collapse every label other than "no_tumor" into "has_tumor" so that only two classes
# remain for the binary classification task.
# A single masked .loc assignment replaces the per-row `csv_array.label[i] = ...` loop:
# that form is chained assignment, which raises SettingWithCopyWarning and does not
# modify csv_array when pandas Copy-on-Write is enabled.
# Only the first `rows` rows are touched, as in the original loop.
tumor_mask = (csv_array["label"] != "no_tumor") & (np.arange(len(csv_array)) < rows)
csv_array.loc[tumor_mask, "label"] = "has_tumor"

Split train/test data set by 9:1

In [7]:
# Hold out 10% of the samples for testing; random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    imgs,
    csv_array["label"][:rows],
    random_state=0,
    test_size=0.10,
)

Tune the hyperparameters C and kernel of the SVM model to get better classification accuracy (gamma is not part of the grid and keeps its default value).

In [28]:
# Candidate settings: the same C values are tried first with an RBF kernel,
# then with a linear kernel
param_grid = [
    {'C': [1, 5, 10, 20, 40], 'kernel': [kernel_name]}
    for kernel_name in ('rbf', 'linear')
]

# 10-fold stratified cross-validation, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
svc = SVC()

# Score every candidate by accuracy, using all available CPU cores
clf = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', cv=cv, n_jobs=-1)

# The search is run on the first 500 training samples only
result = clf.fit(x_train[:500], y_train[:500])

# Summarize the result
print(f'Best Score: {result.best_score_!s}')
print(f'Best Hyperparameters: {result.best_params_!s}')

Best Score: 0.8726666666666666
Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}


Use tuned parameters to build new model;
Train model and predict on test data set;
Print accuracy score;

In [29]:
# Build an SVM with the tuned hyperparameters (RBF kernel, C=10) and train it on the
# full training set
svc_tuned = SVC(C=10, kernel='rbf').fit(x_train, y_train)

# Predict the held-out test samples
y_pred = svc_tuned.predict(x_test)

# Last expression of the cell: accuracy of the predictions on the test set
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9333333333333333

In [None]:
#######################################################################################
##
## This part is not used. It is another way to tune hyperparameter (RandomizedSearchCV)
##
#######################################################################################

#param_distributions = {"C": uniform(1, 10)} #Randomly sample C from uniform(loc=1, scale=10), i.e. between 1 and 11
#model = SVC(kernel='rbf', gamma='scale')
#clf = RandomizedSearchCV(model,param_distributions, n_iter=10, random_state=0)
#search = clf.fit(x_train[0:300], y_train[0:300]) 
#search.best_estimator_   #SVC(C=5.236547993389047)

##Train SVM model (best_estimator_) and predict based on test data set. Print accuracy score.

#search.best_estimator_.fit(x_train, y_train)
#y_pred = search.best_estimator_.predict(x_test)
#accuracy_score(y_test, y_pred) #0.9333333333333333