# Import Libraries: 

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report, accuracy_score
from sklearn import svm
from skimage.io import imread, imshow
from keras.preprocessing import image

import os 

print("All imports done")

Using TensorFlow backend.


All imports done


# Define Directories

In [2]:
# Paths used to locate the training data: the dataset root, the label file
# name, and the folder holding the images inside the dataset root.
basedir = './dataset'
labels_filename = 'label.csv'
images_dir = os.path.join(basedir, 'image')

# Data preprocessing

In [3]:
def binary_labelling(labels_path=None):
    """Build a dictionary mapping each image file name to a binary label.

    The label CSV is read line by line. The first line is treated as a header
    and skipped; for every other line the first comma-separated field is the
    image name (key) and the second field, stripped of whitespace, is the
    label text.

    Args:
        labels_path: Path of the label CSV. When omitted, the file
            ``labels_filename`` inside ``basedir`` is used.

    Returns:
        dict: image name -> 0 when the label text is exactly 'no_tumor',
        1 for any other label text.
    """
    if labels_path is None:
        labels_path = os.path.join(basedir, labels_filename)

    tumor_labels = {}
    # The context manager guarantees the file handle is closed, even when a
    # line cannot be parsed.
    with open(labels_path, 'r') as labels_file:
        next(labels_file, None)  # skip the header line
        for line in labels_file:
            fields = line.split(',')
            if len(fields) < 2:
                # Blank or malformed line (e.g. trailing newline at end of file).
                continue
            tumor_labels[fields[0]] = 0 if fields[1].strip() == 'no_tumor' else 1

    return tumor_labels


In [4]:
def extract_features():
    """Load every training image as a flat grayscale vector with its label.

    Each file in ``images_dir`` is read with ``imread(..., as_gray=True)`` and
    reshaped to a one-dimensional array of 512*512 values; an image with a
    different number of pixels makes ``np.reshape`` raise. The label of each
    image is looked up by file name in the dictionary returned by
    ``binary_labelling()``, so a file without an entry raises ``KeyError``.

    Returns:
        tuple: ``(np_features, np_labels)`` - an array with one row of pixel
        values per image and an array with the matching 0/1 labels. Both are
        empty when ``images_dir`` is not a directory.
    """
    print(images_dir)

    all_features = []
    all_labels = []

    # Check the directory BEFORE listing it: os.listdir raises on a missing
    # directory, which would make the guard pointless.
    if os.path.isdir(images_dir):
        labels = binary_labelling()
        # os.listdir returns entries in arbitrary order. Callers split the
        # result by position, so sort to make the sample order reproducible.
        for filename in sorted(os.listdir(images_dir)):
            # Use the listed name directly as the label key instead of
            # splitting the joined path on '/', which fails when the platform
            # path separator is not '/'.
            img = imread(os.path.join(images_dir, filename), as_gray=True)
            # Flatten the image into a single feature vector of 512*512 values.
            all_features.append(np.reshape(img, (512 * 512)))
            all_labels.append(labels[filename])

    np_features = np.array(all_features)
    np_labels = np.array(all_labels)
    return np_features, np_labels
        


# Feature extraction

In [5]:
def get_data(n_train=2400):
    """Split the extracted features and labels into training and validation sets.

    The split is positional: the first ``n_train`` samples, in the order
    returned by ``extract_features()``, form the training set and all
    remaining samples form the validation set. No shuffling is performed.

    The labels are returned as a two-column matrix: column 0 is the original
    0/1 label and column 1 is its complement (``1 - label``).

    Args:
        n_train: Number of leading samples assigned to the training set.
            The default of 2400 corresponds to an 80%/20% split only when the
            dataset holds 3000 images.

    Returns:
        tuple: ``(tr_X, tr_Y, te_X, te_Y)`` - training features, training
        labels, validation features, validation labels.
    """
    X, y = extract_features()

    # Two-column label matrix: [label, 1 - label] per sample.
    Y = np.array([y, 1 - y]).T

    tr_X, te_X = X[:n_train], X[n_train:]
    tr_Y, te_Y = Y[:n_train], Y[n_train:]

    return tr_X, tr_Y, te_X, te_Y




# Running Classifier

In [6]:
# Assign features and labels to the corresponding variables
tr_X, tr_Y, te_X, te_Y = get_data()

# Create the classifier: a support vector classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

# Train the classifier on the training values.
# Column 0 of the label matrix holds the 0/1 class label (column 1 is its
# complement), so a plain column slice replaces transposing the whole matrix
# into Python tuples.
classifier.fit(tr_X, tr_Y[:, 0])

# Test the classifier on the validation set
pred = classifier.predict(te_X)

# Observe the predicted values (this is useful for troubleshooting)
print(pred)

print("Accuracy:", accuracy_score(te_Y[:, 0], pred))

{'IMAGE_0000.jpg': 1, 'IMAGE_0001.jpg': 0, 'IMAGE_0002.jpg': 1, 'IMAGE_0003.jpg': 1, 'IMAGE_0004.jpg': 1, 'IMAGE_0005.jpg': 1, 'IMAGE_0006.jpg': 0, 'IMAGE_0007.jpg': 1, 'IMAGE_0008.jpg': 1, 'IMAGE_0009.jpg': 1, 'IMAGE_0010.jpg': 1, 'IMAGE_0011.jpg': 1, 'IMAGE_0012.jpg': 1, 'IMAGE_0013.jpg': 1, 'IMAGE_0014.jpg': 1, 'IMAGE_0015.jpg': 1, 'IMAGE_0016.jpg': 1, 'IMAGE_0017.jpg': 1, 'IMAGE_0018.jpg': 1, 'IMAGE_0019.jpg': 0, 'IMAGE_0020.jpg': 0, 'IMAGE_0021.jpg': 1, 'IMAGE_0022.jpg': 1, 'IMAGE_0023.jpg': 1, 'IMAGE_0024.jpg': 1, 'IMAGE_0025.jpg': 1, 'IMAGE_0026.jpg': 1, 'IMAGE_0027.jpg': 0, 'IMAGE_0028.jpg': 1, 'IMAGE_0029.jpg': 1, 'IMAGE_0030.jpg': 1, 'IMAGE_0031.jpg': 1, 'IMAGE_0032.jpg': 1, 'IMAGE_0033.jpg': 1, 'IMAGE_0034.jpg': 1, 'IMAGE_0035.jpg': 1, 'IMAGE_0036.jpg': 0, 'IMAGE_0037.jpg': 1, 'IMAGE_0038.jpg': 1, 'IMAGE_0039.jpg': 1, 'IMAGE_0040.jpg': 1, 'IMAGE_0041.jpg': 1, 'IMAGE_0042.jpg': 0, 'IMAGE_0043.jpg': 1, 'IMAGE_0044.jpg': 1, 'IMAGE_0045.jpg': 1, 'IMAGE_0046.jpg': 1, 'IMAGE_0047.

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 0 

In [7]:
# Obtain the classification report on the validation set.
# Column 0 of te_Y is the 0/1 class label; slicing it directly avoids
# transposing the whole label matrix just to read one column.
rounded_labels = te_Y[:, 0]

# NOTE: despite its name, `cm` holds the text of the classification report,
# not a confusion matrix.
cm = classification_report(rounded_labels, pred, target_names=["no tumour", "tumour"])
print(cm)


              precision    recall  f1-score   support

   no tumour       0.83      0.66      0.73        79
      tumour       0.95      0.98      0.96       521

    accuracy                           0.94       600
   macro avg       0.89      0.82      0.85       600
weighted avg       0.93      0.94      0.93       600



## Testing on test set


### Preprocessing of test set


In [8]:
# The helpers used earlier are recreated below for the testing data; these are
# the paths they rely on: the test-set root, the label file name, and the
# folder holding the test images inside the test-set root.
testdir = './test'
test_labels_filename = 'label.csv'
test_images_dir = os.path.join(testdir, 'image')

def binary_labelling_testset(labels_path=None):
    """Build a dictionary mapping each test image file name to a binary label.

    The label CSV is read line by line. The first line is treated as a header
    and skipped; for every other line the first comma-separated field is the
    image name (key) and the second field, stripped of whitespace, is the
    label text.

    Args:
        labels_path: Path of the label CSV. When omitted, the file
            ``test_labels_filename`` inside ``testdir`` is used.

    Returns:
        dict: image name -> 0 when the label text is exactly 'no_tumor',
        1 for any other label text.
    """
    if labels_path is None:
        labels_path = os.path.join(testdir, test_labels_filename)

    tumor_labels = {}
    # The context manager guarantees the file handle is closed, even when a
    # line cannot be parsed.
    with open(labels_path, 'r') as labels_file:
        next(labels_file, None)  # skip the header line
        for line in labels_file:
            fields = line.split(',')
            if len(fields) < 2:
                # Blank or malformed line (e.g. trailing newline at end of file).
                continue
            tumor_labels[fields[0]] = 0 if fields[1].strip() == 'no_tumor' else 1

    return tumor_labels

In [9]:
def extract_features_testset():
    """Load every test image as a flat grayscale vector with its label.

    Each file in ``test_images_dir`` is read with ``imread(..., as_gray=True)``
    and reshaped to a one-dimensional array of 512*512 values; an image with a
    different number of pixels makes ``np.reshape`` raise. The label of each
    image is looked up by file name in the dictionary returned by
    ``binary_labelling_testset()``, so a file without an entry raises
    ``KeyError``.

    Returns:
        tuple: ``(np_features, np_labels)`` - an array with one row of pixel
        values per image and an array with the matching 0/1 labels. Both are
        empty when ``test_images_dir`` is not a directory.
    """
    all_features = []
    all_labels = []

    # Guard on the TEST image directory (the training directory was checked
    # here before), and do so before listing it since os.listdir raises on a
    # missing directory.
    if os.path.isdir(test_images_dir):
        labels = binary_labelling_testset()
        # Sorted so the sample order does not depend on the arbitrary order
        # returned by os.listdir.
        for filename in sorted(os.listdir(test_images_dir)):
            # Use the listed name directly as the label key instead of
            # splitting the joined path on '/', which fails when the platform
            # path separator is not '/'.
            img = imread(os.path.join(test_images_dir, filename), as_gray=True)
            # Flatten the image into a single feature vector of 512*512 values.
            all_features.append(np.reshape(img, (512 * 512)))
            all_labels.append(labels[filename])

    np_features = np.array(all_features)
    np_labels = np.array(all_labels)
    return np_features, np_labels

### Testing 

In [10]:
# Feature extraction from the testing data; note that this dataset does not
# need to be split.
testing_x, testing_y = extract_features_testset()

# Two-column label matrix: column 0 is the 0/1 label, column 1 its complement.
testing_Y = np.array([testing_y, -(testing_y - 1)]).T

# As the classifier is already trained, we only predict on the testing dataset.
pred_testing = classifier.predict(testing_x)

# Read the class label column once with a slice instead of transposing the
# whole label matrix into Python tuples for every use.
rounded_testing_labels = testing_Y[:, 0]

print("Accuracy:", accuracy_score(rounded_testing_labels, pred_testing))

c_report_testing = classification_report(rounded_testing_labels, pred_testing, target_names=["no tumour", "tumour"])
print(c_report_testing)

Accuracy: 0.91
              precision    recall  f1-score   support

   no tumour       0.81      0.68      0.74        37
      tumour       0.93      0.96      0.95       163

    accuracy                           0.91       200
   macro avg       0.87      0.82      0.84       200
weighted avg       0.91      0.91      0.91       200

