# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,accuracy_score
from skimage.io import imread, imshow
from sklearn.linear_model import LogisticRegression

import os 

print("All imports done")

Using TensorFlow backend.


All imports done


## Define Directories

In [2]:
basedir = './dataset'
images_dir = os.path.join(basedir,'image')
labels_filename = 'label.csv'

## Data Preprocessing

In [3]:
def binary_labelling(): 
    labels_file = open(os.path.join(basedir, labels_filename), 'r')
    lines = labels_file.readlines()
    tumor_labels = {line.split(',')[0] : (line.split(',')[1].strip()) for line in lines[1:]}

    for i in tumor_labels: 
        if tumor_labels[i] == 'no_tumor': 
            tumor_labels[i] = 0
        else:
            tumor_labels[i] = 1    
    return(tumor_labels)


In [4]:
def extract_features():
    all_features = []
    all_labels = []
    
    labels = binary_labelling()

    image_paths = [os.path.join(images_dir, l) for l in os.listdir(images_dir)]
    print(images_dir)
    counter = 0
    if os.path.isdir(images_dir):
        all_features = []
        all_labels = []
        for img_path in image_paths:
            filename = img_path.split('/')[-1]
            img = imread(img_path, as_gray=True)
            features = np.reshape(img, (512*512))
            features.shape, features
            all_features.append(features)
            all_labels.append(labels[filename])
    np_features = np.array(all_features)
    np_labels = np.array(all_labels)
    return np_features, np_labels
        


## Feature extraction

In [5]:
def get_data(): 

    X, y = extract_features()

    Y = np.array([y, -(y - 1)]).T
    tr_X = X[:2400]
    tr_Y = Y[:2400]
    te_X = X[2400:]
    te_Y = Y[2400:]
    
    return tr_X, tr_Y, te_X, te_Y

## Classifier Implementation

In [7]:
tr_X, tr_Y, te_X, te_Y = get_data()

classifier = LogisticRegression(C = 0.1, solver='lbfgs')
classifier.fit(tr_X, list(zip(*tr_Y))[0])


{'IMAGE_0000.jpg': 1, 'IMAGE_0001.jpg': 0, 'IMAGE_0002.jpg': 1, 'IMAGE_0003.jpg': 1, 'IMAGE_0004.jpg': 1, 'IMAGE_0005.jpg': 1, 'IMAGE_0006.jpg': 0, 'IMAGE_0007.jpg': 1, 'IMAGE_0008.jpg': 1, 'IMAGE_0009.jpg': 1, 'IMAGE_0010.jpg': 1, 'IMAGE_0011.jpg': 1, 'IMAGE_0012.jpg': 1, 'IMAGE_0013.jpg': 1, 'IMAGE_0014.jpg': 1, 'IMAGE_0015.jpg': 1, 'IMAGE_0016.jpg': 1, 'IMAGE_0017.jpg': 1, 'IMAGE_0018.jpg': 1, 'IMAGE_0019.jpg': 0, 'IMAGE_0020.jpg': 0, 'IMAGE_0021.jpg': 1, 'IMAGE_0022.jpg': 1, 'IMAGE_0023.jpg': 1, 'IMAGE_0024.jpg': 1, 'IMAGE_0025.jpg': 1, 'IMAGE_0026.jpg': 1, 'IMAGE_0027.jpg': 0, 'IMAGE_0028.jpg': 1, 'IMAGE_0029.jpg': 1, 'IMAGE_0030.jpg': 1, 'IMAGE_0031.jpg': 1, 'IMAGE_0032.jpg': 1, 'IMAGE_0033.jpg': 1, 'IMAGE_0034.jpg': 1, 'IMAGE_0035.jpg': 1, 'IMAGE_0036.jpg': 0, 'IMAGE_0037.jpg': 1, 'IMAGE_0038.jpg': 1, 'IMAGE_0039.jpg': 1, 'IMAGE_0040.jpg': 1, 'IMAGE_0041.jpg': 1, 'IMAGE_0042.jpg': 0, 'IMAGE_0043.jpg': 1, 'IMAGE_0044.jpg': 1, 'IMAGE_0045.jpg': 1, 'IMAGE_0046.jpg': 1, 'IMAGE_0047.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=0.1)

In [8]:
pred = classifier.predict(te_X)
print("Accuracy:", accuracy_score(list(zip(*te_Y))[0], pred))

Accuracy: 0.935


In [9]:
rounded_labels = list(zip(*te_Y))[0]
cm = classification_report(rounded_labels, pred, target_names = ["no tumour", "tumour"])
print(cm)

              precision    recall  f1-score   support

   no tumour       0.86      0.61      0.71        79
      tumour       0.94      0.98      0.96       521

    accuracy                           0.94       600
   macro avg       0.90      0.80      0.84       600
weighted avg       0.93      0.94      0.93       600



## Testing on Test Set

### Pre-processing of testing data


In [10]:
testdir = './test'
test_images_dir = os.path.join(testdir,'image')
test_labels_filename = 'label.csv'

def binary_labelling_testset(): 
    labels_file = open(os.path.join(testdir, test_labels_filename), 'r')
    lines = labels_file.readlines()
    tumor_labels = {line.split(',')[0] : (line.split(',')[1].strip()) for line in lines[1:]}

    for i in tumor_labels: 
        if tumor_labels[i] == 'no_tumor': 
            tumor_labels[i] = 0
        else:
            tumor_labels[i] = 1    
    return(tumor_labels)

In [11]:
def extract_features_testset():
    all_features = []
    all_labels = []
    
    labels = binary_labelling_testset()

    image_paths = [os.path.join(test_images_dir, l) for l in os.listdir(test_images_dir)]
    if os.path.isdir(images_dir):
        all_features = []
        all_labels = []
        for img_path in image_paths:
            filename = img_path.split('/')[-1]
            img = imread(img_path, as_gray=True)
            features = np.reshape(img, (512*512))
            features.shape, features
            all_features.append(features)
            all_labels.append(labels[filename])
    np_features = np.array(all_features)
    np_labels = np.array(all_labels)
    return np_features, np_labels

### Testing

In [12]:
testing_x, testing_y = extract_features_testset()
testing_Y = np.array([testing_y, -(testing_y - 1)]).T

pred_testing = classifier.predict(testing_x)

print("Accuracy:", accuracy_score(list(zip(*testing_Y))[0], pred_testing))

rounded_testing_labels = list(zip(*testing_Y))[0]

c_report_testing = classification_report(rounded_testing_labels, pred_testing, target_names = ["no tumour", "tumour"])
print(c_report_testing)

Accuracy: 0.91
              precision    recall  f1-score   support

   no tumour       0.91      0.57      0.70        37
      tumour       0.91      0.99      0.95       163

    accuracy                           0.91       200
   macro avg       0.91      0.78      0.82       200
weighted avg       0.91      0.91      0.90       200

