In [4]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFilter
from skimage.transform import downscale_local_mean
from skimage import color
import json
import csv

Annotating a Sample Image

In [3]:
epidural_csv_file = 'segmentations/Results_Epidural Hemorrhage Detection_2020-11-16_21.31.26.148.csv'
epidural_image_repo = 'epidural/max_contrast_window'
epidural_image_file = 'ID_004c4b319.jpg'

In [4]:
def get_coords(image, image_csv_file, image_file_name):
    labels = get_labels_csv(image_csv_file, image_file_name)
    coord_list = []
    for label in labels:
        coord = (label['x'] * image.size[0], label['y'] * image.size[1])
        coord_list.append(coord)
    return coord_list
def get_labels_csv(image_csv_file, image_file_name):
    with open(image_csv_file) as csv_file:
        data_reader = csv.reader(csv_file, dialect='excel')
        for row in data_reader:
            if row[1] == image_file_name:
                return json.loads(row[7])
    return None

In [5]:
im = Image.open(f'{epidural_image_repo}/{epidural_image_file}')
features = []
for x in range(im.size[0]):
    for y in range(20):
        features.append(im.getpixel((x, y))[0])
        features.append(im.getpixel((x, y))[1])
        features.append(im.getpixel((x, y))[2])
draw = ImageDraw.Draw(im)
draw.line(get_coords(im, epidural_csv_file, epidural_image_file), fill='red')
im.show()

Reading an Image and Its Classification

In [5]:
num_examples = 10000
num_features = 16384
standard_height, standard_width = 512, 512

In [6]:
def get_malformed_images():
    images = []
    directories = ['subdural/max_contrast_window', 
                   'multi/max_contrast_window', 
                   'epidural/max_contrast_window', 
                   'intraparenchymal/max_contrast_window', 
                   'intraventricular/max_contrast_window', 
                   'subarachnoid/max_contrast_window']
    for directory in directories:
        for filename in os.listdir(directory):
            im = Image.open(f'{directory}/{filename}')
            width, height = im.size
            if width != standard_width and height != standard_height:
                images.append(filename[:-4])
    return images

In [7]:
malformed_images = get_malformed_images() 
print(len(malformed_images))

86


In [8]:
all_labels = pd.read_csv('segmentations/hemorrhage-labels.csv')

In [9]:
labels = all_labels[all_labels['any'] == 1]

In [10]:
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
14,ID_0002081b6,1,0,1,0,0,0
24,ID_0002a38ad,1,0,0,0,1,1
33,ID_000346ce2,1,0,0,0,0,1
36,ID_00042829c,1,0,0,1,0,0
43,ID_0004a5701,1,0,0,0,0,1
...,...,...,...,...,...,...,...
752755,ID_fffc60817,1,0,1,1,0,0
752769,ID_fffd00949,1,0,0,0,1,0
752783,ID_fffe2edb8,1,0,1,1,0,0
752799,ID_ffff922b9,1,0,0,1,0,0


In [11]:
for image_name in malformed_images:
    labels = labels[labels.Image != image_name]
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
14,ID_0002081b6,1,0,1,0,0,0
24,ID_0002a38ad,1,0,0,0,1,1
33,ID_000346ce2,1,0,0,0,0,1
36,ID_00042829c,1,0,0,1,0,0
43,ID_0004a5701,1,0,0,0,0,1
...,...,...,...,...,...,...,...
752755,ID_fffc60817,1,0,1,1,0,0
752769,ID_fffd00949,1,0,0,0,1,0
752783,ID_fffe2edb8,1,0,1,1,0,0
752799,ID_ffff922b9,1,0,0,1,0,0


In [12]:
labels = labels.iloc[:num_examples].reset_index(drop=True)

In [13]:
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_0002081b6,1,0,1,0,0,0
1,ID_0002a38ad,1,0,0,0,1,1
2,ID_000346ce2,1,0,0,0,0,1
3,ID_00042829c,1,0,0,1,0,0
4,ID_0004a5701,1,0,0,0,0,1
...,...,...,...,...,...,...,...
9995,ID_179d2ca85,1,0,0,1,0,0
9996,ID_179d7380f,1,0,0,0,0,1
9997,ID_179d9a649,1,0,0,1,0,0
9998,ID_179e1a9b9,1,0,0,0,0,1


In [14]:
def get_grayscale_image_data():
    return labels.apply(lambda row: get_data(row), axis=1, result_type='expand')
def get_grayscale_data(row):
    if row[2] + row[3] + row[4] + row[5] + row[6] > 1:
        return color.rgb2gray(plt.imread(f'multi/max_contrast_window/{row[0]}.jpg')).flatten()
    elif row[2] == 1:
        return color.rgb2gray(plt.imread(f'epidural/max_contrast_window/{row[0]}.jpg')).flatten()
    elif row[3] == 1:
        return color.rgb2gray(plt.imread(f'intraparenchymal/max_contrast_window/{row[0]}.jpg')).flatten()
    elif row[4] == 1:
        return color.rgb2gray(plt.imread(f'intraventricular/max_contrast_window/{row[0]}.jpg')).flatten()
    elif row[5] == 1:
        return color.rgb2gray(plt.imread(f'subarachnoid/max_contrast_window/{row[0]}.jpg')).flatten()
    elif row[6] == 1:
        return color.rgb2gray(plt.imread(f'subdural/max_contrast_window/{row[0]}.jpg')).flatten()
    return []
def get_downsampled_images():
    return labels.apply(lambda row: get_downsampled_data(row), axis=1, result_type='expand')
def get_downsampled_data(row):
    if row[2] + row[3] + row[4] + row[5] + row[6] > 1:
        temp = downscale_local_mean(
            color.rgb2gray(
                plt.imread(f'multi/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[2] == 1:
        temp = downscale_local_mean(
            color.rgb2gray(
                plt.imread(f'epidural/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[3] == 1:
        temp = downscale_local_mean(
            color.rgb2gray(
                plt.imread(f'intraparenchymal/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[4] == 1:
        temp = downscale_local_mean(
            color.rgb2gray(
                plt.imread(f'intraventricular/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[5] == 1:
        temp = downscale_local_mean(
            color.rgb2gray(
                plt.imread(f'subarachnoid/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[6] == 1:
        temp = downscale_local_mean(
            color.rgb2gray(
                plt.imread(f'subdural/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    else:
        temp = np.zeros(num_features)
    if temp.size != num_features:
        temp = np.zeros(num_features)
    return temp

In [15]:
image_data = get_downsampled_images()

In [16]:
image_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,0.108220,0.095720,0.082729,0.103808,0.081504,0.105524,0.080524,0.071945,0.085139,0.091757,...,0.270148,0.273564,0.278281,0.281942,0.282239,0.286479,0.284163,0.284163,0.285144,0.285144
1,0.105278,0.096455,0.091553,0.102092,0.113122,0.080033,0.085425,0.084690,0.097680,0.069249,...,0.247717,0.272717,0.263799,0.283897,0.298848,0.293456,0.314991,0.304206,0.321118,0.300775
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9997,0.055780,0.072550,0.075930,0.055780,0.050825,0.056025,0.064759,0.075789,0.045037,0.055086,...,0.591351,0.591351,0.583491,0.583491,0.583981,0.573687,0.566579,0.556775,0.538638,0.518540
9998,0.104053,0.106014,0.097680,0.075376,0.092043,0.089347,0.113612,0.108465,0.114102,0.098416,...,0.166310,0.167152,0.189595,0.182504,0.184447,0.190312,0.187236,0.194030,0.192138,0.192138


In [17]:
column_list = [i for i in range(num_features)]

In [18]:
total_data = pd.concat([labels, image_data], axis=1)

In [20]:
total_data = total_data[(total_data[column_list].sum(axis=1) != 0)]

In [21]:
total_data

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,0,1,2,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,ID_0002081b6,1,0,1,0,0,0,0.108220,0.095720,0.082729,...,0.270148,0.273564,0.278281,0.281942,0.282239,0.286479,0.284163,0.284163,0.285144,0.285144
1,ID_0002a38ad,1,0,0,0,1,1,0.105278,0.096455,0.091553,...,0.247717,0.272717,0.263799,0.283897,0.298848,0.293456,0.314991,0.304206,0.321118,0.300775
2,ID_000346ce2,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ID_00042829c,1,0,0,1,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ID_0004a5701,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ID_179d2ca85,1,0,0,1,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9996,ID_179d7380f,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9997,ID_179d9a649,1,0,0,1,0,0,0.055780,0.072550,0.075930,...,0.591351,0.591351,0.583491,0.583491,0.583981,0.573687,0.566579,0.556775,0.538638,0.518540
9998,ID_179e1a9b9,1,0,0,0,0,1,0.104053,0.106014,0.097680,...,0.166310,0.167152,0.189595,0.182504,0.184447,0.190312,0.187236,0.194030,0.192138,0.192138


In [227]:
# legacy function: loads data for a particular classification into a DataFrame
#def get_images(classification, max_size=1000000): 
#    classification_labels = labels[(labels[classification] == 1) & (labels[columns].sum(axis=1) == 1)]
#    data = np.zeros((classification_labels.shape[0], num_features))
#    for index, value in enumerate(classification_labels['Image'].items()):
#        data[index] = plt.imread(f'{classification}/max_contrast_window/{value[1]}.jpg').flatten()
#        if index == max_size:
#            break
#    return data
#epidural = get_images('epidural', 1000)
#intraparenchymal = get_images('intraparenchymal', 1000)
#intraventricular = get_images('intraventricular', 1000)
#subarachnoid = get_images('subarachnoid', 1000)
#subdural = get_images('subdural', 1000)

In [228]:
# legacy function: loads data for a the "multi" classification into a DataFrame
#def get_multi_class_images(max_size=1000000):
#    classification_labels = labels[(labels[columns].sum(axis=1) > 1)]
#    data = np.zeros((classification_labels.shape[0], num_features))
#    for index, value in enumerate(classification_labels['Image'].items()):
#        data[index] = plt.imread(f'multi/max_contrast_window/{value[1]}.jpg').flatten()
#        if index == max_size:
#            break
#    return data
#multi = get_multi_class_images(1000)

In [229]:
# the line of code below loads the 'all labels' field in the csv file in json
# coords_lists = json.loads(row[4].replace("'[", "[").replace("]'", "],").replace(',]', ']'))

Logistic Regression Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [23]:
total_data

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,0,1,2,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,ID_0002081b6,1,0,1,0,0,0,0.108220,0.095720,0.082729,...,0.270148,0.273564,0.278281,0.281942,0.282239,0.286479,0.284163,0.284163,0.285144,0.285144
1,ID_0002a38ad,1,0,0,0,1,1,0.105278,0.096455,0.091553,...,0.247717,0.272717,0.263799,0.283897,0.298848,0.293456,0.314991,0.304206,0.321118,0.300775
2,ID_000346ce2,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ID_00042829c,1,0,0,1,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ID_0004a5701,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ID_179d2ca85,1,0,0,1,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9996,ID_179d7380f,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9997,ID_179d9a649,1,0,0,1,0,0,0.055780,0.072550,0.075930,...,0.591351,0.591351,0.583491,0.583491,0.583981,0.573687,0.566579,0.556775,0.538638,0.518540
9998,ID_179e1a9b9,1,0,0,0,0,1,0.104053,0.106014,0.097680,...,0.166310,0.167152,0.189595,0.182504,0.184447,0.190312,0.187236,0.194030,0.192138,0.192138


In [24]:
X_train, X_test, y_train, y_test = train_test_split(total_data[column_list], total_data['epidural'], test_size=0.2, random_state=0)

In [25]:
epidural_logistic_model = LogisticRegression()
epidural_logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [28]:
y_predict = epidural_logistic_model.predict(X_test)

In [29]:
y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
confusion_matrix_epidural_logistic_regression = confusion_matrix(y_test, y_predict)

In [31]:
confusion_matrix_epidural_logistic_regression

array([[1917,   28],
       [  53,    2]])