In [1]:
import os
from os.path import exists
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFilter
from skimage.transform import downscale_local_mean
from skimage import color
import json
import csv

Annotating a Sample Image

In [3]:
# Inputs for the annotation demo: the segmentation-results CSV, the directory
# holding the epidural images, and the file name of one sample image.
epidural_csv_file = 'segmentations/Results_Epidural Hemorrhage Detection_2020-11-16_21.31.26.148.csv'
epidural_image_repo = 'epidural/max_contrast_window'
epidural_image_file = 'ID_004c4b319.jpg'

In [4]:
def get_coords(image, image_csv_file, image_file_name):
    """Return the annotation points of an image scaled to pixel coordinates.

    Args:
        image: Object exposing ``size`` as ``(width, height)``, e.g. a PIL image.
        image_csv_file: Path of the CSV handed to ``get_labels_csv``.
        image_file_name: File name used to find the image's row in the CSV.

    Returns:
        A list of ``(x, y)`` tuples. Each label's ``x`` is multiplied by the
        image width and its ``y`` by the image height (presumably the labels
        are stored as fractions of the image size -- confirm against the CSV).
        The list is empty when the CSV has no entry for ``image_file_name``.
    """
    labels = get_labels_csv(image_csv_file, image_file_name)
    if labels is None:
        # get_labels_csv signals "image not found" with None; looping over
        # None would raise a TypeError, so report "no points" instead.
        return []
    width = image.size[0]
    height = image.size[1]
    return [(label['x'] * width, label['y'] * height) for label in labels]
def get_labels_csv(image_csv_file, image_file_name):
    """Look up the JSON-encoded labels stored for one image in a results CSV.

    Args:
        image_csv_file: Path of the CSV file to scan.
        image_file_name: Value to match against column 1 of each row.

    Returns:
        The decoded JSON from column 7 of the first matching row, or ``None``
        when no row matches.

    Raises:
        json.JSONDecodeError: If the matching row's column 7 is not valid JSON.
    """
    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed as a single field.
    with open(image_csv_file, newline='') as csv_file:
        for row in csv.reader(csv_file, dialect='excel'):
            # Blank or truncated rows have no column 7; skip them instead of
            # failing with an IndexError.
            if len(row) > 7 and row[1] == image_file_name:
                return json.loads(row[7])
    return None

In [5]:
# Open the sample scan and collect the first three channel values of every
# pixel in its first 20 rows (y = 0..19), walking column by column.
im = Image.open(f'{epidural_image_repo}/{epidural_image_file}')
features = []
for x in range(im.size[0]):
    for y in range(20):
        pixel = im.getpixel((x, y))
        features += [pixel[0], pixel[1], pixel[2]]
# Connect the annotation points with a red line and display the result.
draw = ImageDraw.Draw(im)
outline = get_coords(im, epidural_csv_file, epidural_image_file)
draw.line(outline, fill='red')
im.show()

Reading an Image and Its Classification

In [2]:
# Upper bound on the number of rows sampled per class.
num_examples = 1000
# Length of one flattened feature vector: a 512x512 image mean-pooled over
# 4x4 blocks leaves 128 * 128 = 16384 values.
num_features = 16384
# Expected image dimensions in pixels.
standard_height = 512
standard_width = 512
# Hemorrhage subtypes; each name is also a column of the label table.
types = [
    'epidural',
    'intraparenchymal',
    'intraventricular',
    'subarachnoid',
    'subdural',
]

In [5]:
def get_valid_normal_images():
    """List the image IDs found in the normal-scan directory.

    Returns:
        One entry per file in ``normal/max_contrast_window`` that exists on
        disk, with the last four characters of the file name (the extension,
        e.g. ``.jpg``) stripped.
    """
    normal_dir = 'normal/max_contrast_window'
    image_ids = []
    for entry in os.listdir(normal_dir):
        if not exists(f'{normal_dir}/{entry}'):
            continue
        image_ids.append(entry[:-4])
    return image_ids

In [4]:
def get_malformed_images():
    """Return the IDs of images whose dimensions differ from the standard size.

    Every file in each class directory is opened and its size is compared with
    ``standard_width`` x ``standard_height``.

    Returns:
        A list of file names with their last four characters (the extension,
        e.g. ``.jpg``) removed, one per image that is not the standard size.
    """
    directories = ['normal/max_contrast_window',
                   'subdural/max_contrast_window',
                   'multi/max_contrast_window',
                   'epidural/max_contrast_window',
                   'intraparenchymal/max_contrast_window',
                   'intraventricular/max_contrast_window',
                   'subarachnoid/max_contrast_window']
    images = []
    for directory in directories:
        for filename in os.listdir(directory):
            # The context manager releases the file handle as soon as the
            # size has been read; thousands of files are scanned here.
            with Image.open(f'{directory}/{filename}') as im:
                width, height = im.size
            # An image is malformed when EITHER dimension is off; requiring
            # both to differ would let e.g. a 512-wide, non-512-high scan pass.
            if width != standard_width or height != standard_height:
                images.append(filename[:-4])
    return images

In [6]:
# Scan every class directory once and report how many images are not the standard size.
malformed_images = get_malformed_images() 
print(len(malformed_images))

87


In [7]:
# Collect the IDs of the normal scans that are present on disk and report the count.
valid_normal_images = get_valid_normal_images()
print(len(valid_normal_images))

8599


In [8]:
# Load the per-image label table: one row per image ID with a 0/1 column per hemorrhage type.
labels = pd.read_csv('segmentations/hemorrhage-labels.csv')
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
752798,ID_ffff82e46,0,0,0,0,0,0
752799,ID_ffff922b9,1,0,0,1,0,0
752800,ID_ffffb670a,1,0,0,0,1,0
752801,ID_ffffcbff8,0,0,0,0,0,0


In [10]:
# Keep rows that are either a normal scan present on disk or flagged with any
# hemorrhage. The comparison is parenthesised because `|` binds tighter than
# `==`: without the parentheses the expression is evaluated as
# `(isin(...) | labels['any']) == 1`, which only works by accident for 0/1 data.
labels = labels[labels.Image.isin(valid_normal_images) | (labels['any'] == 1)]
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
752755,ID_fffc60817,1,0,1,1,0,0
752769,ID_fffd00949,1,0,0,0,1,0
752783,ID_fffe2edb8,1,0,1,1,0,0
752799,ID_ffff922b9,1,0,0,1,0,0


In [11]:
# Exclude every image whose ID was collected in malformed_images.
labels = labels[~labels.Image.isin(malformed_images)]
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
752755,ID_fffc60817,1,0,1,1,0,0
752769,ID_fffd00949,1,0,0,0,1,0
752783,ID_fffe2edb8,1,0,1,1,0,0
752799,ID_ffff922b9,1,0,0,1,0,0


In [12]:
# Take the first num_examples rows that have no hemorrhage flagged (any == 0).
normal_labels = labels[labels['any'] == 0].iloc[:num_examples]
normal_labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
1157,ID_00645f5a5,0,0,0,0,0,0
1159,ID_0064b170f,0,0,0,0,0,0
1160,ID_0064c845f,0,0,0,0,0,0
1161,ID_0064d849b,0,0,0,0,0,0


In [13]:
# For each hemorrhage type, keep at most num_examples rows whose column for
# that type equals 1; the result maps type name -> label rows.
hemorrhage_labels = {
    hemorrhage: labels[labels[hemorrhage] == 1].iloc[:num_examples]
    for hemorrhage in types
}
hemorrhage_labels['epidural']

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
164,ID_000edbf38,1,1,0,0,0,0
271,ID_00178eb80,1,1,0,0,0,0
320,ID_001bb2c00,1,1,0,0,0,0
443,ID_0026de01c,1,1,0,0,0,0
828,ID_004966d37,1,1,1,1,0,0
...,...,...,...,...,...,...,...
235972,ID_502311e49,1,1,0,0,0,0
236034,ID_5027c16cb,1,1,0,0,0,0
236307,ID_503e03b58,1,1,0,0,0,0
236312,ID_503e6a75c,1,1,0,0,1,1


In [14]:
# Stack the normal rows and the per-type hemorrhage rows into one table.
# pd.concat replaces the DataFrame.append loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration. The original
# index values are kept, exactly as append did.
final_labels = pd.concat(
    [normal_labels] + [hemorrhage_labels[hemorrhage] for hemorrhage in types]
)
final_labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
15420,ID_0547b8984,1,0,0,0,0,1
15423,ID_0547e38e8,1,0,0,0,0,1
15427,ID_054829f1e,1,0,0,0,1,1
15446,ID_054984567,1,0,0,0,0,1


In [15]:
# Remove exact duplicate rows (an image flagged with several types was added
# once per type), then renumber the index from 0.
final_labels = final_labels.drop_duplicates().reset_index(drop=True)
final_labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
4959,ID_0546ed6b7,1,0,0,0,0,1
4960,ID_0547983e4,1,0,0,0,0,1
4961,ID_0547b8984,1,0,0,0,0,1
4962,ID_0547e38e8,1,0,0,0,0,1


In [28]:
def get_grayscale_image_data():
    """Build the full-resolution grayscale feature table for ``final_labels``.

    Returns:
        The result of applying ``get_grayscale_data`` to every row of
        ``final_labels`` with ``result_type='expand'``, i.e. one column per
        element of the returned sequence.
    """
    # The original called an undefined name, get_data, and raised NameError.
    return final_labels.apply(get_grayscale_data, axis=1, result_type='expand')
def get_grayscale_data(row):
    """Load the flattened grayscale pixels of the image described by a label row.

    Args:
        row: Sequence or pandas Series ordered as (image id, any, epidural,
            intraparenchymal, intraventricular, subarachnoid, subdural).

    Returns:
        The flattened grayscale image read from
        ``<directory>/max_contrast_window/<image id>.jpg``, where the directory
        is ``normal`` when ``any`` is 0, ``multi`` when more than one type is
        flagged, and otherwise the single flagged type. Returns ``[]`` when
        ``any`` is non-zero but no type is flagged.
    """
    # Unpack by position via list() so the code does not depend on the
    # deprecated integer-key fallback of label-indexed pandas Series.
    image_id, any_flag, *type_flags = list(row)[:7]
    type_names = ('epidural', 'intraparenchymal', 'intraventricular',
                  'subarachnoid', 'subdural')
    if any_flag == 0:
        # Normal scans live in their own directory, as in get_downsampled_data.
        directory = 'normal'
    elif sum(type_flags) > 1:
        directory = 'multi'
    else:
        directory = next(
            (name for name, flag in zip(type_names, type_flags) if flag == 1),
            None)
    if directory is None:
        return []
    image = plt.imread(f'{directory}/max_contrast_window/{image_id}.jpg')
    return color.rgb2gray(image).flatten()
def get_downsampled_images():
    """Build the downsampled feature table for ``final_labels``.

    Returns:
        The result of applying ``get_downsampled_data`` row by row with
        ``result_type='expand'``, so each element of the returned vector
        becomes its own column.
    """
    expanded = final_labels.apply(
        get_downsampled_data,
        axis=1,
        result_type='expand',
    )
    return expanded
def get_downsampled_data(row):
    """Load one image, convert it to grayscale and mean-pool it by 4x4 blocks.

    Args:
        row: pandas Series (its ``name`` is used for progress output) ordered
            as (image id, any, epidural, intraparenchymal, intraventricular,
            subarachnoid, subdural).

    Returns:
        A flattened array of ``num_features`` values. The image is read from
        ``<directory>/max_contrast_window/<image id>.jpg``, where the directory
        is ``normal`` when ``any`` is 0, ``multi`` when more than one type is
        flagged, and otherwise the single flagged type. An all-zero vector is
        returned when no directory applies or when the downsampled image does
        not have exactly ``num_features`` values.
    """
    # Progress indicator: print the row label every 100 rows.
    if int(row.name) % 100 == 0:
        print(row.name)
    # Unpack by position via list() so the code does not depend on the
    # deprecated integer-key fallback of label-indexed pandas Series.
    image_id, any_flag, *type_flags = list(row)[:7]
    type_names = ('epidural', 'intraparenchymal', 'intraventricular',
                  'subarachnoid', 'subdural')
    if any_flag == 0:
        directory = 'normal'
    elif sum(type_flags) > 1:
        directory = 'multi'
    else:
        directory = next(
            (name for name, flag in zip(type_names, type_flags) if flag == 1),
            None)
    if directory is None:
        return np.zeros(num_features)
    image = plt.imread(f'{directory}/max_contrast_window/{image_id}.jpg')
    temp = downscale_local_mean(color.rgb2gray(image), (4, 4)).flatten()
    # Images of unexpected size yield a different length; replace them with
    # zeros so every row has the same number of columns.
    if temp.size != num_features:
        temp = np.zeros(num_features)
    return temp

In [29]:
# Build the feature table: one downsampled, flattened image per row of
# final_labels (the row index is printed every 100 rows as progress).
image_data = get_downsampled_images()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


In [30]:
# Show the feature table (one column per downsampled pixel).
image_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.140331,0.167292,0.140821,0.123174,0.133713,0.145723,0.145968,0.135919,0.152341,0.141802,...,0.194869,0.206861,0.218407,0.202475,0.185635,0.180681,0.187481,0.203468,0.198811,0.192507
4,0.025176,0.023792,0.022725,0.013271,0.013980,0.014715,0.011721,0.007694,0.007085,0.005036,...,0.618105,0.606711,0.590844,0.585452,0.575893,0.562903,0.551240,0.529671,0.521828,0.516681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959,0.061078,0.060640,0.056614,0.060046,0.071477,0.052552,0.047702,0.070147,0.055634,0.067592,...,0.356424,0.371865,0.367208,0.354218,0.337796,0.354953,0.345884,0.361571,0.368188,0.360590
4960,0.046659,0.074355,0.079992,0.063325,0.080482,0.093963,0.075090,0.068963,0.072873,0.057677,...,0.091022,0.065776,0.080482,0.088080,0.084159,0.097394,0.084894,0.102786,0.090531,0.095924
4961,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4962,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [31]:
# Join labels and pixel features side by side; pd.concat with axis=1 aligns rows on the index.
total_data = pd.concat([final_labels, image_data], axis=1)

In [32]:
# Drop rows whose feature columns sum to exactly zero, which is what the
# all-zero placeholder vectors from get_downsampled_data produce.
feature_columns = list(range(num_features))
has_signal = total_data[feature_columns].sum(axis=1) != 0
total_data = total_data[has_signal]
total_data

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,0,1,2,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,ID_000012eaf,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ID_000039fa0,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ID_00005679d,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ID_00008ce3c,0,0,0,0,0,0,0.140331,0.167292,0.140821,...,0.194869,0.206861,0.218407,0.202475,0.185635,0.180681,0.187481,0.203468,0.198811,0.192507
4,ID_0000950d7,0,0,0,0,0,0,0.025176,0.023792,0.022725,...,0.618105,0.606711,0.590844,0.585452,0.575893,0.562903,0.551240,0.529671,0.521828,0.516681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959,ID_0546ed6b7,1,0,0,0,0,1,0.061078,0.060640,0.056614,...,0.356424,0.371865,0.367208,0.354218,0.337796,0.354953,0.345884,0.361571,0.368188,0.360590
4960,ID_0547983e4,1,0,0,0,0,1,0.046659,0.074355,0.079992,...,0.091022,0.065776,0.080482,0.088080,0.084159,0.097394,0.084894,0.102786,0.090531,0.095924
4961,ID_0547b8984,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4962,ID_0547e38e8,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [18]:
# legacy function: loads the images of a single classification into a NumPy array (one row per image)
#def get_images(classification, max_size=1000000): 
#    classification_labels = labels[(labels[classification] == 1) & (labels[columns].sum(axis=1) == 1)]
#    data = np.zeros((classification_labels.shape[0], num_features))
#    for index, value in enumerate(classification_labels['Image'].items()):
#        data[index] = plt.imread(f'{classification}/max_contrast_window/{value[1]}.jpg').flatten()
#        if index == max_size:
#            break
#    return data
#epidural = get_images('epidural', 1000)
#intraparenchymal = get_images('intraparenchymal', 1000)
#intraventricular = get_images('intraventricular', 1000)
#subarachnoid = get_images('subarachnoid', 1000)
#subdural = get_images('subdural', 1000)

In [19]:
# legacy function: loads the images of the "multi" classification into a NumPy array (one row per image)
#def get_multi_class_images(max_size=1000000):
#    classification_labels = labels[(labels[columns].sum(axis=1) > 1)]
#    data = np.zeros((classification_labels.shape[0], num_features))
#    for index, value in enumerate(classification_labels['Image'].items()):
#        data[index] = plt.imread(f'multi/max_contrast_window/{value[1]}.jpg').flatten()
#        if index == max_size:
#            break
#    return data
#multi = get_multi_class_images(1000)

In [20]:
# the line of code below loads the 'all labels' field in the csv file in json
# coords_lists = json.loads(row[4].replace("'[", "[").replace("]'", "],").replace(',]', ']'))

Logistic Regression Model

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


In [22]:
# Inspect the combined label + feature table before modelling.
total_data

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,0,1,2,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,ID_0002081b6,1,0,1,0,0,0,0.108220,0.095720,0.082729,...,0.270148,0.273564,0.278281,0.281942,0.282239,0.286479,0.284163,0.284163,0.285144,0.285144
1,ID_0002a38ad,1,0,0,0,1,1,0.105278,0.096455,0.091553,...,0.247717,0.272717,0.263799,0.283897,0.298848,0.293456,0.314991,0.304206,0.321118,0.300775
2,ID_000346ce2,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ID_00042829c,1,0,0,1,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ID_0004a5701,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ID_179d2ca85,1,0,0,1,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9996,ID_179d7380f,1,0,0,0,0,1,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9997,ID_179d9a649,1,0,0,1,0,0,0.055780,0.072550,0.075930,...,0.591351,0.591351,0.583491,0.583491,0.583981,0.573687,0.566579,0.556775,0.538638,0.518540
9998,ID_179e1a9b9,1,0,0,0,0,1,0.104053,0.106014,0.097680,...,0.166310,0.167152,0.189595,0.182504,0.184447,0.190312,0.187236,0.194030,0.192138,0.192138


In [23]:
# The feature columns of total_data are labelled 0 .. num_features - 1.
# column_list was not defined anywhere in the notebook, so a fresh kernel
# stopped here with a NameError; define it explicitly.
column_list = list(range(num_features))
# Hold out 20% of the rows; the target is the 0/1 'epidural' column.
X_train, X_test, y_train, y_test = train_test_split(
    total_data[column_list], total_data['epidural'], test_size=0.2, random_state=0)

In [47]:
# Standardise each feature, then fit a logistic regression with the 'sag' solver.
# 'sag' shuffles the data, so random_state is pinned to make reruns reproducible.
# NOTE(review): the saved output of this cell reports max_iter=250 while the
# code says 100 -- confirm which value produced the recorded scores.
epidural_logistic_pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, solver='sag', random_state=0),
)
epidural_logistic_pipe.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(max_iter=250, solver='sag'))])

In [48]:
# Mean accuracy of the fitted pipeline on the held-out split.
epidural_logistic_pipe.score(X_test, y_test)

0.9715

In [49]:
# Predicted class labels for the held-out split.
y_predict = epidural_logistic_pipe.predict(X_test)

In [50]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument).
confusion_matrix_epidural_logistic = confusion_matrix(y_test, y_predict)

In [51]:
# Display the matrix.
# NOTE(review): in the saved output only 1 of the 55 samples in the second row
# falls in the second column, so the high accuracy appears to reflect mostly
# the majority class -- consider reporting recall/precision as well.
confusion_matrix_epidural_logistic

array([[1942,    3],
       [  54,    1]])