In [1]:
import os
from os.path import exists
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFilter
from skimage.transform import downscale_local_mean
from skimage import color
import json
import csv

Annotating a Sample Image

In [3]:
epidural_csv_file = 'segmentations/Results_Epidural Hemorrhage Detection_2020-11-16_21.31.26.148.csv'
epidural_image_repo = 'epidural/max_contrast_window'
epidural_image_file = 'ID_004c4b319.jpg'

In [4]:
def get_coords(image, image_csv_file, image_file_name):
    labels = get_labels_csv(image_csv_file, image_file_name)
    coord_list = []
    for label in labels:
        coord = (label['x'] * image.size[0], label['y'] * image.size[1])
        coord_list.append(coord)
    return coord_list
def get_labels_csv(image_csv_file, image_file_name):
    with open(image_csv_file) as csv_file:
        data_reader = csv.reader(csv_file, dialect='excel')
        for row in data_reader:
            if row[1] == image_file_name:
                return json.loads(row[7])
    return None

In [5]:
im = Image.open(f'{epidural_image_repo}/{epidural_image_file}')
features = []
for x in range(im.size[0]):
    for y in range(20):
        features.append(im.getpixel((x, y))[0])
        features.append(im.getpixel((x, y))[1])
        features.append(im.getpixel((x, y))[2])
draw = ImageDraw.Draw(im)
draw.line(get_coords(im, epidural_csv_file, epidural_image_file), fill='red')
im.show()

Reading an Image and Its Classification

In [35]:
examples_per_type = 2000
num_features = 16384
standard_height, standard_width = 512, 512
types = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
image_data_column_list = [i for i in range(num_features)]

In [36]:
def get_valid_normal_images():
    return [file_name[:-4] for file_name in os.listdir('normal/max_contrast_window') if exists(f'normal/max_contrast_window/{file_name}')]

In [4]:
def get_malformed_images():
    images = []
    directories = ['normal/max_contrast_window',
                   'subdural/max_contrast_window', 
                   'multi/max_contrast_window', 
                   'epidural/max_contrast_window', 
                   'intraparenchymal/max_contrast_window', 
                   'intraventricular/max_contrast_window', 
                   'subarachnoid/max_contrast_window']
    for directory in directories:
        for filename in os.listdir(directory):
            im = Image.open(f'{directory}/{filename}')
            width, height = im.size
            if width != standard_width and height != standard_height:
                images.append(filename[:-4])
    return images

In [5]:
malformed_images = get_malformed_images() 
print(len(malformed_images))

87


In [6]:
valid_normal_images = get_valid_normal_images()
print(len(valid_normal_images))

8599


In [37]:
labels = pd.read_csv('segmentations/hemorrhage-labels.csv')
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
752798,ID_ffff82e46,0,0,0,0,0,0
752799,ID_ffff922b9,1,0,0,1,0,0
752800,ID_ffffb670a,1,0,0,0,1,0
752801,ID_ffffcbff8,0,0,0,0,0,0


In [38]:
labels = labels[labels.Image.isin(valid_normal_images) | labels['any'] == 1]
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
752755,ID_fffc60817,1,0,1,1,0,0
752769,ID_fffd00949,1,0,0,0,1,0
752783,ID_fffe2edb8,1,0,1,1,0,0
752799,ID_ffff922b9,1,0,0,1,0,0


In [39]:
labels = labels[~labels.Image.isin(malformed_images)]
labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
752755,ID_fffc60817,1,0,1,1,0,0
752769,ID_fffd00949,1,0,0,0,1,0
752783,ID_fffe2edb8,1,0,1,1,0,0
752799,ID_ffff922b9,1,0,0,1,0,0


In [40]:
normal_labels = labels[labels['any'] == 0].iloc[:examples_per_type]
normal_labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2337,ID_00c96bbe3,0,0,0,0,0,0
2338,ID_00c970892,0,0,0,0,0,0
2339,ID_00c9ed7c0,0,0,0,0,0,0
2340,ID_00c9f0494,0,0,0,0,0,0


In [41]:
hemorrhage_labels = dict()
for hemorrhage in types:
    hemorrhage_labels[hemorrhage] = labels[labels[hemorrhage] == 1].iloc[:examples_per_type]
hemorrhage_labels['epidural']

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
164,ID_000edbf38,1,1,0,0,0,0
271,ID_00178eb80,1,1,0,0,0,0
320,ID_001bb2c00,1,1,0,0,0,0
443,ID_0026de01c,1,1,0,0,0,0
828,ID_004966d37,1,1,1,1,0,0
...,...,...,...,...,...,...,...
475467,ID_a145c6844,1,1,0,0,0,0
475509,ID_a148f85a5,1,1,0,0,1,0
475656,ID_a1569b021,1,1,0,0,0,0
475826,ID_a1665e294,1,1,0,0,0,0


In [42]:
final_labels = normal_labels
for hemorrhage in types:
    final_labels = final_labels.append(hemorrhage_labels[hemorrhage])
final_labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
31265,ID_0a996b0e3,1,0,0,0,0,1
31286,ID_0a9ad2967,1,0,0,0,0,1
31307,ID_0a9ce6304,1,0,0,0,0,1
31315,ID_0a9d99e97,1,0,0,0,0,1


In [43]:
final_labels = final_labels.drop_duplicates().reset_index(drop=True)
final_labels

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0
...,...,...,...,...,...,...,...
9934,ID_0a996b0e3,1,0,0,0,0,1
9935,ID_0a9ad2967,1,0,0,0,0,1
9936,ID_0a9ce6304,1,0,0,0,0,1
9937,ID_0a9d99e97,1,0,0,0,0,1


In [44]:
def get_downsampled_images():
    return final_labels.apply(lambda row: get_downsampled_data(row), axis=1, result_type='expand')
def get_downsampled_data(row):
    if int(row.name) % 100 == 0:
        print(row.name)
    if row[1] == 0:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'normal/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[2] + row[3] + row[4] + row[5] + row[6] > 1:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'multi/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[2] == 1:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'epidural/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[3] == 1:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'intraparenchymal/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[4] == 1:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'intraventricular/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[5] == 1:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'subarachnoid/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    elif row[6] == 1:
        temp = downscale_local_mean(color.rgb2gray(plt.imread(f'subdural/max_contrast_window/{row[0]}.jpg')), (4,4)).flatten()
    else:
        temp = np.zeros(num_features)
    if temp.size != num_features:
        temp = np.zeros(num_features)
    return temp

In [45]:
image_data = get_downsampled_images()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900


In [46]:
image_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.140331,0.167292,0.140821,0.123174,0.133713,0.145723,0.145968,0.135919,0.152341,0.141802,...,0.194869,0.206861,0.218407,0.202475,0.185635,0.180681,0.187481,0.203468,0.198811,0.192507
4,0.025176,0.023792,0.022725,0.013271,0.013980,0.014715,0.011721,0.007694,0.007085,0.005036,...,0.618105,0.606711,0.590844,0.585452,0.575893,0.562903,0.551240,0.529671,0.521828,0.516681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9934,0.048842,0.027571,0.048842,0.042767,0.043591,0.042522,0.033401,0.052905,0.035362,0.039581,...,0.526937,0.486986,0.499191,0.505363,0.491521,0.433724,0.428508,0.455707,0.456685,0.491380
9935,0.063724,0.062885,0.062045,0.058859,0.062111,0.082350,0.057402,0.077010,0.050049,0.060150,...,0.457435,0.482435,0.577657,0.699713,0.802157,0.797973,0.694015,0.554718,0.470573,0.449520
9936,0.083014,0.098946,0.081544,0.094516,0.073906,0.079735,0.077267,0.083657,0.068706,0.080716,...,0.112695,0.115716,0.138122,0.097190,0.128808,0.092533,0.096596,0.116478,0.153357,0.233834
9937,0.043993,0.046443,0.043852,0.046830,0.045448,0.045218,0.036780,0.035013,0.045359,0.036097,...,0.050531,0.063522,0.077296,0.072884,0.105973,0.068227,0.075580,0.087100,0.067982,0.058424


In [47]:
total_data = pd.concat([final_labels, image_data], axis=1)

In [48]:
total_data = total_data[(total_data[image_data_column_list].sum(axis=1) != 0)]
total_data

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,0,1,2,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,ID_000012eaf,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ID_000039fa0,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ID_00005679d,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ID_00008ce3c,0,0,0,0,0,0,0.140331,0.167292,0.140821,...,0.194869,0.206861,0.218407,0.202475,0.185635,0.180681,0.187481,0.203468,0.198811,0.192507
4,ID_0000950d7,0,0,0,0,0,0,0.025176,0.023792,0.022725,...,0.618105,0.606711,0.590844,0.585452,0.575893,0.562903,0.551240,0.529671,0.521828,0.516681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9934,ID_0a996b0e3,1,0,0,0,0,1,0.048842,0.027571,0.048842,...,0.526937,0.486986,0.499191,0.505363,0.491521,0.433724,0.428508,0.455707,0.456685,0.491380
9935,ID_0a9ad2967,1,0,0,0,0,1,0.063724,0.062885,0.062045,...,0.457435,0.482435,0.577657,0.699713,0.802157,0.797973,0.694015,0.554718,0.470573,0.449520
9936,ID_0a9ce6304,1,0,0,0,0,1,0.083014,0.098946,0.081544,...,0.112695,0.115716,0.138122,0.097190,0.128808,0.092533,0.096596,0.116478,0.153357,0.233834
9937,ID_0a9d99e97,1,0,0,0,0,1,0.043993,0.046443,0.043852,...,0.050531,0.063522,0.077296,0.072884,0.105973,0.068227,0.075580,0.087100,0.067982,0.058424


In [49]:
# the line of code below loads the 'all labels' field in the csv file in json
# coords_lists = json.loads(row[4].replace("'[", "[").replace("]'", "],").replace(',]', ']'))

Logistic Regression Model

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [51]:
total_data

Unnamed: 0,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,0,1,2,...,16374,16375,16376,16377,16378,16379,16380,16381,16382,16383
0,ID_000012eaf,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ID_000039fa0,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ID_00005679d,0,0,0,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ID_00008ce3c,0,0,0,0,0,0,0.140331,0.167292,0.140821,...,0.194869,0.206861,0.218407,0.202475,0.185635,0.180681,0.187481,0.203468,0.198811,0.192507
4,ID_0000950d7,0,0,0,0,0,0,0.025176,0.023792,0.022725,...,0.618105,0.606711,0.590844,0.585452,0.575893,0.562903,0.551240,0.529671,0.521828,0.516681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9934,ID_0a996b0e3,1,0,0,0,0,1,0.048842,0.027571,0.048842,...,0.526937,0.486986,0.499191,0.505363,0.491521,0.433724,0.428508,0.455707,0.456685,0.491380
9935,ID_0a9ad2967,1,0,0,0,0,1,0.063724,0.062885,0.062045,...,0.457435,0.482435,0.577657,0.699713,0.802157,0.797973,0.694015,0.554718,0.470573,0.449520
9936,ID_0a9ce6304,1,0,0,0,0,1,0.083014,0.098946,0.081544,...,0.112695,0.115716,0.138122,0.097190,0.128808,0.092533,0.096596,0.116478,0.153357,0.233834
9937,ID_0a9d99e97,1,0,0,0,0,1,0.043993,0.046443,0.043852,...,0.050531,0.063522,0.077296,0.072884,0.105973,0.068227,0.075580,0.087100,0.067982,0.058424


Logistic Regression Models

Any Hemorrhage

In [52]:
X_train, X_test, y_train, y_test = train_test_split(total_data[image_data_column_list], total_data['any'], test_size=0.2, random_state=0)
model = LogisticRegression(random_state=0).fit(X_train, y_train)
print(f'Score: {model.score(X_test, y_test)}')
y_predict = model.predict(X_test)
results = confusion_matrix(y_test, y_predict)
print(f'Precision: {results[1][1]/(results[1][1] + results[0][1])}')
print(f'Recall: {results[1][1]/(results[1][1] + results[1][0])}')
print(results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.8254527162977867
Precision: 0.870023419203747
Recall: 0.9224084419615146
[[ 155  222]
 [ 125 1486]]


Specific Hemorrhages

In [53]:
for hemorrhage in types:
    print(f'Building classifier for type {hemorrhage}')
    X_train, X_test, y_train, y_test = train_test_split(total_data[image_data_column_list], total_data[hemorrhage], test_size=0.2, random_state=0)
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    print(f'Score: {model.score(X_test, y_test)}')
    y_predict = model.predict(X_test)
    results = confusion_matrix(y_test, y_predict)
    print(f'Precision: {results[1][1]/(results[1][1] + results[0][1])}')
    print(f'Recall: {results[1][1]/(results[1][1] + results[1][0])}')
    print(results)

Building classifier for type epidural


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.7862173038229376
Precision: 0.4680232558139535
Recall: 0.39950372208436724
[[1402  183]
 [ 242  161]]
Building classifier for type intraparenchymal


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.7137826961770624
Precision: 0.40114613180515757
Recall: 0.28
[[1279  209]
 [ 360  140]]
Building classifier for type intraventricular


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.7676056338028169
Precision: 0.4454828660436137
Recall: 0.33489461358313816
[[1383  178]
 [ 284  143]]
Building classifier for type subarachnoid


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.6971830985915493
Precision: 0.38823529411764707
Recall: 0.2509505703422053
[[1254  208]
 [ 394  132]]
Building classifier for type subdural


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.6891348088531187
Precision: 0.4005102040816326
Recall: 0.29074074074074074
[[1213  235]
 [ 383  157]]


Things to Try: different number of examples, different picture types (max_contrast vs other types), RGB vs grayscale, all pixels vs downsampling, different solvers for logistic regression, different number of iterations