In [None]:
# Helpfull code from https://www.kaggle.com/code/abhijitsingh001/predicting-gender-of-images/notebook

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.model_selection import train_test_split

import keras
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,BatchNormalization
from keras.callbacks import EarlyStopping,LearningRateScheduler,ReduceLROnPlateau
from keras.optimizers import SGD,RMSprop
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

from keras.models import Sequential
warnings.filterwarnings('ignore')

from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Dropout

In [2]:
# Converting the pixel data
def value_to_image(pixels):
    pixels = np.array(pixels.split(),'float64')
    pixels = np.reshape(pixels,(48,48))
    pixels = pixels / 255.0
    return pixels

In [3]:
def categorical_accuracies(y_true, y_pred):
  # sanity check
  if len(y_true) != len(y_pred):
    print("y_true and y_pred are of different lengths")
    return

  correctness_tracker = {0:0, 1:0,2:0,3:0,4:0}
  sum_tracker = {0:0, 1:0,2:0,3:0,4:0}
  for i in range(0,len(y_true)):
    sum_tracker[y_true[i]] +=1
    if y_true[i] == y_pred[i]:
      correctness_tracker[y_true[i]] += 1

  accuracy = {}
  for key in range(0,5):
    accuracy[key] = correctness_tracker[key] / sum_tracker[key] if sum_tracker[key] >0 else 0
  return accuracy

def count_unique_values(array):
  counter = {0:0, 1:0,2:0,3:0,4:0}
  for element in array:
    counter[element] +=1

  return counter

In [5]:
'''
Data: a dictionary of training and testing
  Training: a pandas table of the original dataset
  Testing: a pandas table of the original dataset
'''

# Changing dimensions of the data
def change_image_dimension(data):
    data = np.reshape(data.to_list(),(len(data),48,48,1))
    return data

def make_model(data):
    training = data["training"]
    testing = data['testing']
    title = data['title']
    train_img = training['pixels'].apply(value_to_image)
    test_img = testing['pixels'].apply(value_to_image)

    train_cancer = training['cancer'] # 1 = there is a tumor, 0 = no tumor
    test_cancer = testing['cancer']

    train_img = change_image_dimension(train_img)
    test_img = change_image_dimension(test_img)

    train_img = train_img/255.0
    test_img = test_img/255.0

    # CNN
    model=Sequential()
    model.add(Conv2D(256,(3,3),activation='relu',padding='same',input_shape=(48,48,1)))
    model.add(MaxPool2D(2,2))
    model.add(BatchNormalization())
    model.add(Conv2D(64,(3,3),activation='relu',padding='same'))
    model.add(MaxPool2D(2,2))
    model.add(BatchNormalization())
    model.add(Conv2D(32,(3,3),activation='relu',padding='same'))
    model.add(MaxPool2D(2,2))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(512,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(5, activation='softmax')) # Use softmax as one of the activations, crossentropy for loss

    # Convert labels to categorical
    train_cancer_cat = to_categorical(train_cancer, num_classes=2) # 2 classes for "1" and "0"
    test_cancer_cat = to_categorical(test_cancer, num_classes=2)

    # "accuracy is training accuracy"
    model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
    r2 = model.fit(train_img, train_cancer_cat, epochs=7)

    model.summary()

    # Plotting losses
    plt.close()
    plt.plot(r2.history['loss'], label='loss for'+ title)
    plt.legend(loc="upper left")
    plt.title('Loss over epoch for' + title )
    plt.show()
    plt.close()

    # Plotting accuracy
    plt.plot(r2.history['accuracy'], label='accuracy for '+ title)
    plt.legend(loc="upper left")
    plt.title('Accuracy over epoch' + title)
    plt.show()

    y_pred = model.predict(test_img)
    y_pred = np.argmax(y_pred, axis=1)

    cm = confusion_matrix(np.array(testing['cancer']), y_pred)

    # Plotting the Confusion Matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
      xticklabels=['Tumor present', 'No tumor present'],
      yticklabels=['Tumor present', 'No tumor present'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix for {title}')
    plt.show()

    print(classification_report(y_true=np.array(testing['cancer']), y_pred=y_pred))
    print(categorical_accuracies(y_true=np.array(testing['cancer']), y_pred=y_pred))

    print(count_unique_values(np.array(testing['cancer'])))
    print(count_unique_values(y_pred))

    print('training accuracies')
    print(r2.history["accuracy"][-1])
    y_true = np.array(training['cancer'])
    y_pred = model.predict(train_img)
    y_pred = np.argmax(y_pred, axis=1)
    print(classification_report(y_true=y_true, y_pred=y_pred))

In [9]:
import os
import subprocess
!find . -name ".DS_Store" -delete

# TODO: Need to make a for-loop that goes through every folder in images and annotations. This is 
# because the way the code works, it only goes through one folder at a time, so we need a loop
# that calls big_helper.py multiple times. I don't care about runtime anymore...

# NOTE: There will be Not-Found errors because not all the image files are downloaded even though all
# the annotations are downloaded.

# # Some of the stencil code uses depricated code, but it is still needed
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

for folder in os.listdir("dl_data/annotations"):
    dicom_path = f"dl_data/images/Lung_Dx-{folder}"
    annotation_path = f"dl_data/annotations/{folder}" 

    # Need to make this if-statement dynamic
    if folder != "A0001" or folder != "A0002" or folder != "A0003" or folder != "A0004":
        continue
    else:
        print(folder)
    
    command = [
        "python", "preprocess.py", "--dicom-mode", "CT",
        "--dicom-path", dicom_path,
        "--annotation-path", annotation_path,
        "--classfile", "category.txt"
    ]
    
    # Execute the command
    subprocess.run(command)

#!python preprocess.py --dicom-mode CT --dicom-path dl_data/images/Lung_Dx-A0001 --annotation-path dl_data/annotations/A0001 --classfile category.txt

A0174
A0180
E0001
G0062
A0187
A0173
G0054
A0145
A0189
A0142
G0053
A0116
G0007
A0129
G0038
A0111
G0036
A0127
G0009
A0118
A0120
G0031
A0188
G0052
A0143
A0144
G0055
A0181
A0175
G0030
A0121
A0126
G0037
A0119
G0008
A0110
G0001
G0006
A0117
G0039
A0128
A0065
A0257
A0091
A0096
A0250
A0062
A0259
A0266
A0054
A0098
A0053
A0261
A0235
A0007
A0038
B0037
A0232
B0008
B0001
A0036
A0204
A0009
A0203
A0031
B0006
A0099
A0260
A0052
A0258
A0055
A0063
A0097
A0090
A0256
A0064
B0007
A0030
A0202
B0038
A0205
A0037
A0008
A0233
A0001
B0036
B0009
B0031
A0006
A0234
A0039
A0229
B0013
A0216
A0024
A0023
A0211
B0014
A0218
A0015
A0227
B0022
B0025
A0220
A0012
A0046
A0041
A0077
B0040
A0083
A0048
A0084
A0070
A0242
A0013
A0221
B0024
B0023
A0226
A0014
B0015
A0210
A0022
A0228
A0025
A0217
B0012
A0243
A0071
A0085
B0041
A0082
A0076
A0244
A0049
A0040
A0047
A0078
G0024
A0135
A0132
G0023
A0104
G0015
G0012
A0103
G0046
A0157
A0168
A0150
G0041
A0166
A0192
A0159
G0048
A0195
A0161
A0102
G0013
G0014
A0105
G0022
A0133
A0134
G0025
A0160
A019

In [None]:
#old code
basecase = {"training": pd.read_csv("/content/drive/MyDrive/S6/dl_final_project/base_training.csv"),
            "testing": pd.read_csv("/content/drive/MyDrive/S6/dl_final_project/base_testing.csv"),
            "title": 'basecase'}

make_model(basecase)