In [None]:
# Helpful code adapted from https://www.kaggle.com/code/abhijitsingh001/predicting-gender-of-images/notebook

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.model_selection import train_test_split

import keras
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,BatchNormalization
from keras.callbacks import EarlyStopping,LearningRateScheduler,ReduceLROnPlateau
from keras.optimizers import SGD,RMSprop
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

from keras.models import Sequential
warnings.filterwarnings('ignore')

from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Dropout

In [2]:
# Converting the pixel data
def value_to_image(pixels):
    """Turn a whitespace-separated pixel string into a scaled 48x48 image.

    Parameters
    ----------
    pixels : str
        Whitespace-separated numeric pixel values; exactly 48 * 48 of them
        are required, otherwise the reshape raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(48, 48)`` holding every parsed value
        divided by 255.0.
    """
    flat_values = np.asarray(pixels.split(), dtype='float64')
    image = flat_values.reshape((48, 48))
    # Scale by 255.0 (maps 0-255 intensities onto 0-1).
    return image / 255.0

In [3]:
def categorical_accuracies(y_true, y_pred, num_classes=5):
  """Compute the accuracy obtained on each individual class.

  For every class ``k`` in ``range(num_classes)`` this is the fraction of
  samples whose true label is ``k`` that were also predicted as ``k``.

  Args:
    y_true: sequence of ground-truth labels, each in ``range(num_classes)``.
    y_pred: sequence of predicted labels, same length as ``y_true``.
    num_classes: number of classes to report. Defaults to 5, which keeps the
      original fixed set of keys ``0..4``.

  Returns:
    A dict mapping each class to its accuracy; classes that never occur in
    ``y_true`` map to 0. Returns ``None`` (after printing a message) when the
    two sequences have different lengths.

  Raises:
    KeyError: if ``y_true`` contains a label outside ``range(num_classes)``.
  """
  # sanity check
  if len(y_true) != len(y_pred):
    print("y_true and y_pred are of different lengths")
    return None

  correct_per_class = dict.fromkeys(range(num_classes), 0)
  total_per_class = dict.fromkeys(range(num_classes), 0)
  for truth, guess in zip(y_true, y_pred):
    total_per_class[truth] += 1
    if truth == guess:
      correct_per_class[truth] += 1

  # Guard the division: a class absent from y_true reports 0 instead of failing.
  return {
    label: (correct_per_class[label] / total_per_class[label]
            if total_per_class[label] > 0 else 0)
    for label in range(num_classes)
  }

def count_unique_values(array):
  """Tally how many times each of the labels 0-4 appears in ``array``.

  Args:
    array: iterable of labels; every element must be one of 0, 1, 2, 3, 4
      (any other value raises ``KeyError``).

  Returns:
    A dict with the keys 0 through 4 mapping each label to its count.
  """
  tallies = dict.fromkeys(range(5), 0)
  for label in array:
    tallies[label] = tallies[label] + 1
  return tallies

In [5]:
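'''
data: a dictionary with the following keys
  training: a pandas table holding the training split of the original dataset
  testing: a pandas table holding the testing split of the original dataset
  title: a string label for the run, used in plot titles and legends
'''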

# Changing dimensions of the data
def change_image_dimension(data):
    """Stack a series of 48x48 images into one 4-D array.

    Parameters
    ----------
    data : object exposing ``to_list()`` and ``len()`` (e.g. a pandas Series)
        Collection of images, each holding 48 * 48 values.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), 48, 48, 1)``; the trailing axis of size 1
        matches the ``input_shape=(48, 48, 1)`` of the first Conv2D layer.
    """
    n_images = len(data)
    stacked = np.asarray(data.to_list())
    return stacked.reshape((n_images, 48, 48, 1))

def _build_cnn(num_classes=2, learning_rate=0.01):
    """Build and compile the CNN used for 48x48 single-channel images."""
    model = Sequential()
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same', input_shape=(48, 48, 1)))
    model.add(MaxPool2D(2, 2))
    model.add(BatchNormalization())
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPool2D(2, 2))
    model.add(BatchNormalization())
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(MaxPool2D(2, 2))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(64, activation='relu'))
    # The softmax width must equal the width of the one-hot labels; a mismatch
    # makes categorical_crossentropy fail on incompatible shapes.
    model.add(Dense(num_classes, activation='softmax'))

    # The 'accuracy' metric reported during fit() is the training accuracy.
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def _plot_history_curve(values, label, plot_title):
    """Show one per-epoch training curve on its own figure."""
    plt.close()
    plt.plot(values, label=label)
    plt.legend(loc="upper left")
    plt.title(plot_title)
    plt.show()
    plt.close()


def _plot_confusion_matrix(cm, title):
    """Show a 2x2 confusion matrix as an annotated heatmap."""
    # Rows/columns follow labels=[0, 1]: index 0 is "no tumor", index 1 is "tumor".
    class_names = ['No tumor present', 'Tumor present']
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix for {title}')
    plt.show()


def make_model(data, epochs=7, learning_rate=0.01):
    """Train the tumor / no-tumor CNN and report its performance.

    Parameters
    ----------
    data : dict
        Must contain ``'training'`` and ``'testing'`` (tables with a
        ``'pixels'`` column of whitespace-separated pixel strings and a
        ``'cancer'`` column where 1 = tumor, 0 = no tumor) and ``'title'``
        (a string used in plot titles and legends).
    epochs : int, optional
        Number of training epochs (default 7).
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer (default 0.01).

    Returns
    -------
    None
        Results are reported through plots and printed metrics: loss and
        accuracy curves, the test confusion matrix, classification reports for
        the test and training sets, per-class accuracies and label counts.
    """
    training = data["training"]
    testing = data['testing']
    title = data['title']

    # value_to_image already divides by 255.0, so the images must not be
    # rescaled again here (doing so squeezes every pixel into [0, 1/255]).
    train_img = change_image_dimension(training['pixels'].apply(value_to_image))
    test_img = change_image_dimension(testing['pixels'].apply(value_to_image))

    train_labels = np.array(training['cancer'])  # 1 = there is a tumor, 0 = no tumor
    test_labels = np.array(testing['cancer'])

    # One-hot encode the labels: 2 classes for "1" and "0".
    train_cancer_cat = to_categorical(train_labels, num_classes=2)

    model = _build_cnn(num_classes=2, learning_rate=learning_rate)
    r2 = model.fit(train_img, train_cancer_cat, epochs=epochs)

    model.summary()

    _plot_history_curve(r2.history['loss'],
                        'loss for ' + title,
                        'Loss over epoch for ' + title)
    _plot_history_curve(r2.history['accuracy'],
                        'accuracy for ' + title,
                        'Accuracy over epoch for ' + title)

    # Evaluation on the held-out test set.
    y_pred = np.argmax(model.predict(test_img), axis=1)

    # Fix the label order so the matrix is always 2x2 and matches the tick labels.
    cm = confusion_matrix(test_labels, y_pred, labels=[0, 1])
    _plot_confusion_matrix(cm, title)

    print(classification_report(y_true=test_labels, y_pred=y_pred))
    print(categorical_accuracies(y_true=test_labels, y_pred=y_pred))

    print(count_unique_values(test_labels))
    print(count_unique_values(y_pred))

    # Same report on the training set, to compare against the test metrics.
    print('training accuracies')
    print(r2.history["accuracy"][-1])
    train_pred = np.argmax(model.predict(train_img), axis=1)
    print(classification_report(y_true=train_labels, y_pred=train_pred))

In [9]:
import os
import subprocess
# Delete every file named .DS_Store under the current directory.
!find . -name ".DS_Store" -delete

# TODO: Add a for-loop that goes through every folder in images and annotations. The
# script only processes one folder per invocation, so the loop has to call it once per
# folder. Runtime is not a concern here.
# NOTE(review): this TODO and the draft loop below call big_helper.py, while the live
# command at the end of this cell runs preprocess.py - confirm which script is current.

# NOTE: Expect not-found errors: all the annotations are downloaded, but not all of the
# image files are.

# NOTE(review): in the draft below, print(dicom_path) runs before dicom_path is assigned;
# move the print after the assignment before re-enabling the loop.
# for folder in os.listdir("dl_data/annotations"):
#     print(folder)
#     print(dicom_path)
#     dicom_path = f"dl_data/images/Lung_Dx-{folder}"
#     annotation_path = f"dl_data/annotations/{folder}" 
    
#     command = [
#         "python", "big_helper.py", "--dicom-mode", "CT",
#         "--dicom-path", dicom_path,
#         "--annotation-path", annotation_path,
#         "--classfile", "category.txt"
#     ]
    
#     # Execute the command
#     subprocess.run(command)

# Run the preprocessing script on a single folder (A0001).
!python preprocess.py --dicom-mode CT --dicom-path dl_data/images/Lung_Dx-A0001 --annotation-path dl_data/annotations/A0001 --classfile category.txt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
dict_items([('1.3.6.1.4.1.14519.5.2.1.6655.2359.289186849349654450104973032531.xml', array([[293., 318., 348., 384.,   1.,   0.,   0.,   0.]])), ('1.3.6.1.4.1.14519.5.2.1.6655.2359.218387194706247505045205169130.xml', array([[287., 315., 355., 387.,   1.,   0.,   0.,   0.]])), ('1.3.6.1.4.1.14519.5.2.1.6655.2359.191468104323452587789797624004.xml', array([[285., 319., 355., 389.,   1.,   0.,   0.,   0.]])), ('1.3.6.1.4.1.14519.5.2.1.6655.2359.185555685346057091761687385642.xml', array([[298., 298., 335., 374.,   1.,   0.,   0.,   0.]])), ('1.3.6.1.4.1.14519.5.2.1.6655.2359.

In [None]:
# Baseline experiment: train and evaluate the CNN on the base training/testing CSVs.
# NOTE(review): this cell was originally marked "old code" - confirm it is still meant to run.

# Single place to change the dataset location.
# NOTE(review): looks like a Google Drive mount path in Colab - confirm before running elsewhere.
DATA_DIR = "/content/drive/MyDrive/S6/dl_final_project"

basecase = {"training": pd.read_csv(f"{DATA_DIR}/base_training.csv"),
            "testing": pd.read_csv(f"{DATA_DIR}/base_testing.csv"),
            "title": 'basecase'}

make_model(basecase)