In [10]:
# Helpfull code from https://www.kaggle.com/code/abhijitsingh001/predicting-gender-of-images/notebook

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.model_selection import train_test_split

import keras
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,BatchNormalization
from keras.callbacks import EarlyStopping,LearningRateScheduler,ReduceLROnPlateau
from keras.optimizers import SGD,RMSprop
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

from keras.models import Sequential
warnings.filterwarnings('ignore')

from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Dropout

In [11]:
# Converting the pixel data
def value_to_image(pixels):
    pixels = np.array(pixels.split(),'float64')
    pixels = np.reshape(pixels,(48,48))
    pixels = pixels / 255.0
    return pixels

In [12]:
def categorical_accuracies(y_true, y_pred):
  # sanity check
  if len(y_true) != len(y_pred):
    print("y_true and y_pred are of different lengths")
    return

  correctness_tracker = {0:0, 1:0,2:0,3:0,4:0}
  sum_tracker = {0:0, 1:0,2:0,3:0,4:0}
  for i in range(0,len(y_true)):
    sum_tracker[y_true[i]] +=1
    if y_true[i] == y_pred[i]:
      correctness_tracker[y_true[i]] += 1

  accuracy = {}
  for key in range(0,5):
    accuracy[key] = correctness_tracker[key] / sum_tracker[key] if sum_tracker[key] >0 else 0
  return accuracy

def count_unique_values(array):
  counter = {0:0, 1:0,2:0,3:0,4:0}
  for element in array:
    counter[element] +=1

  return counter

In [17]:
'''
Data: a dictionary of training and testing
  Training: a pandas table of the original dataset
  Testing: a pandas table of the original dataset
'''

# Changing dimensions of the data
def change_image_dimension(data):
    data = np.reshape(data.to_list(),(len(data),48,48,1))
    return data

def make_model(data):
    training = data["training"]
    testing = data['testing']
    title = data['title']
    train_img = training['Pixel Data'].apply(value_to_image)
    test_img = testing['Pixel Data'].apply(value_to_image)

    train_cancer = training['Cancer Type']
    test_cancer = testing['Cancer Type']

    train_img = change_image_dimension(train_img)
    test_img = change_image_dimension(test_img)

    train_img = train_img/255.0
    test_img = test_img/255.0

    # CNN
    model=Sequential()
    model.add(Conv2D(256,(3,3),activation='relu',padding='same',input_shape=(48,48,1)))
    model.add(MaxPool2D(2,2))
    model.add(BatchNormalization())
    model.add(Conv2D(64,(3,3),activation='relu',padding='same'))
    model.add(MaxPool2D(2,2))
    model.add(BatchNormalization())
    model.add(Conv2D(32,(3,3),activation='relu',padding='same'))
    model.add(MaxPool2D(2,2))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(512,activation='relu'))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(5, activation='softmax')) # Use softmax as one of the activations, crossentropy for loss

    # Convert labels to categorical
    train_cancer_cat = to_categorical(train_cancer, num_classes=2) # 2 classes for "1" and "0"
    test_cancer_cat = to_categorical(test_cancer, num_classes=2)

    # "accuracy is training accuracy"
    model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
    r2 = model.fit(train_img, train_cancer_cat, epochs=7)

    model.summary()

    # Plotting losses
    plt.close()
    plt.plot(r2.history['loss'], label='loss for'+ title)
    plt.legend(loc="upper left")
    plt.title('Loss over epoch for' + title )
    plt.show()
    plt.close()

    # Plotting accuracy
    plt.plot(r2.history['accuracy'], label='accuracy for '+ title)
    plt.legend(loc="upper left")
    plt.title('Accuracy over epoch' + title)
    plt.show()

    y_pred = model.predict(test_img)
    y_pred = np.argmax(y_pred, axis=1)

    cm = confusion_matrix(np.array(testing['Cancer Type']), y_pred)

    # Plotting the Confusion Matrix (TODO: Relabel and redo)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
      xticklabels=['Tumor present', 'No tumor present'],
      yticklabels=['Tumor present', 'No tumor present'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix for {title}')
    plt.show()

    print(classification_report(y_true=np.array(testing['Cancer Type']), y_pred=y_pred))
    print(categorical_accuracies(y_true=np.array(testing['Cancer Type']), y_pred=y_pred))

    print(count_unique_values(np.array(testing['Cancer Type'])))
    print(count_unique_values(y_pred))

    print('training accuracies')
    print(r2.history["accuracy"][-1])
    y_true = np.array(training['Cancer Type'])
    y_pred = model.predict(train_img)
    y_pred = np.argmax(y_pred, axis=1)
    print(classification_report(y_true=y_true, y_pred=y_pred))

In [None]:
import os
import subprocess
!find . -name ".DS_Store" -delete

# TODO: Need to make a for-loop that goes through every folder in images and annotations. This is 
# because the way the code works, it only goes through one folder at a time, so we need a loop
# that calls big_helper.py multiple times. I don't care about runtime anymore...

# NOTE: There will be Not-Found errors because not all the image files are downloaded even though all
# the annotations are downloaded.

# # Some of the stencil code uses depricated code, but it is still needed
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

# This code block basically preprocesses the image data and puts it into a CSV file
for folder in os.listdir("dl_data/annotations"):
    dicom_path = f"dl_data/images/Lung_Dx-{folder}"
    annotation_path = f"dl_data/annotations/{folder}" 

    folder_list = ['A0001', 'A0002', 'A0003', 'A0004', 'A0005']
    
    if folder not in folder_list:
        continue
    else:
        print(folder)
    
    command = [
        "python", "preprocess.py", "--dicom-mode", "CT",
        "--dicom-path", dicom_path,
        "--annotation-path", annotation_path,
        "--classfile", "category.txt"
    ]
    
    # Execute the command
    subprocess.run(command)

In [15]:
# This code block will do a train/test split on the data and export it to respective csv files
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def split(df, export_train_path, export_test_path):
    X = df['Pixel Data']
    y = df['Cancer Type']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=31)

    X_train = pd.DataFrame(X_train)
    X_train['Cancer Type'] = y_train

    X_test = pd.DataFrame(X_test)
    X_test['Cancer Type'] = y_test

    X_train.to_csv(export_train_path, index=False)
    X_test.to_csv(export_test_path, index=False)

data = pd.read_csv('base.csv')
data = data[['File Name', 'Pixel Data', 'Cancer Type']]

split(data, 'base_train.csv', 'base_test.csv')

In [18]:
#old code
basecase = {"training": pd.read_csv("base_train.csv"),
            "testing": pd.read_csv("base_test.csv"),
            "title": 'basecase'}

make_model(basecase)

ValueError: cannot reshape array of size 262144 into shape (48,48)