In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.model_selection import train_test_split

import keras
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,BatchNormalization
from keras.callbacks import EarlyStopping,LearningRateScheduler,ReduceLROnPlateau
from keras.optimizers import SGD,RMSprop
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

from keras.models import Sequential
warnings.filterwarnings('ignore')

from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Dropout

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.regularizers import l2

def display_images(desired_shape, images, num_images=6):
    """Show the first ``num_images`` entries of ``images`` as grayscale tiles.

    Args:
        desired_shape: Sequence whose first two items are the height and width
            each image is reshaped to before plotting; any further items
            (e.g. a channel dimension) are ignored.
        images: Indexable collection supporting ``len()`` whose items have a
            ``reshape`` method (e.g. a NumPy array of images).
        num_images: Maximum number of images to draw. Defaults to 6.

    The grid is three columns wide. It keeps the original 2x3 layout (12x6 inch
    figure) for up to six images and adds rows beyond that, so requesting more
    than six images no longer fails in ``plt.subplot``. When fewer images are
    available than requested, only the available ones are drawn instead of
    raising ``IndexError``.
    """
    n_cols = 3
    # Never index past the end of ``images``; a negative request draws nothing.
    count = max(0, min(num_images, len(images)))
    # Ceiling division without importing math; at least two rows so the default
    # layout is unchanged.
    n_rows = max(2, -(-count // n_cols))

    plt.figure(figsize=(12, 3 * n_rows))
    height, width = desired_shape[0], desired_shape[1]
    for idx in range(count):
        ax = plt.subplot(n_rows, n_cols, idx + 1)
        ax.imshow(images[idx].reshape(height, width), cmap='gray')
        ax.axis('off')
    plt.tight_layout()
    plt.show()

def process_pixel_data(pixel_data, desired_shape):
    """Parse whitespace-separated pixel strings into one stacked float array.

    Args:
        pixel_data: Iterable of strings, each holding the pixel values of one
            image separated by whitespace.
        desired_shape: Shape every parsed image is reshaped to; the number of
            values in each string must match it.

    Returns:
        A NumPy array containing one reshaped float image per input string.
    """
    reshaped_images = []
    for row in pixel_data:
        values = np.array(list(map(float, row.split())))
        reshaped_images.append(values.reshape(desired_shape))
    return np.array(reshaped_images)

def process_labels(labels):
    """Encode labels as integer class indices.

    Each distinct label is assigned its position in the sorted output of
    ``np.unique(labels)``.

    Args:
        labels: Iterable of labels accepted by ``np.unique``.

    Returns:
        A tuple ``(encoded, label_to_index)``: ``encoded`` is a NumPy array with
        the index of every input label, in input order, and ``label_to_index``
        is the dict mapping each distinct label to its index.
    """
    distinct = np.unique(labels)
    label_to_index = dict(zip(distinct, range(len(distinct))))
    encoded = np.array([label_to_index[value] for value in labels])
    return encoded, label_to_index

def _fit_and_report(model, optimizer, train_x, train_y, val_x, val_y, epochs):
    """Compile ``model``, train it, print its final accuracies and return the fit history."""
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=epochs)

    # Evaluation and reporting (values of the last epoch)
    print("Training Accuracy:", history.history['accuracy'][-1])
    print("Validation Accuracy:", history.history['val_accuracy'][-1])

    # Evaluation
    print("Training accuracies:")
    print(history.history["accuracy"][-1])
    return history


def make_model(pixel_data, labels, epochs=30, test_size=0.2, random_state=42):
    """Train a CNN and an MLP classifier on the same image data and report accuracy.

    The pixel strings are parsed into 128x128x1 images, the first few are
    displayed, the labels are integer-encoded and one-hot encoded, and the data
    is split into a training part and a held-out part. Both models are then
    trained with categorical cross-entropy, using the held-out part as
    validation data, and their last-epoch accuracies are printed.

    Args:
        pixel_data: Iterable of whitespace-separated pixel strings, one per
            image; each must hold 128 * 128 values.
        labels: Iterable of class labels aligned with ``pixel_data``.
        epochs: Number of training epochs for each model. Defaults to 30.
        test_size: Fraction of the data held out by ``train_test_split``.
            Defaults to 0.2.
        random_state: Seed for the train/held-out split. Defaults to 42.

    Returns:
        A dict with the keys ``"cnn_model"``, ``"cnn_history"``,
        ``"mlp_model"``, ``"mlp_history"`` and ``"label_map"`` so the trained
        models can be evaluated or reused after the call. In a notebook, end
        the calling line with ``;`` or assign the result to avoid echoing it.
    """
    # Process pixel data
    input_shape = (128, 128, 1) # Corresponds to target_size in preprocess.py, but with an added dimension
    processed_images = process_pixel_data(pixel_data, desired_shape=input_shape)

    display_images(input_shape, processed_images)

    # Process labels: integer-encode, then one-hot encode for the softmax output
    processed_labels, label_map = process_labels(labels)
    processed_labels = to_categorical(processed_labels)
    num_classes = len(label_map)

    # Splitting data into training and testing sets.
    # The held-out split is also passed to fit() as validation data, so the
    # printed "Validation Accuracy" is measured on this same split.
    train_img, test_img, train_cancer_cat, test_cancer_cat = train_test_split(
        processed_images, processed_labels,
        test_size=test_size, random_state=random_state, stratify=None)

    # L2 penalty applied to the CNN's regularized conv and dense kernels
    lambda_val = 0.01

    # CNN Model
    model = Sequential([
        Conv2D(256, (3,3), activation='relu', padding='same', input_shape=input_shape, kernel_regularizer=l2(lambda_val)),
        MaxPool2D(2,2),
        BatchNormalization(),
        Conv2D(64, (3,3), activation='relu', padding='same', kernel_regularizer=l2(lambda_val)),
        MaxPool2D(2,2),
        BatchNormalization(),
        Conv2D(32, (3,3), activation='relu', padding='same', kernel_regularizer=l2(lambda_val)),
        MaxPool2D(2,2),
        BatchNormalization(),
        Flatten(),
        Dense(256, activation='relu', kernel_regularizer=l2(lambda_val)),
        Dense(64, activation='relu', kernel_regularizer=l2(lambda_val)),
        Dense(num_classes, activation='softmax')
    ])

    # NOTE(review): the CNN uses the legacy Adam optimizer while the MLP below
    # uses keras.optimizers.Adam. Presumably deliberate; confirm, and check that
    # `keras.optimizers.legacy` exists in the installed Keras version.
    history = _fit_and_report(
        model,
        keras.optimizers.legacy.Adam(learning_rate=0.001),
        train_img, train_cancer_cat,
        test_img, test_cancer_cat,
        epochs,
    )

    # MLP Model
    mlp_model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    mlp_history = _fit_and_report(
        mlp_model,
        Adam(learning_rate=0.0001),
        train_img, train_cancer_cat,
        test_img, test_cancer_cat,
        epochs,
    )

    return {
        "cnn_model": model,
        "cnn_history": history,
        "mlp_model": mlp_model,
        "mlp_history": mlp_history,
        "label_map": label_map,
    }


In [4]:
import os
import subprocess
!find . -name ".DS_Store" -delete

# TODO: Need to make a for-loop that goes through every folder in images and annotations. This is 
# because the way the code works, it only goes through one folder at a time, so we need a loop
# that calls big_helper.py multiple times. I don't care about runtime anymore...

# NOTE: There will be Not-Found errors because not all the image files are downloaded even though all
# the annotations are downloaded.

# # Some of the stencil code uses deprecated code, but it is still needed
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

# This code block basically preprocesses the image data and puts it into a CSV file
# Folders to preprocess. Defined once, before the loop, instead of being rebuilt
# on every iteration.
#
# # Initial folder_list below
# folder_list = ['A0001', 'A0002', 'A0003', 'A0004', 'A0005', 'B0004', 'B0005', 'E0001', 'E0002', 'G0004', 'G0005']
#
# # Full folder_list below
# folder_list = ['A0001', 'A0002', 'A0003', 'A0004', 'A0005', 'A0202', 'B0004', 'B0005', 'B0011', 'B0012'
#                , 'B0013', 'B0014', 'B0015', 'B0016', 'B0017', 'B0018', 'B0019', 'B0020', 'B0021', 'B0022'
#                 , 'B0023', 'B0024', 'B0025', 'E0001', 'E0002', 'E0003', 'E0004', 'E0005', 'G0004', 'G0005'
#                 , 'G0010', 'G0011', 'G0012', 'G0013', 'G0014', 'G0015', 'G0016', 'G0017', 'G0018', 'G0019'
#                 , 'G0020']

# ~600mb folder_list
# The comma after 'B0012' is required: without it Python concatenates the two
# adjacent string literals into 'B0012E0001', and both folders are skipped.
folder_list = [
    'A0001', 'A0002', 'A0003', 'A0004', 'A0005',
    'B0004', 'B0005', 'B0011', 'B0012',
    'E0001', 'E0002', 'E0003', 'E0004',
    'G0004', 'G0005', 'G0010', 'G0011', 'G0012',
]

for folder in os.listdir("dl_data/annotations"):
    # Only process the selected subset of annotation folders.
    if folder not in folder_list:
        continue
    print(folder)

    dicom_path = f"dl_data/images/Lung_Dx-{folder}"
    annotation_path = f"dl_data/annotations/{folder}"

    command = [
        "python", "preprocess.py", "--dicom-mode", "CT",
        "--dicom-path", dicom_path,
        "--annotation-path", annotation_path,
        "--classfile", "category.txt"
    ]

    # Execute the command. Failures are reported but do not stop the loop,
    # because some image folders are expected to be missing.
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f"preprocess.py exited with code {result.returncode} for {folder}")

A0001
Folder/File Found
G0012
Folder/File Found
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
G0004
Folder/File Found
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key error
Possible key err

In [7]:
data = pd.read_csv('output.csv')
make_model(data['pixel_data'], data['cancer_type'])



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Accuracy: 0.9877049326896667
Validation Accuracy: 0.9836065769195557
Training accuracies:
0.9877049326896667
