# Classify Images of Cancer


In [16]:
!pip install keras-tuner

import pandas as pd
import os

# suppress informational messages from Tensorflow
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# ML 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

# Modelling
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from kerastuner.tuners import RandomSearch


Collecting keras-tuner
  Using cached keras_tuner-1.3.5-py3-none-any.whl (176 kB)
Collecting kt-legacy
  Using cached kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


  from kerastuner.tuners import RandomSearch


In [6]:
# Define the base directory for the project
base_dir = '../Image_classification_data'

# Define the file paths for the CSV files
main_data_csv = os.path.join(base_dir, 'data_labels_mainData.csv')
extra_data_csv = os.path.join(base_dir, 'data_labels_extraData.csv')

# Read the main data CSV file
main_data = pd.read_csv(main_data_csv)

# Read the extra data CSV file
extra_data = pd.read_csv(extra_data_csv)

# Define the patch images folder
patch_images_folder = os.path.join(base_dir, 'patch_images')

# Map the ImageName column to the image file paths in the 'patch_images' folder
main_data['ImageFilePath'] = main_data['ImageName'].apply(lambda x: os.path.join(patch_images_folder, x))
extra_data['ImageFilePath'] = extra_data['ImageName'].apply(lambda x: os.path.join(patch_images_folder, x))

# # Display the main data DataFrame
# print(main_data.head())

# # Display the extra data DataFrame
# print(extra_data.head())

# the first 3 rows of the main data DataFrame
main_data.head(3)

Unnamed: 0,InstanceID,patientID,ImageName,cellTypeName,cellType,isCancerous,ImageFilePath
0,22405,1,22405.png,fibroblast,0,0,../Image_classification_data/patch_images/2240...
1,22406,1,22406.png,fibroblast,0,0,../Image_classification_data/patch_images/2240...
2,22407,1,22407.png,fibroblast,0,0,../Image_classification_data/patch_images/2240...


+ splits the dataset into training, validation, and test sets using the patientID information. 
+ defines a function load_and_preprocess_images to load and normalize pixel values. 
+ creates an ImageDataGenerator for data augmentation.
+ one-hot encodes the cellTypeName and isCancerous labels using the OneHotEncoder from Scikit-learn.

In [7]:
# Split the dataset into training, validation, and test sets
train_data, test_data = train_test_split(main_data, test_size=0.2, stratify=main_data['patientID'], random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, stratify=train_data['patientID'], random_state=42)

# Load images and normalize pixel values
def load_and_preprocess_images(df):
    images = []
    for image_path in df['ImageFilePath']:
        image = load_img(image_path, target_size=(27, 27))
        image = img_to_array(image)
        image = image / 255.0
        images.append(image)
    return np.array(images)

train_images = load_and_preprocess_images(train_data)
val_images = load_and_preprocess_images(val_data)
test_images = load_and_preprocess_images(test_data)

# Perform data augmentation
data_gen = ImageDataGenerator(rotation_range=20,
                              width_shift_range=0.2,
                              height_shift_range=0.2,
                              horizontal_flip=True,
                              vertical_flip=True)

# One-hot encode the cellTypeName and isCancerous labels
encoder = OneHotEncoder(sparse=False)

train_cell_type_labels = encoder.fit_transform(train_data['cellType'].values.reshape(-1, 1))
val_cell_type_labels = encoder.transform(val_data['cellType'].values.reshape(-1, 1))
test_cell_type_labels = encoder.transform(test_data['cellType'].values.reshape(-1, 1))

train_is_cancerous_labels = encoder.fit_transform(train_data['isCancerous'].values.reshape(-1, 1))
val_is_cancerous_labels = encoder.transform(val_data['isCancerous'].values.reshape(-1, 1))
test_is_cancerous_labels = encoder.transform(test_data['isCancerous'].values.reshape(-1, 1))


2023-04-24 15:05:46.052894: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


+ creates two separate models for isCancerous and cell-type using the preprocessed data. Both models use convolutional neural networks (CNNs) with a similar architecture. The models are compiled using the 'adam' optimizer and 'categorical_crossentropy' loss function.

+ The models are then trained using the preprocessed data and the data generator for data augmentation. 

+ The training is done for 50 epochs, and the results are stored in is_cancerous_history and cell_type_history variables.

In [10]:
# Model for isCancerous classification
def create_is_cancerous_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(27, 27, 3)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    return model

is_cancerous_model = create_is_cancerous_model()
is_cancerous_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model for cell-type classification
def create_cell_type_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(27, 27, 3)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation='softmax'))
    return model

cell_type_model = create_cell_type_model()
cell_type_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the models using the preprocessed data
is_cancerous_history = is_cancerous_model.fit(data_gen.flow(train_images, train_is_cancerous_labels, batch_size=32),
                                              validation_data=(val_images, val_is_cancerous_labels),
                                              epochs=50, verbose=2)

cell_type_history = cell_type_model.fit(data_gen.flow(train_images, train_cell_type_labels, batch_size=32),
                                        validation_data=(val_images, val_cell_type_labels),
                                        epochs=50, verbose=2)


Epoch 1/50
186/186 - 4s - loss: 0.5412 - accuracy: 0.7145 - val_loss: 0.3884 - val_accuracy: 0.8504 - 4s/epoch - 20ms/step
Epoch 2/50
186/186 - 3s - loss: 0.3857 - accuracy: 0.8385 - val_loss: 0.3641 - val_accuracy: 0.8363 - 3s/epoch - 14ms/step
Epoch 3/50
186/186 - 3s - loss: 0.3719 - accuracy: 0.8466 - val_loss: 0.3180 - val_accuracy: 0.8711 - 3s/epoch - 14ms/step
Epoch 4/50
186/186 - 3s - loss: 0.3447 - accuracy: 0.8496 - val_loss: 0.2787 - val_accuracy: 0.8813 - 3s/epoch - 14ms/step
Epoch 5/50
186/186 - 3s - loss: 0.3459 - accuracy: 0.8557 - val_loss: 0.3243 - val_accuracy: 0.8580 - 3s/epoch - 14ms/step
Epoch 6/50
186/186 - 3s - loss: 0.3253 - accuracy: 0.8686 - val_loss: 0.3637 - val_accuracy: 0.8449 - 3s/epoch - 14ms/step
Epoch 7/50
186/186 - 3s - loss: 0.3287 - accuracy: 0.8543 - val_loss: 0.3005 - val_accuracy: 0.8807 - 3s/epoch - 14ms/step
Epoch 8/50
186/186 - 3s - loss: 0.3393 - accuracy: 0.8577 - val_loss: 0.3248 - val_accuracy: 0.8727 - 3s/epoch - 15ms/step
Epoch 9/50
186/1

+ evaluate the models using accuracy, precision, recall, and F1 score on the validation set.

+ perform hyperparameter tuning using Keras Tuner to optimize the models' performance.

In [17]:
# Function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1

# Evaluate the models on the validation set
val_is_cancerous_pred = np.argmax(is_cancerous_model.predict(val_images), axis=-1)
val_cell_type_pred = np.argmax(cell_type_model.predict(val_images), axis=-1)

val_is_cancerous_true = np.argmax(val_is_cancerous_labels, axis=-1)
val_cell_type_true = np.argmax(val_cell_type_labels, axis=-1)

is_cancerous_metrics = calculate_metrics(val_is_cancerous_true, val_is_cancerous_pred)
cell_type_metrics = calculate_metrics(val_cell_type_true, val_cell_type_pred)

print("IsCancerous Model: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(*is_cancerous_metrics))
print("Cell Type Model: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(*cell_type_metrics))

# Perform hyperparameter tuning
def build_hypermodel(hp):
    model = Sequential()
    model.add(Conv2D(filters=hp.Int('filters_1', min_value=16, max_value=128, step=16),
                     kernel_size=hp.Choice('kernel_size_1', values=[3, 5]),
                     activation='relu',
                     input_shape=(27, 27, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(filters=hp.Int('filters_2', min_value=16, max_value=128, step=16),
                     kernel_size=hp.Choice('kernel_size_2', values=[3, 5]),
                     activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=128, step=16), activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(2, activation='softmax'))  # Change this to 4 for cell type model
    model.compile(optimizer=tf.keras.optimizers.Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = RandomSearch(
    build_hypermodel,
    objective='val_accuracy',
    max_trials=30,
    executions_per_trial=2,
    directory='tuning',
    project_name='is_cancerous_hyperparameter_tuning')  # Change this to cell_type_hyperparameter_tuning for the cell type model

tuner.search(data_gen.flow(train_images, train_is_cancerous_labels, batch_size=32),  # Change to train_cell_type_labels for the cell type model
             epochs=50,
             validation_data=(val_images, val_is_cancerous_labels),  # Change to val_cell_type_labels for the cell type model
             verbose=2)

# Get the optimal hyperparameters
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build and train the model with the best hyperparameters
best_is_cancerous_model = tuner.hypermodel.build(best_hyperparameters)
best_is_cancerous_model.fit(data_gen.flow(train_images, train_is_cancerous_labels, batch_size=32),
                            validation_data=(val_images, val_is_cancerous_labels),
                            epochs=50, verbose=2)

# Evaluate the optimized model on the validation set
val_is_cancerous_pred = np.argmax(best_is_cancerous_model.predict(val_images), axis=-1)
is_cancerous_metrics = calculate_metrics(val_is_cancerous_true, val_is_cancerous_pred)

print("Optimized IsCancerous Model: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(*is_cancerous_metrics))

# Repeat the same process for the cell-type classification model
tuner = RandomSearch(
    build_hypermodel,
    objective='val_accuracy',
    max_trials=30,
    executions_per_trial=2,
    directory='tuning',
    project_name='cell_type_hyperparameter_tuning')

tuner.search(data_gen.flow(train_images, train_cell_type_labels, batch_size=32),
                epochs=50,
                validation_data=(val_images, val_cell_type_labels),
                verbose=2)
# Get the optimal hyperparameters
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
# Build and train the model with the best hyperparameters
best_cell_type_model = tuner.hypermodel.build(best_hyperparameters)
best_cell_type_model.fit(data_gen.flow(train_images, train_cell_type_labels, batch_size=32),
                            validation_data=(val_images, val_cell_type_labels),
                            epochs=50, verbose=2)
# Evaluate the optimized model on the validation set
val_cell_type_pred = np.argmax(best_cell_type_model.predict(val_images), axis=-1)
cell_type_metrics = calculate_metrics(val_cell_type_true, val_cell_type_pred)

print(f'Optimized Cell Type Model: Accuracy: {cell_type_metrics[0]:.4f}, Precision: {cell_type_metrics[1]:.4f}, Recall: {cell_type_metrics[2]:.4f}, F1-score: {cell_type_metrics[3]:.4f}')




IsCancerous Model: Accuracy: 0.9035, Precision: 0.9037, Recall: 0.9035, F1-score: 0.9031
Cell Type Model: Accuracy: 0.7756, Precision: 0.7722, Recall: 0.7756, F1-score: 0.7676

Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
80                |80                |filters_1
5                 |5                 |kernel_size_1
48                |48                |filters_2
3                 |3                 |kernel_size_2
32                |32                |units
0                 |0                 |dropout_rate
0.00095119        |0.00095119        |learning_rate

Epoch 1/50
186/186 - 4s - loss: 0.5328 - accuracy: 0.7270 - val_loss: 0.4193 - val_accuracy: 0.7933 - 4s/epoch - 21ms/step
Epoch 2/50
186/186 - 3s - loss: 0.4081 - accuracy: 0.8154 - val_loss: 0.3130 - val_accuracy: 0.8661 - 3s/epoch - 15ms/step
Epoch 3/50
186/186 - 3s - loss: 0.3986 - accuracy: 0.8210 - val_loss: 0.4558 - val_accuracy: 0.8514 - 3s/epoch - 15ms/step
Epoch 4/50
186/186 - 3s - l