-------------------------------------------------------
- Wiley Winters
- MSDS 686 Deep Learning
- Week 7-8 Kaggle Project&nbsp;&mdash;&nbsp;Brain Tumor Classification
- 2025-MAR-
--------------------------------------------------------

### Requirements

----------------------------------------------
**Required for 80%**<br>
Complete project on *kaggle.com* using the skills learned in the <u>Deep Learning</u> class.  The following are required:
- Show/plot sample images or data with labels
- Include at least one of the following
  - Convolution
  - Max Pooling
  - Batch Normalization
  - Dropout
  - LSTM
  - TF-IDF
- Use validation data
- Evaluate model on test data

-------------------------------------------
**Additional for another 20%**
- Use data augmentation
- Use at least one of the following:
  - Kernels
  - Activation functions
  - Loss functions
  - Libraries
  - Methods
- Learning rate optimization
- Functional API model
- Transfer learning with or without trainable parameters
- Confusion matrix and / or ROC plots
- Plots of accuracy/loss vs epochs
- Show/plot sample incorrect prediction with labels and correct label

----------------------------------------------------------------
### Load Libraries and Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, logging, random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# TensorFlow likes to display a lot of debug information
# on my home system
# I will squash the messages
# NOTE: both settings must be in place before `import tensorflow` runs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Logger names are case-sensitive; TensorFlow's Python logger is 'tensorflow'
logging.getLogger('tensorflow').setLevel(logging.FATAL)

# TensorFlow and Keras APIs for convolutional neural networks (CNNs)
import tensorflow as tf
from tensorflow import keras
from keras.applications import Xception
from tensorflow.keras import backend
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Rescaling
from tensorflow.keras.layers import Conv2D, MaxPool2D, AveragePooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall

# Make plots have guidelines
plt.style.use('ggplot')

# Squash Python warnings
import warnings
warnings.filterwarnings('ignore')

**Set Random Seed for Reproducibility**

In [None]:
# Seed every random number generator the notebook relies on: NumPy,
# Python's `random` module, and TensorFlow (weight initialisation, dropout).
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

**Declare Global Variables**

In [None]:
# Define training and testing image directories
home_dir = '/home/wiley'
# All three splits live under one data root; deriving them from a single
# base string keeps the paths consistent with each other.
data_dir = home_dir+'/regis/dataScience/msds686/week7/kaggleProject/images/data'
trn_dir = data_dir+'/training'
tst_dir = data_dir+'/testing'
val_dir = data_dir+'/validation'

# Define classes
classes = ['negative', 'positive']

# Define early_stop callback: stop when validation accuracy has not improved
# for 3 consecutive epochs and restore the best weights seen so far
early_stop = EarlyStopping(monitor='val_accuracy', patience=3,
                           restore_best_weights=True)

# Image size (height, width) and full input shape (height, width, channels)
img_size = (299, 299)
img_shape = img_size + (3,)

# Number of classes
num_classes = len(classes)

### Define Functions

In [None]:
# Plot performance Metrics
def plot_history(history):
    """Plot training and validation curves for loss, accuracy, precision and recall.

    Draws a 2x2 grid of line charts (one panel per metric) with the training
    series in blue and the validation series in red, then shows the figure.

    Args:
        history: object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists (as returned by ``model.fit``).
            It must contain ``loss``, ``accuracy``, ``precision`` and
            ``recall`` together with their ``val_`` counterparts.

    Raises:
        KeyError: if one of the required metrics is missing from the logs.
    """
    logs = history.history

    def resolve(metric):
        # Auto-generated Keras metric names may carry a numeric suffix
        # (e.g. 'precision_1'); accept those as well as the plain name.
        if metric in logs:
            return metric
        prefix = metric + '_'
        for key in logs:
            if key.startswith(prefix) and key[len(prefix):].isdigit():
                return key
        raise KeyError(f'{metric!r} not found in training history')

    epochs = range(1, len(logs['loss']) + 1)
    plt.figure(figsize=(20,12))

    for position, metric in enumerate(('loss', 'accuracy', 'precision', 'recall'),
                                      start=1):
        key = resolve(metric)
        label = metric.capitalize()

        plt.subplot(2, 2, position)
        plt.plot(epochs, logs[key], 'b', label=f'Training {label}')
        plt.plot(epochs, logs['val_' + key], 'r', label=f'Validation {label}')
        plt.title(f'Training and Validation {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.legend()

    plt.suptitle('Model Training Metrics over Epochs', fontsize=16)
    plt.show()

In [None]:
# Print results from test data
def model_evaluate(model, ds):
    """Evaluate ``model`` on ``ds`` and print every returned value.

    Args:
        model: object exposing ``evaluate(ds, verbose=1)``.
        ds: dataset or generator passed straight through to ``evaluate``.

    Returns:
        Whatever ``model.evaluate`` returned, unchanged (a scalar, or a
        list whose first element is the loss followed by the metrics).
    """
    score = model.evaluate(ds, verbose=1)

    # evaluate() yields a bare scalar when the model was compiled without metrics
    values = list(score) if isinstance(score, (list, tuple)) else [score]

    # Use the model's own metric names only when they line up one-to-one
    # with the returned values; otherwise fall back to generic labels.
    names = list(getattr(model, 'metrics_names', None) or [])
    if len(names) != len(values):
        names = ['loss'] + [f'metric_{i}' for i in range(1, len(values))]

    print('-' * 30)
    print('\033[1m'+'Test results:'+'\033[0m')
    for name, value in zip(names, values):
        # The first value is the loss, so label each line by its real name
        print(f'Test {name}: {value:.4f}')
    print('-' * 30)

    return score

### Load Data
The method used to load paths and classes into the dataframes walks the image folders directory by directory.  In other words, the rows end up artificially grouped by brain tumor class.  I added statements to shuffle the rows of the dataframes.

In [None]:
def _build_image_df(root_dir):
    """Return a dataframe of image paths and labels found under ``root_dir``.

    Every immediate sub-directory of ``root_dir`` is treated as one class
    and every entry inside it as one image of that class.  Entries are
    visited in sorted order so the row order (and therefore the seeded
    shuffle below) does not depend on the filesystem's listing order.
    An empty directory yields an empty dataframe with both columns present.
    """
    records = []
    for label in sorted(os.listdir(root_dir)):
        label_dir = os.path.join(root_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for image in sorted(os.listdir(label_dir)):
            records.append((os.path.join(label_dir, image), label))

    return pd.DataFrame(records, columns=['paths', 'classes'])


# Load training and testing data into pandas dataframes for EDA
trn_df = _build_image_df(trn_dir)
tst_df = _build_image_df(tst_dir)

# Shuffle the training and testing dataframes
trn_df = trn_df.sample(frac=1, random_state=42).reset_index(drop=True)
tst_df = tst_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Take a look at the results
print('Training:   \n', trn_df.head(10).to_markdown())
print('Testing:    \n', tst_df.head(10).to_markdown())

### EDA

**Look at Training Images' Distribution**

In [None]:
# Bar chart of the number of training images per class
plt.figure(figsize=(6, 4))
trn_counts = trn_df['classes'].value_counts()
ax = trn_counts.plot.bar()
ax.set_title('Distribution of Image Counts in Training Data')
ax.set_xlabel('Category')
ax.set_ylabel('Image Count')
plt.show()

The distribution is what I would expect in the real world where the majority class would be tumor free.

**Look at Testing Images' Distribution**

In [None]:
# Bar chart of the number of testing images per class
plt.figure(figsize=(6, 4))
tst_counts = tst_df['classes'].value_counts()
ax = tst_counts.plot.bar()
ax.set_title('Distribution of Image Counts in Testing Data')
ax.set_xlabel('Category')
ax.set_ylabel('Image Count')
plt.show()

The distribution mirrors what the *training data* shows, but with fewer images.

**Examine Shape of Training and Testing DataFrames**

In [None]:
# Report the (rows, columns) shape of each dataframe
for heading, frame in (('Training Shape: \n', trn_df),
                       ('Testing Shape:  \n', tst_df)):
    print(heading, frame.shape)

**NOTE:**&nbsp;&nbsp;Since the dataframes are built from the contents of the image directories, there should be no missing values or duplicates.

### Data Wrangling

**Create a Validation Subset from Training Data**

In [None]:
# train_test_split returns (train_size portion, remainder).  The 20% portion
# is kept as the validation set and the remaining 80% stays as training data,
# which is why the unpacking order looks reversed.
holdout_part, remaining_part = train_test_split(
    trn_df,
    train_size=0.2,
    random_state=42,
    stratify=trn_df['classes'],
)
val_df, trn_df = holdout_part, remaining_part
val_df.sample(10)

### Process Images from DataFrames
I am not sure if I have enough images to effectively train my CNN model.  I am leery of rotating or flipping images during the `ImageDataGenerator()` process; therefore, I will only adjust their brightness.

In [None]:
bs = 32

# Fix the label -> index mapping from the training labels so all three
# splits encode classes identically, even if a split is missing a class.
class_names = sorted(trn_df['classes'].unique())

# Augmenting generator (random brightness) -- training data only
gen = ImageDataGenerator(rescale=1/255, brightness_range=(0.5, 1.5))

# Rescale-only generator for validation and test: evaluation data must not
# be augmented, otherwise the metrics are measured on altered images
eval_gen = ImageDataGenerator(rescale=1/255)

trn_gen = gen.flow_from_dataframe(trn_df, x_col='paths', y_col='classes',
                                  batch_size=bs, target_size=img_size,
                                  classes=class_names,
                                  class_mode='categorical',
                                  shuffle=True, seed=42)

val_gen = eval_gen.flow_from_dataframe(val_df, x_col='paths', y_col='classes',
                                       batch_size=bs, target_size=img_size,
                                       classes=class_names,
                                       class_mode='categorical',
                                       shuffle=False)

# shuffle=False keeps predictions aligned with the rows of tst_df
tst_gen = eval_gen.flow_from_dataframe(tst_df, x_col='paths', y_col='classes',
                                       batch_size=16, target_size=img_size,
                                       classes=class_names,
                                       class_mode='categorical',
                                       shuffle=False)

### Examine a few Images and their Labels

In [None]:
# Map class index -> class name.  The names are ordered by their index
# explicitly instead of relying on dict key order, and the builtin `dict`
# is no longer shadowed.
class_indices = trn_gen.class_indices
classes = sorted(class_indices, key=class_indices.get)

# Grab one batch of images and their one-hot labels
images, labels = next(tst_gen)

plt.figure(figsize=(20,20))
# The grid is 4x4, so show at most 16 images from the batch
for i, (image, label) in enumerate(zip(images[:16], labels[:16])):
    plt.subplot(4,4,i+1)
    plt.imshow(image)
    # The position of the largest label entry is the class index
    class_name = classes[np.argmax(label)]
    plt.title(class_name, color='k', fontsize=15)

plt.show()

### Baseline Model

In [None]:
# Create CNN model
backend.clear_session()

# The image generators already rescale pixels by 1/255, so no Rescaling
# layer is applied here; a second rescale would shrink inputs to [0, 1/255].
inputs  = Input(shape=img_shape)

# Conv Layer 1
conv1   = Conv2D(filters=28, kernel_size=5, padding='same',
                 activation='relu')(inputs)
pool1   = MaxPool2D()(conv1)

# Conv Layer 2
conv2   = Conv2D(filters=56, kernel_size=5, padding='same',
                 activation='relu')(pool1)
pool2   = MaxPool2D()(conv2)

# Conv Layer 3
conv3   = Conv2D(filters=128, kernel_size=5, padding='same',
                 activation='relu')(pool2)
pool3   = MaxPool2D()(conv3)

# Conv Layer 4
conv4   = Conv2D(filters=256, kernel_size=5, padding='same',
                 activation='relu')(pool3)
pool4   = MaxPool2D()(conv4)

# Apply Batch Normalization, Flatten, and Dense Layers
batch_norm = BatchNormalization()(pool4)
flatten = Flatten()(batch_norm)
dense   = Dense(512, activation='relu')(flatten)

# Pull the model together
preds   = Dense(num_classes, activation='softmax')(dense)

model_base = Model(inputs, preds)

# Compile base model
# Metrics are named explicitly so the history keys are always exactly
# 'precision' / 'recall' (and their 'val_' versions), as plot_history expects.
model_base.compile(optimizer='Adam', loss='categorical_crossentropy',
                   metrics=['accuracy', Precision(name='precision'),
                            Recall(name='recall')])

# Print summary of model
model_base.summary()

# Plot model
plot_model(model_base, show_shapes=True)

# Fit data to model
# batch_size is not passed: the generators already yield batches, and Keras
# does not accept batch_size together with generator input.
hist_base = model_base.fit(trn_gen, epochs=50,
                           validation_data=val_gen,
                           callbacks=[early_stop])

# Plot training results
plot_history(hist_base)

# Evaluate test data
model_evaluate(model_base, tst_gen)