-------------------------------------------------------
- Wiley Winters
- MSDS 686 Deep Learning
- Week 7-8 Kaggle Project&nbsp;&mdash;&nbsp;Brain Tumor Classification
- 2025-MAR-
--------------------------------------------------------

## Requirements

----------------------------------------------
### Required for 80%
Complete project on *kaggle.com* using the skills learned in the <u>Deep Learning</u> class.  The following are required:
- Show/plot sample images or data with labels
- Include at least one of the following
  - Convolution
  - Max Pooling
  - Batch Normalization
  - Dropout
  - LSTM
  - TF-IDF
- Use validation data
- Evaluate model on test data

-------------------------------------------
### Additional for another 20%
- Use data augmentation
- Use at least one of the following:
  - Kernels
  - Activation functions
  - Loss functions
  - Libraries
  - Methods
- Learning rate optimization
- Functional API model
- Transfer learning with or without trainable parameters
- Confusion matrix and / or ROC plots
- Plots of accuracy/loss vs epochs
- Show/plot sample incorrect prediction with labels and correct label

----------------------------------------------------------------
## Load Libraries and Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import os, logging, random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# openCV2 image manipulation library
import cv2 as cv

# TensorFlow likes to display a lot of debug information
# on my home system
# I will squash the messages
# The environment variable must be set before TensorFlow is imported below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Logger names are case-sensitive: TensorFlow's Python logger is registered
# as 'tensorflow', so any other spelling configures an unrelated logger.
logging.getLogger('tensorflow').setLevel(logging.FATAL)

# TensorFlow and Keras APIs for convolutional neural networks (CNNs)
import tensorflow as tf
from tensorflow import keras
from keras.applications import Xception
from tensorflow.keras import backend
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Rescaling
from tensorflow.keras.layers import Conv2D, MaxPool2D, AveragePooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import RMSprop, Adam

# Make plots have guidelines
plt.style.use('ggplot')

# Squash Python warnings
import warnings
warnings.filterwarnings('ignore')

### Set Random Seed for Reproducibility

In [None]:
# Seed every random number generator the notebook relies on.
# TensorFlow keeps its own generator, which the NumPy and Python seeds
# do not cover, so it has to be seeded separately.
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

### Declare Global Variables

In [None]:
# Define training and testing image directories
home_dir = '/home/wiley'
trn_dir = home_dir+'/regis/dataScience/kaggleProject/images/data/training'
tst_dir = home_dir+'/regis/dataScience/kaggleProject/images/data/testing'

# Alternate environment: swap these in for the three lines above
#home_dir = '/disk01/e384698'
#trn_dir = home_dir+'/msds686/week7/images/data/training'
#tst_dir = home_dir+'/msds686/week7/images/data/testing'

# Define classes; the list position of a name is its integer class index
labels = ['negative', 'positive']

# Define early_stop callback: stop once validation accuracy has not improved
# for 3 consecutive epochs and roll back to the best weights seen
early_stop = EarlyStopping(monitor='val_accuracy', patience=3,
                           restore_best_weights=True)

# Image size and shape; the shape is derived from the size (plus 3 colour
# channels) so the two values cannot drift apart
img_size = (224, 224)
img_shape = img_size + (3,)

# Number of classes, derived from the label list so the two always agree
num_classes = len(labels)

## Define Functions

### Load DataFrames
- Join image filename with path information
- Create labels from class directory names
- Create dataframe
- Randomize dataframe rows

In [None]:
def load_dataframe(path):
    """Build a shuffled DataFrame of image paths and their class labels.

    Every sub-directory of ``path`` is treated as one class: the directory
    name becomes the label and every entry inside it becomes one row.
    Entries of ``path`` that are not directories are ignored.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per class.

    Returns
    -------
    pandas.DataFrame
        Columns ``paths`` (full path of each image) and ``labels`` (name of
        the class directory), shuffled with a fixed seed and re-indexed from
        0. The frame is empty, with the same two columns, when no images
        are found.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # seeded shuffle below yield the same row order on every machine.
    for label in sorted(os.listdir(path)):
        label_dir = os.path.join(path, label)
        if not os.path.isdir(label_dir):
            continue
        for image in sorted(os.listdir(label_dir)):
            records.append((os.path.join(label_dir, image), label))

    df = pd.DataFrame(records, columns=['paths', 'labels'])
    if df.empty:
        # Nothing to shuffle; return the empty frame with its columns intact.
        return df

    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    return df

### Load Data and Perform Image Standardization

In [None]:
# Function to load data and resize images
def load_data(data_path, labels, img_size=224):
    """Load all images of the given classes as resized RGB arrays.

    Parameters
    ----------
    data_path : str
        Directory that contains one sub-directory per entry of ``labels``.
    labels : list of str
        Class directory names; the position of a name in this list is the
        integer label assigned to its images.
    img_size : int, optional
        Edge length in pixels; every image is resized to
        ``img_size`` x ``img_size``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` stacks the resized images and ``y`` holds the
        matching integer class indices.

    Notes
    -----
    Files that OpenCV cannot decode are skipped with a logged warning
    instead of failing later inside the colour conversion.
    """
    X, y = [], []

    for class_index, label in enumerate(labels):
        label_path = os.path.join(data_path, label)
        # Sorted listing keeps the sample order (and therefore any seeded
        # split performed afterwards) independent of the filesystem.
        for img_name in sorted(os.listdir(label_path)):
            img_path = os.path.join(label_path, img_name)
            img = cv.imread(img_path)
            if img is None:
                # imread signals an unreadable/non-image file with None
                logging.warning('Skipping unreadable image: %s', img_path)
                continue
            # OpenCV loads images as BGR; convert to RGB
            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            img = cv.resize(img, (img_size, img_size))
            X.append(img)
            y.append(class_index)

    return np.array(X), np.array(y)

### Plot Performance Metrics
Plot the following:
- Training loss
- Validation loss
- Training Accuracy
- Validation Accuracy
- Training Precision
- Validation Precision
- Training Recall
- Validation Recall

In [None]:
# Plot performance Metrics
def _find_metric_key(logs, base):
    """Return the key in ``logs`` that holds the training metric ``base``.

    Keras may append a numeric suffix to metric names (for example
    ``precision_1``), so fall back to the first key that starts with
    ``base + '_'``. Validation keys start with ``val_`` and never match.
    """
    if base in logs:
        return base
    for key in logs:
        if key.startswith(base + '_'):
            return key
    raise KeyError(f'No metric named {base!r} found in training history')


def plot_history(history):
    """Plot loss, accuracy, precision and recall against the epoch number.

    Parameters
    ----------
    history : object
        Object whose ``history`` attribute maps metric names to per-epoch
        value lists, as returned by ``Model.fit``. It must contain ``loss``,
        ``accuracy``, ``precision`` and ``recall`` (optionally suffixed).
        The matching ``val_*`` series are drawn when present.

    Raises
    ------
    KeyError
        If one of the four training metrics is missing from the history.
    """
    logs = history.history
    epochs = range(1, len(logs['loss']) + 1)
    plt.figure(figsize=(20,12))

    # One panel per metric, laid out on a 2x2 grid
    panels = ['loss', 'accuracy', 'precision', 'recall']
    for position, base in enumerate(panels, start=1):
        title = base.capitalize()
        key = _find_metric_key(logs, base)
        val_key = 'val_' + key

        plt.subplot(2, 2, position)
        plt.plot(epochs, logs[key], 'b', label=f'Training {title}')
        if val_key in logs:
            plt.plot(epochs, logs[val_key], 'r', label=f'Validation {title}')
        plt.title(f'Training and Validation {title}')
        plt.xlabel('Epochs')
        plt.ylabel(title)
        plt.legend()

    plt.suptitle('Model Training Metrics over Epochs', fontsize=16)
    plt.show()

### Display Raw Images

In [None]:
def visualize_images(path, num_images=5):
    """Display a random sample of the images stored in ``path``.

    Parameters
    ----------
    path : str
        Directory whose entries are image files.
    num_images : int, optional
        Maximum number of images to show; capped at the number of entries
        in ``path``.

    Returns
    -------
    None
        The images are drawn side by side, each titled with its file name.
        A message is printed and nothing is drawn when ``path`` is empty.
    """
    # Sorted so a seeded random.sample picks the same files everywhere
    image_filenames = sorted(os.listdir(path))
    num_images = min(num_images, len(image_filenames))
    if num_images < 1:
        print(f'No images found in {path}')
        return

    sample = random.sample(image_filenames, num_images)
    # squeeze=False always yields a 2-D array of axes, so indexing also
    # works when only a single image is shown
    fig, axes = plt.subplots(1, num_images, figsize=(15,3), facecolor='grey',
                             squeeze=False)

    for ax, image_filename in zip(axes[0], sample):
        image_path = os.path.join(path, image_filename)
        image = mpimg.imread(image_path)

        ax.imshow(image)
        ax.axis('off')
        ax.set_title(image_filename)

    # Layout and display belong to this figure, so they run inside the
    # function rather than once at definition time
    plt.tight_layout()
    plt.show()

### Score Model's Performance on Test Data
Will measure model performance on the following scores:
- Accuracy
- Precision
- Recall 

In [None]:
# Print results from test data
def model_evaluate(model, ds):
    """Evaluate ``model`` on ``ds`` and print the resulting scores.

    Parameters
    ----------
    model : object
        Any object exposing ``evaluate(ds)``, e.g. a compiled Keras model.
    ds : object
        Dataset passed straight through to ``model.evaluate``.

    Returns
    -------
    object
        Whatever ``model.evaluate(ds)`` returned, unchanged.

    Notes
    -----
    The first value returned by ``evaluate`` is the loss; the metrics
    follow. The printed names assume the metrics were compiled in the order
    accuracy, precision, recall -- confirm against the ``compile`` call.
    """
    score = model.evaluate(ds)
    # evaluate() returns a bare scalar when the model has no metrics
    values = score if isinstance(score, (list, tuple)) else [score]
    names = ['Test Loss', 'Test Accuracy Score', 'Test Precision', 'Test Recall']

    print('-' * 30)
    print('\033[1m'+'Test results:'+'\033[0m')
    for name, value in zip(names, values):
        print(f'{name}: {value:.4f}')
    print('-' * 30)

    return score

## Load Data
I will initially load the image paths and labels into pandas DataFrames for EDA and analysis.

In [None]:
# Build one DataFrame of image paths / labels per split
trn_df = load_dataframe(trn_dir)
tst_df = load_dataframe(tst_dir)

# Preview the first ten rows of each split as a markdown table
previews = (
    ('-->Training DataFrame:\n', trn_df),
    ('-->Testing DataFrame:\n', tst_df),
)
for heading, frame in previews:
    print(heading, frame.head(10).to_markdown())

## EDA

### Look at Training Images' Distribution

In [None]:
# Bar chart of the number of training images per class
fig, ax = plt.subplots(figsize=(6,4))
trn_counts = trn_df['labels'].value_counts()
trn_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Image Counts in Training Data')
ax.set_xlabel('Category')
ax.set_ylabel('Image Count')
plt.show()

Classes are imbalanced.

### Look at Testing Images' Distribution

In [None]:
# Bar chart of the number of testing images per class
fig, ax = plt.subplots(figsize=(6,4))
tst_counts = tst_df['labels'].value_counts()
tst_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Image Counts in Testing Data')
ax.set_xlabel('Category')
ax.set_ylabel('Image Count')
plt.show()

The distribution mirrors that of the *training data*, but with fewer images per class.

<b><span style='color:red'>NOTE:</span></b>&nbsp;Classes are imbalanced, which may cause the model to favor the majority class over the minority one. This will have to be handled.  Some options include:
- Use a different evaluation metric such as **F1 Score**
- Use a different evaluate metric such as **F1 Score**
- Resampling such as over or under sampling
- For image data I can use augmentation to attempt to balance the classes
- Tell model to give more weight to the minority class through a loss function

### Examine Shape of Training and Testing DataFrames

In [None]:
# Report (rows, columns) of each DataFrame
for heading, frame in (('Training Shape:\n', trn_df),
                       ('Testing Shape:\n', tst_df)):
    print(heading, frame.shape)

<b><span style='color:red'>NOTE:</span></b>&nbsp;Since the dataframes are built from the contents of the image directories, there should be no missing values or duplicates.

### View a Few Images
<b><span style='color:orange'>TO DO</span></b>

In [None]:
# Show a random sample of raw training images from the 'positive' class
# directory; each image is titled with its file name
visualize_images(trn_dir+'/positive')

### Data Wrangling

### Load Image and Label information into tensors
I will use the standard *X_train*, *X_test*, *y_train*, and *y_test* for the variable names.  I see this convention quite often when researching AI/ML topics and it was presented this way in ***MSDS 680 Machine Learning***.  This maps out to the following:
- X_train&nbsp;&mdash;&nbsp;Training Data
- y_train&nbsp;&mdash;&nbsp;Training Labels
- X_test&nbsp;&mdash;&nbsp;Testing Data
- y_test&nbsp;&mdash;&nbsp;Testing Labels

In [None]:
# Load data into np arrays and standardize image size and color depth
X_train, y_train = load_data(trn_dir, labels)
X_test, y_test = load_data(tst_dir, labels)

# Normalize pixel data from 0-255 down to the 0-1 range
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# Apply one-hot encoding to labels
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Subset validation data from training dataset.
# The classes are imbalanced, so stratify on the class index (argmax of the
# one-hot rows) to keep the same class ratio in both subsets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42,
    stratify=y_train.argmax(axis=1))

### Data Augmentation
The number of images in this dataset is relatively small; therefore, I will apply image augmentation to the training set

In [None]:
# Configure random augmentations: rotation, horizontal/vertical shift,
# shear, zoom and brightness, each drawn from the ranges given below.
# NOTE(review): X_train is divided by 255 before this cell -- confirm that
# brightness_range behaves as intended on inputs scaled to [0, 1].
gen = ImageDataGenerator(
      rotation_range=20,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      brightness_range=(0.5, 1.5))

# Only have to apply augmentation to training data
# NOTE(review): the training cell visible in this notebook builds its
# dataset directly from X_train / y_train and does not reference gen --
# confirm the augmented batches (e.g. from gen.flow) actually reach fit().
# NOTE(review): presumably fit() only matters for the featurewise / ZCA
# options, none of which are set above -- confirm this call is needed.
gen.fit(X_train)

### Model Architecture
This model is a Convolutional Neural Network (CNN) designed to classify images into two categories (*negative* or *positive*).  It consists of three convolution layers separated by pooling.  After the convolution layers perform their tasks, the results are passed to two dense layers for final classification

In [None]:
# Create CNN model
# Start from a clean Keras state so layer/metric names are not suffixed
backend.clear_session()

inputs  = Input(shape=(img_shape))

# Conv Layer 1
conv1   = Conv2D(filters=28, kernel_size=5, padding='same',
                 activation='relu')(inputs)
pool1   = MaxPool2D()(conv1)

# Conv Layer 2
conv2   = Conv2D(filters=56, kernel_size=5, padding='same',
                 activation='relu')(pool1)
pool2   = MaxPool2D()(conv2)

# Conv Layer 3
conv3   = Conv2D(filters=128, kernel_size=5, padding='same',
                 activation='relu')(pool2)
pool3   = MaxPool2D()(conv3)

# Apply Batch Normalization, Flatten, and Dense Layers
batch3  = BatchNormalization()(pool3)
flatten = Flatten()(batch3)
dense   = Dense(128, activation='relu')(flatten)
# Drop half of the activations during training. The rate must be written
# with a decimal point: Dropout(0,5) means rate=0 (no dropout) with a
# noise_shape of 5.
dropout = Dropout(0.5)(dense)
dense1   = Dense(512, activation='relu')(dropout)

# Pull the model together: one softmax output per class
preds   = Dense(num_classes, activation='softmax')(dense1)

model_base = Model(inputs, preds)

# Compile base model; the metric order here is the order in which
# evaluate() reports values after the loss
model_base.compile(optimizer='Adam', loss='categorical_crossentropy',
                   metrics=['accuracy', Precision(), Recall()])

# Print summary of model
model_base.summary()

# Plot model
plot_model(model_base, show_shapes=True)

In [None]:
# Extract training and validation datasets from tensors.
# The batch size is fixed here, where the datasets are batched.
batch_size = 8
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

# Fit datasets to model.
# batch_size is deliberately not passed to fit(): the datasets already yield
# batches, so a second (different) value there is contradictory and is not
# supported for dataset inputs.
hist_base = model_base.fit(train_ds, epochs=50,
                           validation_data=val_ds,
                           callbacks=[early_stop])

In [None]:
# Plot training results (loss, accuracy, precision and recall per epoch)
plot_history(hist_base)

# Evaluate test data
# NOTE(review): the call below is disabled and refers to tst_gen, which is
# not defined in the cells visible here -- confirm before re-enabling.
#model_evaluate(model_base, tst_gen)

In [None]:
# evaluate() returns the loss followed by one value per compiled metric.
# model_base is compiled with accuracy, precision and recall, so four values
# come back; unpacking only two raises ValueError.
test_loss, test_acc, test_precision, test_recall = model_base.evaluate(X_test, y_test)
print('-->Test Loss:     ', test_loss)
print('-->Test Accuracy: ', test_acc)
print('-->Test Precision:', test_precision)
print('-->Test Recall:   ', test_recall)