-------------------------------------------------------
- Wiley Winters
- MSDS 686 Deep Learning
- Week 7-8 Kaggle Project&nbsp;&mdash;&nbsp;Brain Tumor Classification
- 2025-MAR-
--------------------------------------------------------

### Requirements

----------------------------------------------
**Required for 80%**

Complete project on *kaggle.com* using the skills learned in the <u>Deep Learning</u> class.  The following are required:
- Show/plot sample images or data with labels
- Include at least one of the following:
  - Convolution
  - Max Pooling
  - Batch Normalization
  - Dropout
  - LSTM
  - TF-IDF
- Use validation data
- Evaluate model on test data

-------------------------------------------
**Additional for another 20%**

- Use data augmentation
- Use at least one of the following:
  - Kernels
  - Activation functions
  - Loss functions
  - Libraries
  - Methods
- Learning rate optimization
- Functional API model
- Transfer learning with or without trainable parameters
- Confusion matrix and / or ROC plots
- Plots of accuracy/loss vs epochs
- Show/plot sample incorrect prediction with labels and correct label

----------------------------------------------------------------
### Load Libraries and Packages

In [None]:
import os, random, pathlib, copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

# Import openCV and Pillow APIs
import cv2 as cv
from PIL import Image

# Libraries for general machine learning tasks and measuring performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# PyTorch API
import torch, torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Module, Conv2d, Linear, MaxPool2d, ReLU
from torch.nn import LogSoftmax
from torch import flatten
from torch import optim
from torchvision.transforms import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.autograd import Variable
from torchsummary import summary

# Print status bars
from tqdm.notebook import trange, tqdm

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Make plots have guidelines
plt.style.use('ggplot')

# Display plots inline
%matplotlib inline

**Set Random Seed for Reproducibility**

In [None]:
# One seed shared by Python's, NumPy's and PyTorch's random number generators
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

**Declare Global Variables**

In [None]:
# Image directories, built as path objects.
# Each split lives in its own sub-directory of the shared data folder, and each
# variable is derived from its own folder name so the splits cannot be mixed up.
home_dir = pathlib.Path('/home/wiley/regis/dataScience')
data_dir = home_dir / 'msds686' / 'week7' / 'kaggleProject' / 'images' / 'data'
trn_dir = data_dir / 'training'
tst_dir = data_dir / 'testing'
val_dir = data_dir / 'validation'

# Classes
classes = ['negative', 'positive']
num_classes = len(classes)

# Image size (height, width) and channels-last image shape
img_size = (256, 256)
img_shape = (*img_size, 3)

# Make sure pyTorch uses the GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Define Functions**

In [None]:
# Plot loss and accuracy curves for training and validation
def plot_history(history):
    """Draw training/validation loss and accuracy against the epoch number.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping with the keys ``'loss'``,
        ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``; each value is
        plotted against 1..len(history.history['accuracy']).
        NOTE(review): this looks like the layout of a Keras ``History``
        object. ``train_val`` in this notebook returns plain dicts keyed
        ``'train'``/``'val'``, which do not have this layout.
    """
    logs = history.history
    epochs = range(1, len(logs['accuracy']) + 1)
    plt.figure(figsize=(20,12))

    # (subplot position, training key, validation key,
    #  training label, validation label, title, y-axis label)
    panels = (
        (1, 'loss', 'val_loss',
         'Training Loss', 'Validation Loss',
         'Training and Validation Loss', 'Loss'),
        (2, 'accuracy', 'val_accuracy',
         'Training Accuracy', 'Validation Accuracy',
         'Training and Validation Accuracy', 'Accuracy'),
    )
    for position, trn_key, val_key, trn_label, val_label, title, y_label in panels:
        plt.subplot(2, 2, position)
        plt.plot(epochs, logs[trn_key], 'b', label=trn_label)
        plt.plot(epochs, logs[val_key], 'r', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.suptitle('Model Training Metrics over Epochs', fontsize=16)
    plt.show()

In [None]:
# Report loss and accuracy on the test data
def print_test(model, test_data, test_labels):
    """Evaluate ``model`` on the test set and print a short report.

    ``model`` must provide ``evaluate(test_data, test_labels)`` returning a
    ``(loss, accuracy)`` pair; both numbers are printed rounded to four
    decimals. Nothing is returned.
    NOTE(review): ``cnn_tumor`` in this notebook defines no ``evaluate``
    method, so this helper appears to target a different model API.
    """
    scores = model.evaluate(test_data, test_labels)
    test_loss, test_acc = scores
    rule = '---------------------------------------------------'
    print(
        rule,
        '\033[1m'+'Test results:'+'\033[0m',
        f'Test loss:{round(test_loss,4)}',
        f'Test accuracy:{round(test_acc,4)}',
        rule,
        sep='\n',
    )

In [None]:
# Function to determine the output size of a convolutional layer
# (optionally followed by pooling) in a NN
def findConv2dOutShape(hin, win, conv, pool=2):
    """Return the (height, width) produced by ``conv`` and an optional pooling.

    Parameters
    ----------
    hin, win : input height and width.
    conv : object with ``kernel_size``, ``stride``, ``padding`` and
        ``dilation`` attributes, each indexable as ``[0]`` (height) and
        ``[1]`` (width).
    pool : divisor applied to both dimensions after the convolution; a falsy
        value (``None`` or ``0``) skips the division.

    Returns
    -------
    tuple of int
        ``(hout, wout)``, truncated to integers.
    """
    def _conv_extent(size, axis):
        # floor((size + 2*padding - dilation*(kernel - 1) - 1) / stride + 1)
        reach = conv.dilation[axis] * (conv.kernel_size[axis] - 1)
        return np.floor((size + 2 * conv.padding[axis] - reach - 1) / conv.stride[axis] + 1)

    hout, wout = _conv_extent(hin, 0), _conv_extent(win, 1)

    if pool:
        hout, wout = hout / pool, wout / pool

    return int(hout), int(wout)

In [None]:
# Function to get the learning rate
def get_lr(opt):
    """Return the ``'lr'`` entry of the first parameter group of ``opt``.

    Returns ``None`` when ``opt.param_groups`` is empty.
    """
    first_group = next(iter(opt.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

In [None]:
# Function to compute the loss value per batch
# NOTE: the following cell defines loss_batch again with the same behaviour;
# that later definition is the one in effect once both cells have run.
def loss_batch(loss_func, output, target, opt=None):
    """Score one batch and, if an optimizer is given, take one training step.

    Parameters
    ----------
    loss_func : callable applied as ``loss_func(output, target)``.
    output : tensor of per-class scores; the predicted class is the argmax
        along dim 1.
    target : tensor of class indices, reshaped to match the predictions.
    opt : optional optimizer. When not ``None`` the gradients are zeroed, the
        loss is back-propagated and ``opt.step()`` is called.

    Returns
    -------
    tuple
        ``(loss value as a Python float, number of correct predictions)``.
    """
    batch_loss = loss_func(output, target)

    # Predicted class per row, kept as a column so it lines up with target
    predicted = output.argmax(dim=1, keepdim=True)
    n_correct = predicted.eq(target.view_as(predicted)).sum().item()

    is_training_step = opt is not None
    if is_training_step:
        opt.zero_grad()
        batch_loss.backward()
        opt.step()

    return batch_loss.item(), n_correct

In [None]:
# Compute the loss value and performance per batch
def loss_batch(loss_func, output, target, opt=None):
    """Return the loss and the count of correct predictions for one batch.

    ``loss_func(output, target)`` gives the loss. A prediction is the index
    of the largest value of ``output`` along dim 1, and it is correct when it
    equals the matching entry of ``target``. If ``opt`` is not ``None`` one
    optimization step (zero_grad, backward, step) is performed.

    Returns ``(loss.item(), n_correct)``.
    """
    loss = loss_func(output, target)
    top_class = torch.argmax(output, dim=1, keepdim=True)
    hits = torch.eq(top_class, target.view_as(top_class))

    if opt is not None:
        # One gradient-descent update driven by this batch's loss
        opt.zero_grad()
        loss.backward()
        opt.step()

    return loss.item(), hits.sum().item()

In [None]:
# Compute the loss and performance per epoch
def loss_epoch(model, loss_func, dataset_dl, opt=None):
    """Run ``model`` over every batch of ``dataset_dl`` and average the results.

    Each batch is moved to the global ``device``, passed through ``model`` and
    scored with ``loss_batch`` (which also performs an optimizer step when
    ``opt`` is given). The summed batch losses and correct-prediction counts
    are both divided by ``len(dataset_dl.dataset)``.

    Returns
    -------
    tuple of float
        ``(loss per sample, fraction of correct predictions)``.
    """
    n_samples = len(dataset_dl.dataset)
    loss_total, hits_total = 0.0, 0.0

    for inputs, labels in dataset_dl:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_loss, batch_hits = loss_batch(loss_func, model(inputs), labels, opt)
        loss_total += batch_loss
        if batch_hits is not None:
            hits_total += batch_hits

    return loss_total / float(n_samples), hits_total / float(n_samples)

In [None]:
# Training and Evaluation Function
def train_val(model, params, verbose=False):
    """Train ``model``, validate after every epoch and keep the best weights.

    Parameters
    ----------
    model : the network to train (modified in place).
    params : dict with the keys
        ``'epochs'``      number of epochs,
        ``'f_loss'``      loss function,
        ``'optimizer'``   optimizer,
        ``'train'``       training data loader,
        ``'val'``         validation data loader,
        ``'lr_change'``   scheduler, stepped with the validation loss,
        ``'weight_path'`` file that receives the best weights.
    verbose : print per-epoch progress when truthy.

    Returns
    -------
    tuple
        ``(model, loss_history, metric_history)``. The model carries the
        weights with the lowest validation loss; both histories are dicts
        with ``'train'`` and ``'val'`` lists holding one value per epoch.
    """
    n_epochs = params['epochs']
    criterion = params['f_loss']
    optimizer = params['optimizer']
    train_loader = params['train']
    val_loader = params['val']
    scheduler = params['lr_change']
    checkpoint_file = params['weight_path']

    loss_history = {'train': [], 'val': []}
    metric_history = {'train': [], 'val': []}
    best_state = copy.deepcopy(model.state_dict())
    lowest_val_loss = float('inf')

    for epoch in tqdm(range(n_epochs)):

        # Learning rate in force at the start of the epoch
        lr_at_start = get_lr(optimizer)
        if verbose:
            print('Epoch {}/{}, current lr={}'.format(epoch, n_epochs - 1, lr_at_start))

        # Training pass (with optimizer steps)
        model.train()
        train_loss, train_metric = loss_epoch(model, criterion, train_loader, optimizer)
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)

        # Validation pass (no gradients, no optimizer)
        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, criterion, val_loader)

        # New best validation loss: snapshot the weights in memory and on disk
        if val_loss < lowest_val_loss:
            lowest_val_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), checkpoint_file)
            if verbose:
                print('Copied best model weights!')

        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)

        # If the scheduler changed the learning rate, restart from the best
        # weights seen so far
        scheduler.step(val_loss)
        if lr_at_start != get_lr(optimizer):
            if verbose:
                print('Loading best model weights!')
            model.load_state_dict(best_state)

        if verbose:
            print(f"train loss: {train_loss:.6f}, dev loss: {val_loss:.6f}, accuracy: {100*val_metric:.2f}")
            print("-"*10)

    # Leave the model holding the best weights
    model.load_state_dict(best_state)

    return model, loss_history, metric_history

### Define Binary Classifier
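This model is a small convolutional network: four 3&times;3 convolution layers, each followed by ReLU and 2&times;2 max pooling, then two fully connected layers with dropout between them. It returns log-probabilities (`log_softmax`) for each class.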

In [None]:
class cnn_tumor(nn.Module):
    """CNN classifier: four conv + ReLU + max-pool stages and two linear layers.

    ``params`` is a dict with the keys
        ``'shape_in'``        (channels, height, width) of one input image,
        ``'initial_filters'`` output channels of the first convolution
                              (doubled by each following convolution),
        ``'num_fc1'``         output width of the first fully connected layer,
        ``'num_classes'``     number of output classes,
        ``'dropout_rate'``    dropout probability applied after ``fc1``.

    The input width of ``fc1`` is derived from ``shape_in``, so ``shape_in``
    must match the size of the images actually fed to ``forward``.
    ``forward`` returns log-probabilities (``log_softmax`` over dim 1).
    """

    # Network Initialization
    def __init__(self, params):
        super().__init__()
        Cin, Hin, Win = params['shape_in']
        init_f = params['initial_filters']
        num_fc1 = params['num_fc1']
        num_classes = params['num_classes']
        self.dropout_rate = params['dropout_rate']

        # Convolution layers; (h, w) tracks the feature-map size after each
        # convolution and the 2x2 pooling applied to it in forward()
        self.conv1 = nn.Conv2d(Cin, init_f, kernel_size=3)
        h, w = findConv2dOutShape(Hin, Win, self.conv1)
        self.conv2 = nn.Conv2d(init_f, 2*init_f, kernel_size=3)
        h, w = findConv2dOutShape(h, w, self.conv2)
        self.conv3 = nn.Conv2d(2*init_f, 4*init_f, kernel_size=3)
        h, w = findConv2dOutShape(h, w, self.conv3)
        self.conv4 = nn.Conv2d(4*init_f, 8*init_f, kernel_size=3)
        h, w = findConv2dOutShape(h, w, self.conv4)

        # Flattened size of the last feature map: height * width * channels
        self.num_flatten = h*w*8*init_f

        # fc1's output width and fc2's input width are both num_fc1, so the
        # two layers always fit together
        self.fc1 = nn.Linear(self.num_flatten, num_fc1)
        self.fc2 = nn.Linear(num_fc1, num_classes)

    def forward(self, X):
        # Convolution & Pool Layers
        X = F.relu(self.conv1(X))
        X = F.max_pool2d(X, 2, 2)
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2, 2)
        X = F.relu(self.conv3(X))
        X = F.max_pool2d(X, 2, 2)
        X = F.relu(self.conv4(X))
        X = F.max_pool2d(X, 2, 2)

        # Flatten everything except the batch dimension
        X = X.view(X.size(0), -1)
        X = F.relu(self.fc1(X))

        # F.dropout defaults to training=True; tie it to the module's mode so
        # that model.eval() really switches dropout off
        X = F.dropout(X, self.dropout_rate, training=self.training)
        X = self.fc2(X)
        return F.log_softmax(X, dim=1)

### Load Data
The image paths and class labels are collected by walking the image folders one class directory at a time, so the rows of each dataframe come out grouped by brain tumor class. To remove this artificial grouping, the dataframes are shuffled after they are built.

In [None]:
# Index the images of each split into a pandas dataframe for EDA
def _labelled_image_paths(split_dir):
    """List ``(label, image path)`` pairs for every file found in each class
    sub-directory of ``split_dir``; the sub-directory name is the label."""
    return [(label, os.path.join(split_dir, label, image))
            for label in os.listdir(split_dir)
            if os.path.isdir(os.path.join(split_dir, label))
            for image in os.listdir(os.path.join(split_dir, label))]

# Training data
labels, paths = zip(*_labelled_image_paths(trn_dir))
trn_df = pd.DataFrame({'paths': paths, 'labels': labels})

# Testing data
labels, paths = zip(*_labelled_image_paths(tst_dir))
tst_df = pd.DataFrame({'paths': paths, 'labels': labels})

# Validation data
labels, paths = zip(*_labelled_image_paths(val_dir))
val_df = pd.DataFrame({'paths': paths, 'labels': labels})

# Shuffle all three dataframes (fixed seed) and renumber their rows
trn_df, tst_df, val_df = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (trn_df, tst_df, val_df)
)

# Take a look at the first ten rows of each
for heading, frame in (('Training:\n', trn_df),
                       ('Testing:\n', tst_df),
                       ('Validation:\n', val_df)):
    print(heading, frame.head(10).to_markdown())

### EDA

**Look at Training Images' Distribution**

In [None]:
# Number of images per class label in each split
print('-->Training Labels Value Counts:\n', trn_df['labels'].value_counts())
print('-->Testing Labels Value Counts:\n', tst_df['labels'].value_counts())
print('-->Validation Labels Value Counts:\n', val_df['labels'].value_counts())

In [None]:
# Bar chart of the number of training images per class label
trn_counts = trn_df['labels'].value_counts()
fig, ax = plt.subplots(figsize=(6,4))
trn_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Image Counts in Training Data')
ax.set_xlabel('Category')
ax.set_ylabel('Image Count')
plt.show()

The distribution is a little unbalanced and I may have to perform some mitigation efforts to fix this, but will continue with this analysis as is for now.

**Look at Testing Images' Distribution**

In [None]:
# Bar chart of the number of testing images per class label
tst_counts = tst_df['labels'].value_counts()
fig, ax = plt.subplots(figsize=(6,4))
tst_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Image Counts in Testing Data')
ax.set_xlabel('Category')
ax.set_ylabel('Image Count')
plt.show()

Distribution mirrors what the *training data* shows, but with less frequency.

<span style='color:orange'>NOTE:</span>&nbsp;&nbsp;The classes are unbalanced, and this will have to be handled before fitting the data to the model.  There are a few methods I can use to balance the classes:
- Resampling Techniques
  - Oversampling
  - Under-sampling
- Class Weighting
  - Weighted Loss Functions
  - Weighted Random Sampler
- Synthetic Data Generation
  - SMOTE (Synthetic Minority Over-sampling Technique)
  - GANs (Generative Adversarial Networks)

I can also use the ***F1 Score***, which is not influenced as much by unbalanced classes as the accuracy score.

**Examine Shape of Training and Testing DataFrames**

In [None]:
# (rows, columns) of each dataframe
for heading, frame in (('Training Shape: \n', trn_df),
                       ('Testing Shape:  \n', tst_df),
                       ('Validation Shape: \n', val_df)):
    print(heading, frame.shape)

**NOTE:**&nbsp;&nbsp;Since the dataframes are built from the contents of the image directories, there should be no missing values or duplicates.

### Process Images

In [None]:
# Channel statistics used to normalize every split (ImageNet mean / std)
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training transform: resize, random augmentation, tensor conversion, normalization
transf = transforms.Compose([
    transforms.Resize(img_size),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
])

# Evaluation transform: the same resize and normalization as training so the
# model sees inputs of the same size and scale, but no random augmentation,
# which keeps validation and test scores repeatable
eval_transf = transforms.Compose([
    transforms.Resize(img_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
])

# Define objects for training, testing, and validation datasets
trn_ds = torchvision.datasets.ImageFolder(trn_dir, transform=transf)

# Testing and validation images are not augmented
tst_ds = torchvision.datasets.ImageFolder(tst_dir, transform=eval_transf)
val_ds = torchvision.datasets.ImageFolder(val_dir, transform=eval_transf)

### Visualize some Images

In [None]:
# Show a 4x4 grid of randomly chosen training images with their class names.

# Take the index -> name mapping from the dataset itself: ImageFolder numbers
# the class sub-directories in sorted order, so a hand-written mapping can
# silently end up inverted.
class_label = dict(enumerate(trn_ds.classes))

# If the dataset's transform normalizes the images, undo it for display so the
# pictures keep their original contrast instead of being clipped
norm = next((t for t in getattr(trn_ds.transform, 'transforms', [])
             if isinstance(t, transforms.Normalize)), None)

fig = plt.figure(figsize=(10,10))
cols, rows = 4, 4
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(trn_ds), size=(1,)).item()
    img, label = trn_ds[sample_idx]

    # channels-first tensor -> channels-last array for imshow
    img_np = img.numpy().transpose((1,2,0))
    if norm is not None:
        img_np = img_np * np.asarray(norm.std) + np.asarray(norm.mean)
    img_valid_rng = np.clip(img_np,0,1)

    fig.add_subplot(rows, cols, i)
    plt.title(class_label[label])
    plt.axis('off')
    plt.imshow(img_valid_rng)

plt.suptitle('Images')
plt.show()

### Create DataLoaders for Training, Testing, and Validation Images

In [None]:
# Settings shared by all three loaders
loader_opts = dict(batch_size=64, shuffle=True, num_workers=2)

# Training, testing and validation loaders
trn_loader = DataLoader(trn_ds, **loader_opts)
tst_loader = DataLoader(tst_ds, **loader_opts)
val_loader = DataLoader(val_ds, **loader_opts)

**Print Shape of Training and Validation DataSets**

In [None]:
# Print the shape of the first batch from the training and validation loaders
for split_name, loader in (('Training Data', trn_loader),
                           ('Validation Data', val_loader)):
    for X, y in loader:
        print(f'{split_name}:')
        print(f'Shape of X: {X.shape}')
        print(f'Shape of y: {y.shape} {y.dtype}')
        # only the first batch is needed
        break

### Initialize CNN Class

In [None]:
# Model hyper-parameters
model_params = {
    # (channels, height, width); taken from img_size so it always matches
    # the size the images are resized to
    'shape_in': (3, *img_size),
    'initial_filters': 8,
    # hidden width between the two fully connected layers
    'num_fc1': 64,
    'dropout_rate': 0.25,
    'num_classes': num_classes,
}

# Build the network and move it to the selected device
model_cnn = cnn_tumor(model_params)
model = model_cnn.to(device)

### Summarize Model

In [None]:
# Layer-by-layer summary of the model for one 3 x 256 x 256 input
summary_input = (3, 256, 256)
summary(model_cnn, input_size=summary_input, device=device.type)

### Define Loss Function and Optimizer
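The model's `forward` returns log-probabilities (`F.log_softmax`), so pairing it with `nn.NLLLoss` is equivalent to using cross-entropy loss (*log loss*), as described in the <u>*pyTorch*</u> documentation. `reduction='sum'` adds up the per-sample losses of a batch; `loss_epoch` then divides the running total by the number of samples to report the mean loss per sample.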

In [None]:
# Loss: negative log-likelihood, summed over the samples of a batch
loss_func = nn.NLLLoss(reduction='sum')

# Optimizer: Adam over all parameters of the model
opt = optim.Adam(model_cnn.parameters(), lr=0.0003)

# Scheduler: multiplies the learning rate by `factor` when the monitored value
# (mode='min') has not improved for `patience` epochs
plateau_opts = dict(mode='min', factor=0.5, patience=20, verbose=1)
lr_scheduler = ReduceLROnPlateau(opt, **plateau_opts)

### Model Training

In [None]:
# Scheduler handed to the training loop (silent: verbose=0)
plateau_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=20,
                                      verbose=0)

# Everything train_val needs: loaders, epoch count, optimizer, scheduler,
# loss function and the file that receives the best weights
trn_params = {
    'train': trn_loader,
    'val': val_loader,
    'epochs': 100,
    'optimizer': opt,
    'lr_change': plateau_scheduler,
    'f_loss': loss_func,
    'weight_path': 'weights.pt',
}

# Train and validate the model; keep the per-epoch loss and metric histories
model_cnn, loss_hist, metric_hist = train_val(model_cnn, trn_params)