# Import Libraries

In [None]:
# Standard libraries
import os  # OS operations
import shutil  # File operations

# Data handling
import pandas as pd  # DataFrames
import numpy as np  # Numerical ops

# Visualization
import matplotlib.pyplot as plt  # Plots
import seaborn as sns  # Statistical plots

# KaggleHub for downloading datasets/models
import kagglehub

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Computer vision
import cv2  # Image processing

# Scikit-learn utilities and metrics
from sklearn.utils import compute_class_weight
from sklearn.preprocessing import label_binarize
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score

# TensorFlow & Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Conv2D
from tensorflow.keras.layers import Activation, MaxPooling2D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

# Load Dataset

In [None]:
# Root folder of the brain-tumor MRI dataset. The split cell further down
# reads the 'Training' and 'Testing' sub-folders from this path.
# NOTE(review): this is a Kaggle-style input path while the split output goes
# to '/content/' — confirm both locations exist in the target runtime.
data = "/kaggle/input/brain-tumor-mri-dataset"

In [None]:
# Count the image files found in every directory below a root folder
def count_images_in_dirs(base_dir):
    """Map each directory under `base_dir` to the number of images it holds.

    The tree is walked recursively (including `base_dir` itself). A file
    counts as an image when its lower-cased name ends with .png, .jpg,
    .jpeg, .bmp or .gif. Directories without any such file are omitted.

    Args:
        base_dir: Root directory to scan.

    Returns:
        dict mapping directory path -> number of image files directly in it.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    tally = {}
    for folder, _subdirs, filenames in os.walk(base_dir):
        n_images = sum(
            1 for name in filenames if name.lower().endswith(image_suffixes)
        )
        # Skip folders that contain no images at all
        if n_images > 0:
            tally[folder] = n_images
    return tally

# Display the per-directory image counts of the raw dataset (cell output)
count_images_in_dirs(data)

# Data splitting into (Train - Validation - Test)

In [None]:
# Input and Output Paths
base_dir = '/content/'

train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

# Create output directories
for split_dir in [train_dir, validation_dir, test_dir]:
    os.makedirs(split_dir, exist_ok=True)

# Categories from your dataset
categories = ['pituitary', 'notumor', 'meningioma', 'glioma']

# Split ratios (applied to the "Training" folder only)
train_ratio = 0.8
validation_ratio = 0.2
test_ratio = 1  # Not used by the split logic: the whole "Testing" folder becomes the test set

# A fixed seed plus a sorted file listing makes the split reproducible, so
# re-running this cell copies the same files again instead of layering a
# different random split on top of the old one (which would leak training
# images into the validation folder).
split_seed = 42
_split_rng = np.random.default_rng(split_seed)


def _list_files(directory):
    """Return the sorted names of the regular files directly inside `directory`."""
    return sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )


def copy_files(file_list, src_dir, dest_dir, category_name=None):
    """Copy `file_list` from `src_dir` into `dest_dir/<category>`.

    Args:
        file_list: File names (relative to `src_dir`) to copy.
        src_dir: Directory the files currently live in.
        dest_dir: Root of the split (train/validation/test) directory.
        category_name: Class sub-folder to create under `dest_dir`. When
            omitted, the module-level loop variable `category` is used.
    """
    if category_name is None:
        category_name = category
    dest_category_dir = os.path.join(dest_dir, category_name)
    os.makedirs(dest_category_dir, exist_ok=True)
    for file in file_list:
        shutil.copy(os.path.join(src_dir, file), os.path.join(dest_category_dir, file))


# Split and copy
for category in categories:
    train_category_path = os.path.join(data, 'Training', category)
    test_category_path = os.path.join(data, 'Testing', category)

    # Process training and validation data
    if os.path.isdir(train_category_path):
        image_files = _list_files(train_category_path)
        _split_rng.shuffle(image_files)

        num_images = len(image_files)
        num_train = int(train_ratio * num_images)
        if np.isclose(train_ratio + validation_ratio, 1.0):
            # The ratios cover the whole folder: give validation everything
            # that is left so integer truncation never drops an image.
            num_val = num_images - num_train
        else:
            num_val = int(validation_ratio * num_images)

        train_files = image_files[:num_train]
        val_files = image_files[num_train:num_train + num_val]

        copy_files(train_files, train_category_path, train_dir, category)
        copy_files(val_files, train_category_path, validation_dir, category)
    else:
        print(f"Training directory not found for category: {category}")

    # Process test data
    if os.path.isdir(test_category_path):
        test_files = _list_files(test_category_path)
        copy_files(test_files, test_category_path, test_dir, category)
    else:
        print(f"Testing directory not found for category: {category}")

print("Data split completed!")

# Count images in each directory

In [None]:
# Report how many images ended up in each split directory
def _report_split(heading, split_root):
    """Print `heading`, then one line per image directory under `split_root`.

    Returns the directory -> count mapping so totals can be derived from it.
    """
    print(heading)
    per_dir = count_images_in_dirs(split_root)
    for directory, count in per_dir.items():
        print(f"  {directory}: {count} images")
    return per_dir


train_counts = _report_split("Image counts in Train Directory:", train_dir)
validation_counts = _report_split("\nImage counts in Validation Directory:", validation_dir)
test_counts = _report_split("\nImage counts in Test Directory:", test_dir)

# Overall number of images per split
total_train = sum(train_counts.values())
total_validation = sum(validation_counts.values())
total_test = sum(test_counts.values())

print(f"\nTotal images in train set: {total_train}")
print(f"Total images in validation set: {total_validation}")
print(f"Total images in test set: {total_test}")

# Display sample images

In [None]:
# Function to display sample images
def display_sample_images(directory, num_images=4):
    """Plot up to `num_images` sample images for every category, one row each.

    Args:
        directory: Split directory containing one sub-folder per entry of
            the module-level `categories` list.
        num_images: Maximum number of images shown per category (columns).

    Categories whose folder is missing are reported and skipped. Files that
    OpenCV cannot decode leave their grid cell empty instead of crashing.
    """
    plt.figure(figsize=(12, 8))
    for row, category in enumerate(categories):
        category_path = os.path.join(directory, category)
        if not os.path.isdir(category_path):
            print(f"Directory not found for category: {category}")
            continue

        # Sorted so the same samples are shown on every run
        image_files = sorted(
            f for f in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, f))
        )[:num_images]

        for col, img_file in enumerate(image_files):
            img = cv2.imread(os.path.join(category_path, img_file))
            if img is None:
                # cv2.imread signals an unreadable file by returning None
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB

            # Position derived from (row, col) so each category keeps its own
            # row even when it has fewer than num_images files
            plt.subplot(len(categories), num_images, row * num_images + col + 1)
            plt.imshow(img)
            plt.title(f"{category}")
            plt.axis('off')

    plt.tight_layout()
    plt.show()

# Display sample images
display_sample_images(train_dir, num_images=4)

# Image Data Generator

In [None]:
# Image Data Generator: only rescales pixel values to [0, 1], no augmentation
datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by all three directory iterators
_flow_options = dict(
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical',
)

# One generator per split directory
train_generator = datagen.flow_from_directory(train_dir, **_flow_options)

validation_generator = datagen.flow_from_directory(validation_dir, **_flow_options)

# Keep the test order fixed so predictions line up with `.classes`
test_generator = datagen.flow_from_directory(test_dir, shuffle=False, **_flow_options)

# CNN Model

## Build Model

In [None]:
# Start from a clean Keras session
tf.keras.backend.clear_session()

# Build the CNN: four Conv2D(3x3, relu) + MaxPooling2D(2, 2) blocks with
# 32, 64, 128 and 256 filters, followed by a dense classification head
model = Sequential()
for block_index, n_filters in enumerate((32, 64, 128, 256)):
    # Only the very first layer declares the input shape (128x128 RGB)
    first_layer_kwargs = {'input_shape': (128, 128, 3)} if block_index == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), activation='relu', **first_layer_kwargs))
    model.add(MaxPooling2D(2, 2))

# Flatten and Dense Layers
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(4, activation='softmax'))  # 4 output classes

# Compile the model
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Show model summary
model.summary()

## Callbacks

In [None]:
# Stop training once validation loss has not improved for 5 epochs in a row
# and restore the weights from the best epoch
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

## Model Training

In [None]:
# Train for at most 50 epochs, validating after each one; early stopping may
# end the run sooner
fit_options = {
    'epochs': 50,
    'validation_data': validation_generator,
    'callbacks': [early_stopping],
}
history = model.fit(train_generator, **fit_options)

## Model Evaluation

In [None]:
# Evaluate on the held-out test generator; the model was compiled with a
# single metric, so the result is [loss, accuracy]
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test Loss: {loss:.4f}")

# Model Prediction and Testing

## Training and Validation

In [None]:
# Training curves: accuracy on the left, loss on the right
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Plot training & validation accuracy values
acc_ax.plot(history.history['accuracy'])
acc_ax.plot(history.history['val_accuracy'])
acc_ax.set_title('Model accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
# The second curve comes from the validation generator, not the test set
acc_ax.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
loss_ax.plot(history.history['loss'])
loss_ax.plot(history.history['val_loss'])
loss_ax.set_title('Model loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train', 'Validation'], loc='upper left')

plt.show()

## Make Prediction

In [None]:
# Class names as registered by the test generator
class_names = list(test_generator.class_indices)

# Ground-truth class indices; the test generator was created with
# shuffle=False so they line up with the prediction rows
y_true = test_generator.classes

# Per-class probabilities for every test image, and the most likely class
y_pred_prob = model.predict(test_generator)
y_pred = y_pred_prob.argmax(axis=1)

## Classification Report

In [None]:
# Per-class precision / recall / F1 on the test set
report = classification_report(
    y_true=y_true,
    y_pred=y_pred,
    target_names=class_names,
)
print("Classification Report:\n", report)

## Confusion Matrix

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true, y_pred)

# Draw it as an annotated heatmap
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix')
plt.show()

## ROC Curve

In [None]:
# Class names ordered by their index so legend entries show the tumor type
# instead of an opaque class number
class_names = sorted(test_generator.class_indices, key=test_generator.class_indices.get)

# Binarize the true labels for multi-class (one-vs-rest) ROC
y_true_binarized = label_binarize(y_true, classes=np.arange(test_generator.num_classes))

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(test_generator.num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true_binarized[:, i], y_pred_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot the ROC curves
plt.figure(figsize=(10, 8))
for i in range(test_generator.num_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'.format(class_names[i], roc_auc[i]))

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class')
plt.legend(loc="lower right")
plt.show()

## Precision-Recall Curve

In [None]:
# Class names ordered by their index so legend entries show the tumor type
# instead of an opaque class number
class_names = sorted(test_generator.class_indices, key=test_generator.class_indices.get)

# Binarize the true labels for multi-class (one-vs-rest) Precision-Recall
y_true_binarized = label_binarize(y_true, classes=np.arange(test_generator.num_classes))

# Compute Precision-Recall curve and average precision for each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(test_generator.num_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_true_binarized[:, i], y_pred_prob[:, i])
    average_precision[i] = average_precision_score(y_true_binarized[:, i], y_pred_prob[:, i])

# Plot the Precision-Recall curves
plt.figure(figsize=(10, 8))
for i in range(test_generator.num_classes):
    plt.plot(recall[i], precision[i], label='Precision-Recall curve of class {0} (area = {1:0.2f})'.format(class_names[i], average_precision[i]))

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve for multi-class')
plt.legend(loc="lower left")
plt.show()

## Testing on one image from each class



In [None]:
# Function to load and preprocess a single image
def load_and_preprocess_image(img_path, target_size=(128, 128)):
    """Load one image from disk and turn it into a single-item model batch.

    Args:
        img_path: Path of the image file to read.
        target_size: Size handed to `cv2.resize`, which interprets it as
            (width, height).

    Returns:
        Array with a leading batch axis of length 1 holding the RGB image
        resized to `target_size` and scaled by 1/255.

    Raises:
        ValueError: If OpenCV cannot read or decode `img_path`.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising for missing/corrupt files
        raise ValueError(f"Could not read image: {img_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    img = cv2.resize(img, target_size)  # Resize
    img = img / 255.0  # Rescale, matching the generators' rescale=1./255
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    return img

# Get class names (index -> name mapping used to decode predictions)
class_names = list(test_generator.class_indices.keys())

# Two-column grid with as many rows as the number of categories requires
n_cols = 2
n_rows = (len(categories) + n_cols - 1) // n_cols

# Test on one image from each class
plt.figure(figsize=(10, 10))
for i, category in enumerate(categories):
    category_path = os.path.join(test_dir, category)
    # Take the alphabetically first file so the same sample is used every run;
    # a missing category folder is treated like an empty one
    image_files = []
    if os.path.isdir(category_path):
        image_files = sorted(
            f for f in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, f))
        )
    if image_files:
        img_file = image_files[0]
        img_path = os.path.join(category_path, img_file)

        # Load and preprocess the image
        processed_img = load_and_preprocess_image(img_path)

        # Make a prediction
        predictions = model.predict(processed_img)
        predicted_class_index = np.argmax(predictions)
        predicted_class_name = class_names[predicted_class_index]

        # Display the exact (resized, rescaled RGB) image that was fed to the model
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(processed_img[0])
        plt.title(f"Actual: {category}\nPredicted: {predicted_class_name}")
        plt.axis('off')
    else:
        print(f"No images found for category: {category} in the test set.")

plt.tight_layout()
plt.show()

# Save Model

In [None]:
# Persist the trained model; the file name embeds the test accuracy (in
# percent, two decimals) computed by the evaluation cell
model_filename = f'brain_tumor_mri_classification_model_acc_{accuracy * 100:.2f}%.keras'
model.save(model_filename)