In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from google.colab import drive
import os

# Download the data directly via TensorFlow Datasets (TFDS)

In [None]:
# --- 1. Automated Data Loading ---
# tfds.load returns the 'train' split of the 'malaria' dataset as
# (image, label) pairs (as_supervised=True) plus the dataset metadata
# object (with_info=True), which later cells use for sizes and label names.
print("Downloading and loading Malaria dataset... (approx 300MB)")
dataset, info = tfds.load(
    name='malaria',
    split='train',
    with_info=True,
    as_supervised=True,
)


# Visualize Raw Data

In [None]:
# `dataset` and `info` are already populated by the data-loading cell above,
# so calling tfds.load() a second time here would only rebuild an identical
# pipeline and rebind the same two names. Summarize what was loaded instead.
num_examples = info.splits['train'].num_examples
class_names = info.features['label'].names
print(f"Loaded {num_examples} examples across classes: {class_names}")

In [None]:
# Label-name lookup: converts an integer class id into its string name using
# the dataset metadata (notebook convention: 0 -> Parasitized, 1 -> Uninfected).
label_feature = info.features['label']
get_label_name = label_feature.int2str

In [None]:
# --- VISUALIZATION 1: Raw Data (Before Preprocessing) ---
# Renders a grid of sample images straight from the unprocessed dataset;
# the returned figure handle is kept in `fig`.
print("\n--- Visualizing Raw Data ---")
fig = tfds.show_examples(
    dataset,
    info,
)
plt.show()

# Train - Validation Split

In [None]:
# Split sizes: 80% of the examples go to training, the remainder to validation.
# TFDS exposes this dataset as a single 'train' split, so the boundary is
# computed here from the example count recorded in the metadata.
data_size = info.splits['train'].num_examples
train_size = int(data_size * 0.8)  # truncated to a whole number of examples

In [None]:
# First `train_size` examples form the training set; everything after that
# boundary forms the validation set.
train_ds, val_ds = dataset.take(train_size), dataset.skip(train_size)

# Data Preprocessing

In [None]:
# --- 2. Advanced Preprocessing Pipeline ---
def preprocess(image, label):
    """Resize one image to 128x128 and rescale its pixel values by 1/255.

    Args:
        image: Image tensor handed to ``tf.image.resize``.
        label: Label paired with the image; passed through unchanged.

    Returns:
        An ``(image, label)`` tuple holding the resized, rescaled image and
        the original label.
    """
    target_size = (128, 128)  # fixed spatial size fed to the network
    resized = tf.image.resize(image, target_size)
    # Dividing by 255.0 maps pixel intensities into [0, 1]
    # (assumes 8-bit input images -- TODO confirm against the dataset).
    return resized / 255.0, label

In [None]:
# Optimize for performance (Data Pipeline)
BATCH_SIZE = 32

# Both pipelines are derived from the raw `dataset` and `train_size` rather
# than from `train_ds`/`val_ds` themselves. A self-referential
# `train_ds = train_ds.map(...)` would, when this cell is re-run, apply
# `preprocess` (resize + /255 scaling) a second time and batch already-batched
# data; building from the raw source keeps the cell idempotent.
#
# Stage order: map (parallel preprocessing) -> cache (preprocess only once)
# -> shuffle (training only, reshuffled after the cache) -> batch -> prefetch.
train_ds = (
    dataset.take(train_size)
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation data is never shuffled, so its iteration order stays fixed.
val_ds = (
    dataset.skip(train_size)
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Visualize Pre-processed Data

In [None]:
# --- VISUALIZATION 2: Preprocessed Data (What the AI sees) ---
print("\n--- Visualizing Preprocessed Data (128x128) ---")

# Pull a single batch from the training pipeline.
batch_iterator = iter(train_ds)
image_batch, label_batch = next(batch_iterator)

# Show the first nine images of that batch in a 3x3 grid, each titled with
# its class name.
grid_fig, grid_axes = plt.subplots(3, 3, figsize=(10, 10))
for idx, ax in enumerate(grid_axes.flat):
    ax.imshow(image_batch[idx])
    ax.set_title(f"{get_label_name(label_batch[idx])} (Norm)")
    ax.axis("off")
plt.show()

# Create CNN Architecture

In [None]:
# --- 3. "Bio-Medical" CNN Architecture ---
# Small convolutional classifier: on-the-fly augmentation, three
# Conv2D + MaxPooling2D blocks, then a dense head with a single sigmoid unit.
model = models.Sequential([
    # Input specification. `layers.Input(shape=...)` is used instead of
    # `layers.InputLayer(input_shape=...)`: the `input_shape` argument is
    # deprecated in Keras 3, while `Input(shape=...)` is accepted as the first
    # entry of a Sequential model by both Keras 2 and Keras 3.
    layers.Input(shape=(128, 128, 3)),

    # Data augmentation: random horizontal/vertical flips and random rotations
    # (factor 0.2) to make the model less sensitive to cell orientation.
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomRotation(0.2),

    # Block 1
    layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),

    # Block 2 (deeper features)
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),

    # Block 3
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),

    # Classification head
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),  # Regularization
    # Single sigmoid output for binary classification
    # (Parasitized vs Uninfected).
    layers.Dense(1, activation='sigmoid'),
])

# Compile model and Summary

In [None]:
# --- 4. Compile & Train ---
# Binary cross-entropy pairs with the model's single sigmoid output; accuracy
# is tracked as the reporting metric. The summary prints the layer table.
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)
model.summary()

# Train CNN

In [None]:
# Fit on the training pipeline and evaluate on the validation pipeline after
# every epoch; the returned History object feeds the curves plotted later.
# NOTE(review): the author states 6 epochs is enough for >90% accuracy --
# not verified here.
print("Training Medical AI Model...")
history = model.fit(
    x=train_ds,
    epochs=6,
    validation_data=val_ds,
)

# Model Evaluation - Accuracy/Loss Graphs, Confusion Matrix, Heatmap, Classification Report

In [None]:
# --- Visualization: Training vs Validation Curves ---
# The figure and both panels are built in ONE cell. With the inline backend a
# figure is rendered and closed when its cell finishes, so creating the figure
# in one cell and drawing the subplots in later cells yields an empty 12x5
# figure followed by separate default-size figures instead of one side-by-side
# plot.
print("\n--- Generating Performance Graphs ---")
curves_fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Accuracy plot (left panel): one point per epoch from the fit history.
acc_ax.plot(history.history['accuracy'], label='Training Acc')
acc_ax.plot(history.history['val_accuracy'], label='Validation Acc')
acc_ax.set_title('Model Accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend()
acc_ax.grid(True)

# Loss plot (right panel).
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend()
loss_ax.grid(True)

plt.show()

In [None]:
# --- Advanced Evaluation: Confusion Matrix & F1 Score ---
print("\n--- Calculating Advanced Metrics (F1, Confusion Matrix) ---")

# Collect true labels and predicted probabilities in a single pass over the
# validation set so every prediction stays paired with its own label.
# The model is called directly (training=False) instead of using
# model.predict() once per batch: predict() sets up its own data-handling
# loop on every call and is documented as the slower choice for inputs that
# already fit in one batch.
label_batches = []
prob_batches = []
for images, labels in val_ds:
    label_batches.append(labels.numpy())
    batch_probs = model(images, training=False)
    prob_batches.append(batch_probs.numpy().ravel())

y_true = np.concatenate(label_batches)
y_pred_probs = np.concatenate(prob_batches)
# Convert sigmoid probabilities to a binary class (0 or 1) at a 0.5 threshold.
y_pred = (y_pred_probs > 0.5).astype(int)

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred)
# NOTE(review): f1_score uses pos_label=1 by default, which under this
# notebook's label convention (0: Parasitized, 1: Uninfected) is the F1 of the
# Uninfected class. Pass pos_label=0 if the infected class should be treated
# as the positive class -- confirm the intended metric.
f1 = f1_score(y_true, y_pred)

In [None]:
# Plot Heatmap
# Annotated confusion-matrix heatmap; the same class labels are used for both
# axes and the F1 score is shown in the title.
class_labels = ['Parasitized', 'Uninfected']
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title(f'Confusion Matrix (F1 Score: {f1:.2f})')
plt.show()

In [None]:
# Per-class precision / recall / F1 table for the validation predictions.
print("\n--- Detailed Classification Report ---")
report_text = classification_report(
    y_true,
    y_pred,
    target_names=['Parasitized', 'Uninfected'],
)
print(report_text)

# Save Model

In [None]:
# --- 5. Save Model to Google Drive ---
# `drive` and `os` are imported in the notebook's first (imports) cell, so
# they are not re-imported here.
drive.mount('/content/drive')

# Create the target directory if it doesn't exist. exist_ok=True makes this a
# no-op on later runs and removes the separate check-then-create step.
save_path = '/content/drive/My Drive/Colab_Models/Malaria_CNN'
os.makedirs(save_path, exist_ok=True)

# `model_file` is reused by the load-and-test cell that follows.
model_file = os.path.join(save_path, 'malaria_cnn.h5')
model.save(model_file)
print(f"Model saved successfully at: {model_file}")

# Test the model after loading it from the drive

In [None]:
# --- 6. Load and Test Model ---
# Reload the model from the file written by the save cell to confirm that the
# saved artifact can actually be used for inference.
loaded_model = tf.keras.models.load_model(model_file)

# Test on a single batch from the validation set.
test_images, test_labels = next(iter(val_ds.take(1)))
predictions = loaded_model.predict(test_images)
# Threshold the sigmoid outputs at 0.5 to obtain class ids (0 or 1).
predicted_classes = (predictions > 0.5).astype(int).flatten()

print("\n--- Model Testing Results ---")
for i in range(min(5, len(test_labels))):
    # Both names come from the same dataset-metadata lookup, so the "True" and
    # "Predicted" columns always use identical spelling and the id -> name
    # mapping cannot drift from the dataset's own definition.
    true_label = get_label_name(int(test_labels[i]))
    pred_label = get_label_name(int(predicted_classes[i]))
    # The sigmoid output is the probability of class 1; confidence is the
    # probability assigned to whichever class was actually predicted.
    prob_class_one = float(predictions[i][0])
    confidence = prob_class_one if predicted_classes[i] == 1 else 1 - prob_class_one
    print(f"Image {i+1}: True: {true_label}, Predicted: {pred_label}, Confidence: {confidence:.2f}")
