# Chest X-Ray Multi‑Class Project — Role Notebook

**Dataset:** Kaggle “Lungs Disease Dataset (4 types)” by Omkar Manohar Dalvi  
**Classes:** Normal, Bacterial Pneumonia, Viral Pneumonia, COVID‑19, Tuberculosis

> Use this notebook in **Google Colab**. If you’re running locally, adapt the Drive mount steps accordingly.

## Role — Member 4: Training & Evaluation

**Responsibilities**  
- Train models with callbacks (early stopping, LR schedule, checkpoints)  
- Track metrics & plots  
- Evaluate on test set; produce confusion matrix, classification report, ROC‑AUC (one‑vs‑rest)

## Environment & Paths

- The code below mounts Google Drive (for persistence) and prepares base paths.  
- Set `DATASET_DIR` to where the extracted dataset resides (after Kaggle download).

## Training Plan

1. Load the JSON architecture exported by Member 3  
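2. (Re)build the model from that JSON with the same weights configuration as Member 3 (the JSON stores only the architecture, not the weights)  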
3. Train with frozen backbone, then fine‑tune top layers  
4. Save the best model and training history

In [None]:
# === Colab & Paths ===
import os, sys, glob, json, random, shutil, time
from pathlib import Path

# Mount Google Drive when running inside Colab. Anywhere else the import (or
# the mount itself) raises; the error is printed and the notebook carries on
# with IN_COLAB set to False.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
except Exception as mount_error:
    print("Not running on Colab or Drive not available:", mount_error)
    IN_COLAB = False
else:
    IN_COLAB = True

# Everything the project reads or writes lives under this Drive folder
# (change it here if your layout differs).
PROJECT_ROOT = Path('/content/drive/MyDrive/Chest_XRay_Project')
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# DATASET_DIR is where the downloaded & unzipped dataset is expected; it is
# not created here. The three output folders are created on demand below.
DATASET_DIR = PROJECT_ROOT.joinpath('lungs_dataset')
OUTPUTS_DIR = PROJECT_ROOT.joinpath('outputs')
MODELS_DIR = PROJECT_ROOT.joinpath('models')
REPORTS_DIR = PROJECT_ROOT.joinpath('reports')

for out_dir in (OUTPUTS_DIR, MODELS_DIR, REPORTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong path is spotted before training.
for label, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("DATASET_DIR :", DATASET_DIR),
    ("OUTPUTS_DIR :", OUTPUTS_DIR),
    ("MODELS_DIR  :", MODELS_DIR),
    ("REPORTS_DIR :", REPORTS_DIR),
):
    print(label, location)

In [None]:
import tensorflow as tf, json, numpy as np, matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import itertools

import warnings

# Load the class list stored in the project root. Its order is what the
# classification report further down uses to name label indices.
with open(PROJECT_ROOT / 'classes.json', encoding='utf-8') as f:
    CLASS_NAMES = json.load(f)
NUM_CLASSES = len(CLASS_NAMES)

IMG_SIZE = (224,224)
BATCH_SIZE = 32
SEED = 42

# Rebuild datasets (must match Member 2 preprocessing).
# Only the training split is shuffled; val/test keep directory order so that
# predictions can be aligned with labels during evaluation.
train_raw = tf.keras.utils.image_dataset_from_directory(
    str(DATASET_DIR / 'train'), label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, seed=SEED)
val_raw = tf.keras.utils.image_dataset_from_directory(
    str(DATASET_DIR / 'val'), label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)
test_raw = tf.keras.utils.image_dataset_from_directory(
    str(DATASET_DIR / 'test'), label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

# image_dataset_from_directory derives label indices from each split's
# sub-directory names, independently of classes.json. If the two disagree
# (different order, or a class folder missing from one split) the one-hot
# columns no longer line up with CLASS_NAMES and every per-class metric would
# be attributed to the wrong class, so make the mismatch visible.
for split_name, split_ds in (('train', train_raw), ('val', val_raw), ('test', test_raw)):
    inferred_names = list(split_ds.class_names)
    if inferred_names != list(CLASS_NAMES):
        warnings.warn(
            f"Class order mismatch in '{split_name}' split: directories give "
            f"{inferred_names} but classes.json gives {list(CLASS_NAMES)}; "
            "reported class names may not match label indices.")

# Scale pixel values from [0, 255] down by 1/255.
normalizer = tf.keras.layers.Rescaling(1./255)

# Light augmentation, applied to the training split only.
data_augment = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.05),
    tf.keras.layers.RandomZoom(0.05),
    tf.keras.layers.RandomContrast(0.1),
])

AUTOTUNE = tf.data.AUTOTUNE
# Run the per-batch preprocessing in parallel and overlap it with training.
train_ds = train_raw.map(
    lambda x,y: (data_augment(normalizer(x), training=True), y),
    num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)
val_ds   = val_raw.map(lambda x,y: (normalizer(x), y), num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)
test_ds  = test_raw.map(lambda x,y: (normalizer(x), y), num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)

# Load architecture and instantiate model (EfficientNetB0 backbone as default)
from tensorflow.keras import models
try:
    with open(MODELS_DIR / 'model_for_training.json') as f:
        # NOTE: a JSON export carries the architecture only, so the model
        # comes back without any previously trained/pretrained weights.
        model = models.model_from_json(f.read())
except Exception as e:
    print("Falling back to building a fresh EfficientNetB0:", e)
    base = tf.keras.applications.EfficientNetB0(include_top=False, input_shape=(224,224,3), weights='imagenet')
    base.trainable = False
    inputs = tf.keras.Input(shape=(224,224,3))
    # The datasets are rescaled by 1/255 upstream, but Keras' EfficientNet
    # ships its own input preprocessing and expects raw [0, 255] pixels.
    # Undo the rescale here so the ImageNet weights see the range they were
    # trained on, without touching the shared input pipeline.
    x = tf.keras.layers.Rescaling(255.0)(inputs)
    # training=False keeps the frozen backbone's BatchNorm in inference mode.
    x = base(x, training=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=out, name='EffB0_TL')

model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# All three callbacks watch val_loss: stop after 5 stagnant epochs (keeping
# the best weights), cut the LR by 5x after 3, and checkpoint the best model.
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss'),
    tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.2, monitor='val_loss'),
    tf.keras.callbacks.ModelCheckpoint(filepath=str(MODELS_DIR / 'best_model.keras'),
                                       save_best_only=True, monitor='val_loss')
]

history = model.fit(train_ds, validation_data=val_ds, epochs=30, callbacks=callbacks)

# Persist the per-epoch metrics next to the other outputs (step 4 of the
# training plan). Values are cast to float because callbacks may log NumPy
# scalars, which json cannot serialise.
history_path = OUTPUTS_DIR / 'training_history.json'
with open(history_path, 'w') as f:
    json.dump({name: [float(v) for v in values]
               for name, values in history.history.items()}, f, indent=2)
print("Saved training history to", history_path)

# Plot training curves with matplotlib only: one figure per metric, each
# showing the training series against its validation counterpart.
curve_specs = (
    # (history key, legend suffix, title / y-axis text)
    ('accuracy', 'acc', 'Accuracy'),
    ('loss', 'loss', 'Loss'),
)
for metric_key, legend_suffix, display_name in curve_specs:
    plt.figure()
    plt.plot(history.history[metric_key], label=f'train_{legend_suffix}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'val_{legend_suffix}')
    plt.title(f'{display_name} over epochs')
    plt.xlabel('Epoch')
    plt.ylabel(display_name)
    plt.legend()
    plt.grid(True)
    plt.show()

# Evaluate on test set: collect one-hot labels and predicted probabilities
# batch by batch so the two stay aligned row for row.
y_true = []
y_prob = []
for x, y in test_ds:
    y_true.append(y.numpy())
    # Call the model directly rather than model.predict(): predict() sets up
    # its own data pipeline on every call, which is wasteful when invoked
    # once per batch inside a Python loop.
    y_prob.append(model(x, training=False).numpy())

y_true = np.vstack(y_true)
y_prob = np.vstack(y_prob)
y_pred = np.argmax(y_prob, axis=1)
y_true_idx = np.argmax(y_true, axis=1)

# Pass the full label set explicitly so the matrix is always
# NUM_CLASSES x NUM_CLASSES and target_names stays valid even if some class
# never occurs in the test labels or predictions.
label_ids = list(range(NUM_CLASSES))

# Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_true_idx, y_pred, labels=label_ids)
print("Confusion Matrix:")
print(cm)

# Build the report once and reuse it for both the console and the file.
report = classification_report(y_true_idx, y_pred, labels=label_ids, target_names=CLASS_NAMES)
print("Classification Report:")
print(report)

# ROC-AUC (one-vs-rest). Start from None so a failed computation is recorded
# as null instead of silently reusing a value left over from an earlier run
# of this cell.
auc_macro = None
try:
    auc_macro = float(roc_auc_score(y_true, y_prob, multi_class='ovr', average='macro'))
    print("Macro ROC-AUC:", auc_macro)
except Exception as e:
    # Best effort: e.g. a class without test samples makes AUC undefined.
    print("ROC-AUC computation issue:", e)

# Save artifacts
np.save(OUTPUTS_DIR / 'cm.npy', cm)
with open(OUTPUTS_DIR / 'classification_report.txt', 'w') as f:
    f.write(report)
with open(OUTPUTS_DIR / 'metrics.json', 'w') as f:
    json.dump({"roc_auc_macro": auc_macro}, f, indent=2)

print("Saved confusion matrix and metrics to", OUTPUTS_DIR)