# 🎯 Chest X-Ray Multi-CNN Ensemble Strategy - Target 90%

**Current Best**: ResNet18 = 82.322%  
**Target**: 90%+ via ensemble

## 📋 Strategy:

**Why abandon ViT?**
- ViT (80.3% → 82.6%) failed to meaningfully beat ResNet18 (82.3%)
- 86M parameters too large for 3780 training samples
- Medical imaging favors **local features** (CNN) over global (Transformer)

**New approach: 4-Model CNN Ensemble**
1. ResNet18 (11.7M) - 82.3% ✅ proven
2. ResNet50 (25.6M) - deeper representation
3. DenseNet121 (8.1M) - excellent for medical imaging
4. MobileNetV2 (3.4M) - lightweight, low overfitting

**Expected**: Individual 83-86% → Ensemble **88-90%** 🎯

---

## ⏱️ Time Required:

- Setup: 5-10 minutes
- Training (3 models): 90-120 minutes (A100)
- Ensemble: 5 minutes
- **Total**: ~2-2.5 hours

---

## 🔧 Before You Start:

1. Runtime → Change runtime type → **A100 GPU**
2. Get Kaggle API key from https://www.kaggle.com/settings
3. Join competition: https://www.kaggle.com/competitions/cxr-multi-label-classification
4. Click "Run all"

## Step 1: Setup (GPU + Clone + Install)

Combines GPU verification, repository cloning, and dependency installation.

In [None]:
import torch
import os
import shutil

# ========== GPU Verification ==========
print("=" * 60)
print("STEP 1: SETUP")
print("=" * 60)

if not torch.cuda.is_available():
    print("\n‚ùå NO GPU! Please enable GPU: Runtime ‚Üí Change runtime type ‚Üí GPU")
    raise Exception("GPU required")

gpu_name = torch.cuda.get_device_name(0)
print(f"\n‚úÖ GPU: {gpu_name}")
print(f"‚úÖ Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# ========== Clone Repository ==========
print("\n[1/3] Cloning repository...")
%cd /content
if os.path.exists('nycu-CSIC30014-LAB3'):
    shutil.rmtree('nycu-CSIC30014-LAB3')

!git clone -q https://github.com/thc1006/nycu-CSIC30014-LAB3.git
%cd nycu-CSIC30014-LAB3
print("‚úÖ Repository cloned")

# ========== Install Dependencies ==========
print("\n[2/3] Installing dependencies...")
!pip install -q torch torchvision --index-url https://download.pytorch.org/whl/cu121
!pip install -q numpy pandas scikit-learn matplotlib tqdm pyyaml opencv-python kaggle
print("‚úÖ Dependencies installed")

# ========== Kaggle Setup ==========
print("\n[3/3] Setting up Kaggle API...")
from google.colab import files as colab_files
from pathlib import Path

print("Please upload your kaggle.json:")
uploaded = colab_files.upload()

if 'kaggle.json' not in uploaded:
    raise Exception("Please upload kaggle.json")

kaggle_dir = Path.home() / '.kaggle'
kaggle_dir.mkdir(exist_ok=True)
kaggle_json = kaggle_dir / 'kaggle.json'
with open(kaggle_json, 'wb') as f:
    f.write(uploaded['kaggle.json'])
os.chmod(kaggle_json, 0o600)

print("\n‚úÖ Setup complete!")
print("=" * 60)

## Step 2: Download & Organize Data

In [None]:
import zipfile
import subprocess
import pandas as pd
from tqdm.auto import tqdm

# Step 2 of the notebook: download the competition archive with the Kaggle CLI,
# extract it, move the CSVs into data/, and sort images into per-split folders
# according to the `new_filename` column of each split CSV.
print("=" * 60)
print("STEP 2: DATA PREPARATION")
print("=" * 60)

# Download
print("\n[1/2] Downloading competition data...")
result = subprocess.run(
    ['kaggle', 'competitions', 'download', '-c', 'cxr-multi-label-classification'],
    capture_output=True, text=True
)

if result.returncode != 0:
    # Inspect both streams so the diagnostic is not lost if the CLI writes it
    # to stdout instead of stderr.
    cli_output = f"{result.stderr}\n{result.stdout}".strip()
    if '403' in cli_output:
        print("‚ùå You need to join the competition first!")
        print("Visit: https://www.kaggle.com/competitions/cxr-multi-label-classification")
        raise RuntimeError("Join competition")
    raise RuntimeError(f"Download failed: {cli_output}")

# Extract every archive in the working directory, then delete it.
for zip_file in [f for f in os.listdir('.') if f.endswith('.zip')]:
    with zipfile.ZipFile(zip_file, 'r') as zf:
        for file in tqdm(zf.namelist(), desc=f"Extracting {zip_file}", leave=False):
            zf.extract(file, '.')
    os.remove(zip_file)

print("‚úÖ Data downloaded & extracted")

# Organize
print("\n[2/2] Organizing images...")

# Collect all images: file name -> first location where it was seen.
all_images = {}
for search_dir in ['.', 'train_images', 'val_images', 'test_images']:
    if os.path.exists(search_dir):
        for fname in os.listdir(search_dir):
            if fname.endswith(('.jpg', '.jpeg', '.png')):
                if fname not in all_images:
                    all_images[fname] = os.path.join(search_dir, fname)

# Move CSVs to data/
os.makedirs('data', exist_ok=True)
for csv in ['train_data.csv', 'val_data.csv', 'test_data.csv']:
    if os.path.exists(csv) and not os.path.exists(f'data/{csv}'):
        shutil.move(csv, f'data/{csv}')

# Organize by split: split name -> (CSV listing its files, destination folder)
splits = {
    'train': ('data/train_data.csv', 'train_images'),
    'val': ('data/val_data.csv', 'val_images'),
    'test': ('data/test_data.csv', 'test_images')
}

# Number of images listed in each split CSV that could not be put in place.
missing_by_split = {}

for split_name, (csv_path, target_dir) in splits.items():
    if not os.path.exists(csv_path):
        continue

    df = pd.read_csv(csv_path)
    needed_files = set(df['new_filename'].values)

    os.makedirs(target_dir, exist_ok=True)

    missing = 0
    for fname in tqdm(needed_files, desc=f"{split_name.upper()}", leave=False):
        target_path = os.path.join(target_dir, fname)
        if os.path.exists(target_path):
            continue  # already in place

        source_path = all_images.get(fname)
        if source_path is None or os.path.abspath(source_path) == os.path.abspath(target_path):
            # No other known location to fetch the file from.
            missing += 1
            continue

        try:
            shutil.move(source_path, target_path)
        except FileNotFoundError:
            # The recorded source is gone (e.g. already moved for another
            # split); keep going, but count it so it shows up in the report.
            missing += 1

    missing_by_split[split_name] = missing

# Verify
print("\nVerification:")
for split_name, (csv_path, target_dir) in splits.items():
    if os.path.exists(target_dir):
        count = len([f for f in os.listdir(target_dir) if f.endswith(('.jpg', '.jpeg', '.png'))])
        print(f"  {target_dir}: {count} images")
    if missing_by_split.get(split_name):
        print(f"  [WARN] {split_name}: {missing_by_split[split_name]} image(s) listed in {csv_path} were not found")

print("\n‚úÖ Data organized")
print("=" * 60)

## Step 3: Train Model 1 - ResNet50 (~40 min)

In [None]:
print("=" * 60)
print("STEP 3: TRAIN RESNET50")
print("=" * 60)
print("\nModel: ResNet50 (25.6M params)")
print("Expected: 84-86% | Time: ~40 min (A100)\n")

# Auto-adjust for T4
if 'T4' in torch.cuda.get_device_name(0):
    !sed -i 's/batch_size: 24/batch_size: 16/g' configs/colab_resnet50.yaml
    print("[INFO] T4 detected: batch_size 24 ‚Üí 16")

!python -m src.train_v2 --config configs/colab_resnet50.yaml

print("\n‚úÖ ResNet50 training complete")
print("=" * 60)

## Step 4: Train Model 2 - DenseNet121 (~40 min)

In [None]:
print("=" * 60)
print("STEP 4: TRAIN DENSENET121")
print("=" * 60)
print("\nModel: DenseNet121 (8.1M params)")
print("Expected: 84-86% | Time: ~40 min (A100)\n")

!python -m src.train_v2 --config configs/colab_densenet121.yaml

print("\n‚úÖ DenseNet121 training complete")
print("=" * 60)

## Step 5: Train Model 3 - MobileNetV2 (~35 min)

In [None]:
print("=" * 60)
print("STEP 5: TRAIN MOBILENETV2")
print("=" * 60)
print("\nModel: MobileNetV2 (3.4M params - lightweight!)")
print("Expected: 83-85% | Time: ~35 min (A100)\n")

!python -m src.train_v2 --config configs/colab_mobilenetv2.yaml

print("\n‚úÖ MobileNetV2 training complete")
print("=" * 60)

## Step 6: Generate TTA Predictions (All Models)

In [None]:
print("=" * 60)
print("STEP 6: GENERATE TTA PREDICTIONS")
print("=" * 60)

models = [
    ('ResNet50', 'configs/colab_resnet50.yaml', 'outputs/colab_resnet50/best.pt'),
    ('DenseNet121', 'configs/colab_densenet121.yaml', 'outputs/colab_densenet121/best.pt'),
    ('MobileNetV2', 'configs/colab_mobilenetv2.yaml', 'outputs/colab_mobilenetv2/best.pt')
]

for model_name, config, ckpt in models:
    print(f"\nGenerating TTA predictions for {model_name}...")
    !python -m src.tta_predict --config {config} --ckpt {ckpt}
    
    # Rename output based on config's submission_path
    import yaml
    with open(config, 'r') as f:
        cfg = yaml.safe_load(f)
    
    submission_path = cfg['out']['submission_path']
    
    if os.path.exists('submission_tta.csv'):
        shutil.move('submission_tta.csv', submission_path)
        print(f"‚úÖ Saved to {submission_path}")

print("\n‚úÖ All TTA predictions generated")
print("=" * 60)

## Step 7: Create 4-Model Ensemble 🎯

In [None]:
import pandas as pd
import numpy as np

# Step 7 of the notebook: combine the per-model prediction CSVs into a simple
# average and a weighted average ensemble, convert each to one-hot class
# predictions, and write them as submission CSVs. The ResNet18 baseline
# (data/submission.csv) is included only when that file exists.


def _check_row_alignment(reference, other, other_name, score_cols):
    """Raise ValueError unless `other` lists the same rows, in the same order, as `reference`.

    Scores are combined row by row, so every frame must have the same length and
    identical values in the non-score columns that both frames share.
    """
    if len(other) != len(reference):
        raise ValueError(
            f"{other_name} predictions have {len(other)} rows, expected {len(reference)}"
        )
    shared_cols = [c for c in reference.columns if c not in score_cols and c in other.columns]
    if shared_cols and not np.array_equal(
        reference[shared_cols].astype(str).values, other[shared_cols].astype(str).values
    ):
        raise ValueError(
            f"{other_name} predictions are not row-aligned with the first model on {shared_cols}"
        )


def _to_one_hot(template, scores, score_cols):
    """Return a copy of `template` whose `score_cols` hold the one-hot argmax of `scores`."""
    winners = scores.argmax(axis=1)
    one_hot = template.copy()
    one_hot[score_cols] = np.eye(len(score_cols))[winners]
    return one_hot


print("=" * 60)
print("STEP 7: CREATE ENSEMBLE")
print("=" * 60)

# Load all predictions
print("\nLoading predictions...")

# Check if ResNet18 baseline exists, if not skip it
use_resnet18 = os.path.exists('data/submission.csv')

if use_resnet18:
    pred_resnet18 = pd.read_csv('data/submission.csv')
    print("‚úÖ ResNet18 baseline found (82.3%)")
else:
    print("‚ö†Ô∏è  ResNet18 baseline not found, using 3-model ensemble")

pred_resnet50 = pd.read_csv('data/submission_resnet50.csv')
pred_densenet = pd.read_csv('data/submission_densenet121.csv')
pred_mobilenet = pd.read_csv('data/submission_mobilenetv2.csv')

prob_cols = ['normal', 'bacteria', 'virus', 'COVID-19']

# (display name, predictions, raw weight). The raw weights are normalised below.
# NOTE(review): they look like approximate per-model accuracies - confirm.
members = [
    ('ResNet50', pred_resnet50, 0.85),
    ('DenseNet', pred_densenet, 0.85),
    ('MobileNet', pred_mobilenet, 0.84),
]
if use_resnet18:
    members.insert(0, ('ResNet18', pred_resnet18, 0.82))

# The first member supplies the non-score columns of the output files.
base_pred = members[0][1]
for member_name, member_pred, _ in members[1:]:
    _check_row_alignment(base_pred, member_pred, member_name, prob_cols)

n_models = len(members)
member_scores = [member_pred[prob_cols].values for _, member_pred, _ in members]

print(f"\n[1/2] Creating {n_models}-model simple average ensemble...")
ensemble_simple = _to_one_hot(base_pred, sum(member_scores) / float(n_models), prob_cols)
ensemble_simple.to_csv(f'submission_ensemble_{n_models}way_simple.csv', index=False)
print(f"‚úÖ submission_ensemble_{n_models}way_simple.csv")

print(f"\n[2/2] Creating {n_models}-model weighted average ensemble...")
weights = np.array([raw_weight for _, _, raw_weight in members])
weights = weights / weights.sum()
print("   Weights: " + ", ".join(
    f"{member_name}={w:.3f}" for (member_name, _, _), w in zip(members, weights)
))

weighted_scores = sum(w * scores for w, scores in zip(weights, member_scores))
ensemble_weighted = _to_one_hot(base_pred, weighted_scores, prob_cols)
ensemble_weighted.to_csv(f'submission_ensemble_{n_models}way_weighted.csv', index=False)
print(f"‚úÖ submission_ensemble_{n_models}way_weighted.csv")

# Show distributions: the columns are one-hot, so each column sum is a class count.
print("\nüìä Prediction distributions:")
for name, df in [('Simple', ensemble_simple), ('Weighted', ensemble_weighted)]:
    print(f"\n{name} ensemble:")
    counts = df[prob_cols].sum()
    for cls, count in counts.items():
        pct = count / len(df) * 100
        print(f"  {cls:12s}: {int(count):4d} ({pct:5.2f}%)")

print("\n" + "=" * 60)
print("‚úÖ ENSEMBLE COMPLETE!")
print("=" * 60)
print("\nüìä EXPECTED SCORES:")
if use_resnet18:
    print("   - Simple average (4-model): 87-89%")
    print("   - Weighted average (4-model): 88-90% üéØ")
    print("\nüí° RECOMMENDED: Upload submission_ensemble_4way_weighted.csv")
else:
    print("   - Simple average (3-model): 85-87%")
    print("   - Weighted average (3-model): 86-88% üéØ")
    print("\nüí° RECOMMENDED: Upload submission_ensemble_3way_weighted.csv")
print("=" * 60)

## Step 8: Download Ensemble Submissions

In [None]:
from google.colab import files as colab_files

# Step 8 of the notebook: send every ensemble CSV found in the working
# directory to the browser, then print the submission instructions.
rule = "=" * 60

print(rule)
print("STEP 8: DOWNLOAD SUBMISSIONS")
print(rule)

# Candidate outputs of the ensemble step; only those present on disk are kept.
candidate_files = (
    'submission_ensemble_4way_simple.csv',
    'submission_ensemble_4way_weighted.csv',
    'submission_ensemble_3way_simple.csv',
    'submission_ensemble_3way_weighted.csv',
)
files_to_download = [name for name in candidate_files if os.path.exists(name)]

if files_to_download:
    for csv_name in files_to_download:
        print(f"\nDownloading {csv_name}...")
        colab_files.download(csv_name)
        print("‚úÖ Downloaded")
else:
    print("‚ùå No ensemble files found!")

print("\n" + rule)
print("üéâ ALL DONE!")
print(rule)

# Closing instructions, printed one line at a time.
for line in (
    "\nüìù NEXT STEPS:",
    "   1. Go to https://www.kaggle.com/competitions/cxr-multi-label-classification",
    "   2. Click 'Submit Predictions'",
    "   3. Upload the *_weighted.csv file",
    "   4. Expected score: 86-90%",
    "\nüöÄ Improvement from baseline: 82.3% ‚Üí 86-90%",
):
    print(line)
print(rule)