# ICD-10 Classification with BERT - Colab GPU Training

This notebook is designed to run on Google Colab with GPU acceleration for efficient training.

In [None]:
# Check GPU availability and setup
!nvidia-smi
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Install required packages
!pip install transformers datasets scikit-learn accelerate -q
!pip install sentencepiece -q  # Sometimes needed for certain tokenizers

In [None]:
# Fix common Colab dataset loading issues
!pip install datasets --upgrade -q
!pip install huggingface_hub --upgrade -q

# Clear any problematic cache
import os
import shutil

# Clear HuggingFace cache
hf_cache = os.path.expanduser("~/.cache/huggingface")
if os.path.exists(hf_cache):
    print("Clearing HuggingFace cache...")
    try:
        shutil.rmtree(hf_cache)
        print("✅ Cache cleared")
    except Exception as e:
        print(f"⚠️ Cache clearing issue: {e}")

print("Environment prepared for dataset loading")

## 🛠️ Troubleshooting Dataset Loading

If you encounter dataset loading errors, try these solutions in order:

1. **Run the cache clearing cell above**
2. **Restart runtime**: Runtime → Restart runtime (then re-run from the beginning)
3. **Use streaming mode**: Uncomment the `load_dataset_manual()` call at the bottom of the manual loading cell (in the "Quick Data Exploration" section below)
4. **Alternative dataset**: Switch to a different medical NLP dataset if needed

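**Common Error**: `NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported`

**Solution**: This is typically caused by an outdated `datasets` package that is incompatible with the installed `fsspec`. Run the upgrade/cache-clearing cell above, restart the runtime, then try again.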

In [None]:
# Upload your training scripts to Colab
# Method 1: Upload files directly
import os

from google.colab import files

# Scripts that later cells invoke with `python <name>` from the working directory
REQUIRED_SCRIPTS = ("train.py", "evaluate.py")

print("Upload train.py and evaluate.py files:")
uploaded = files.upload()

# A cancelled or partial upload would otherwise only surface much later as a
# "can't open file" error in the training cell, so check right away.
missing_scripts = [name for name in REQUIRED_SCRIPTS if not os.path.exists(name)]
if missing_scripts:
    print(f"⚠️ Missing required script(s): {', '.join(missing_scripts)} - re-run this cell to upload them")
else:
    print("✅ train.py and evaluate.py are available")

# Method 2: Alternative - copy-paste the code directly into cells below
# (Recommended if you want to modify training parameters easily)

In [None]:
# For development: upload train.py and evaluate.py manually or use:
# from google.colab import files
# uploaded = files.upload()  # Upload train.py and evaluate.py

## Quick Data Exploration

In [None]:
from datasets import load_dataset
import pandas as pd
from collections import Counter
import numpy as np
import os

# Clear any cached dataset files to avoid caching issues.
# NOTE: this means the dataset is downloaded again every time this cell runs.
cache_dir = os.path.expanduser("~/.cache/huggingface/datasets")
if os.path.exists(cache_dir):
    print("Clearing dataset cache to avoid loading issues...")
    import shutil
    try:
        shutil.rmtree(cache_dir)
        print("✅ Cache cleared")
    except OSError as e:
        # Best effort: report why it failed instead of silently swallowing everything
        print(f"⚠️ Cache clearing failed ({e}), continuing anyway...")

DATASET_NAME = "FiscaAI/synth-ehr-icd10cm-prompt"

# Load the dataset (the default cache location is used; it was just cleared above)
print("Loading dataset...")
try:
    dataset = load_dataset(DATASET_NAME)
except Exception as e:
    print(f"First attempt failed: {e}")
    print("Trying alternative approach...")
    # Alternative: ignore whatever is cached and download everything again
    dataset = load_dataset(DATASET_NAME, download_mode="force_redownload")

print("✅ Dataset loaded successfully!")
print(f"Dataset structure: {dataset}")
print(f"Train samples: {len(dataset['train'])}")

# Examine structure
print("\nFirst example:")
first_example = dataset['train'][0]
# 'user' is read as the clinical note text and 'codes' as its ICD label(s)
print(f"Clinical Note: {first_example['user'][:300]}...")
print(f"ICD Codes: {first_example['codes']}")
print(f"ICD Code type: {type(first_example['codes'])}")

# Take a sample for analysis (start smaller to avoid memory issues)
sample_size = min(5000, len(dataset['train']))
print(f"\nTaking sample of {sample_size} examples for analysis...")
sample_data = dataset['train'].select(range(sample_size))
print("✅ Sample created successfully")

In [None]:
# Alternative: Manual dataset loading if cache issues persist
# This cell provides a backup method for loading the dataset

def load_dataset_manual(dataset_name="FiscaAI/synth-ehr-icd10cm-prompt", max_examples=10000):
    """Backup method to load dataset if caching issues occur.

    Streams the train split and materializes at most ``max_examples`` rows
    into an in-memory ``Dataset``.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        max_examples: Maximum number of train examples to keep (limits memory
            use). ``None`` keeps the whole stream.

    Returns:
        A dict ``{'train': Dataset}``, or ``None`` if loading failed for any
        reason (the error is printed).
    """
    print("Using manual dataset loading method...")

    try:
        from itertools import islice

        from datasets import Dataset, load_dataset

        # Streaming avoids the on-disk dataset cache entirely
        stream = load_dataset(dataset_name, streaming=True)

        # islice stops after max_examples without pulling an extra row
        train_data = list(islice(stream['train'], max_examples))

        dataset = {'train': Dataset.from_list(train_data)}

        print(f"✅ Manual loading successful! Loaded {len(dataset['train'])} examples")
        return dataset

    except Exception as e:
        # Deliberately broad: this is a last-resort fallback that reports and returns None
        print(f"Manual loading failed: {e}")
        return None

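# If the cell above fails, uncomment the lines below. The second line rebuilds
# `sample_data`, which the analysis cell that follows requires.
# dataset = load_dataset_manual()
# sample_data = dataset['train'].select(range(min(5000, len(dataset['train']))))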

In [None]:
# Analyze label distribution on sample
all_codes = []
for example in sample_data:
    codes = example['codes']
    if isinstance(codes, str):
        codes = [codes]
    all_codes.extend(codes)

code_counts = Counter(all_codes)
print(f"Total unique ICD codes in sample: {len(code_counts)}")
print(f"Total code instances: {len(all_codes)}")
print(f"Average codes per note: {len(all_codes) / len(sample_data):.2f}")

print(f"\nTop 15 most common codes:")
for code, count in code_counts.most_common(15):
    print(f"  {code}: {count} ({count/len(sample_data)*100:.1f}%)")

# Check label distribution
multi_label_count = sum(1 for ex in sample_data 
                       if isinstance(ex['codes'], list) and len(ex['codes']) > 1)
print(f"\nMulti-label examples: {multi_label_count} ({multi_label_count/len(sample_data)*100:.1f}%)")

# Text length analysis
text_lengths = [len(ex['user'].split()) for ex in sample_data]
print(f"\nText length stats:")
print(f"  Mean: {np.mean(text_lengths):.1f} words")
print(f"  Median: {np.median(text_lengths):.1f} words")
print(f"  Max: {np.max(text_lengths)} words")
print(f"  95th percentile: {np.percentile(text_lengths, 95):.1f} words")

## GPU-Accelerated Training

In [None]:
# GPU-Optimized Training - Recommended Settings
# A100 GPU can handle much larger batches and longer sequences

print("Starting GPU-optimized training...")
print("This will take 15-30 minutes depending on sample size")

!python train.py \
    --model_name "emilyalsentzer/Bio_ClinicalBERT" \
    --batch_size 32 \
    --epochs 8 \
    --learning_rate 3e-5 \
    --max_length 512 \
    --sample_size 15000 \
    --fp16 \
    --output_dir ./model_gpu_trained

print("\nTraining completed! Model saved to ./model_gpu_trained")

In [None]:
# Alternative: Quick Training for Testing (5-10 minutes)
# Use this for rapid iteration and testing

print("Quick training for testing...")

!python train.py \
    --model_name "emilyalsentzer/Bio_ClinicalBERT" \
    --batch_size 64 \
    --epochs 3 \
    --learning_rate 2e-5 \
    --max_length 256 \
    --fp16 \
    --sample_size 5000 \
    --output_dir ./model_quick_test

print("\nQuick training completed!")

## Evaluation

In [None]:
# Comprehensive Evaluation on GPU-trained model
print("Running evaluation on GPU-trained model...")

!python evaluate.py --model_dir ./model_gpu_trained --sample_size 2000

# Also evaluate the quick test model for comparison
print("\n" + "="*50)
print("Comparing with quick test model...")
!python evaluate.py --model_dir ./model_quick_test --sample_size 1000

In [None]:
# Load and compare results from both models
import json
import matplotlib.pyplot as plt
import pandas as pd

def load_eval_results(results_path, header, not_found_message):
    """Load one model's evaluation results file and print its headline metrics.

    Args:
        results_path: Path to an ``evaluation_results.json`` file
            (presumably written by evaluate.py - confirm against that script).
        header: Line printed before the metrics.
        not_found_message: Line printed when the file cannot be read.

    Returns:
        The parsed results dict, or ``None`` when the file is missing, is not
        valid JSON, or lacks the metrics reported here.
    """
    try:
        with open(results_path, 'r') as f:
            results = json.load(f)
    except (OSError, json.JSONDecodeError):
        print(not_found_message)
        return None

    # Build the whole report first so a malformed file never prints half of it
    try:
        lines = [
            header,
            f"  Micro F1: {results['micro_f1']:.4f}",
            f"  Macro F1: {results['macro_f1']:.4f}",
            "  Top-K Accuracy:",
        ]
        lines += [f"    Top-{k}: {acc:.4f}" for k, acc in results['top_k_accuracy'].items()]
    except (KeyError, TypeError, ValueError, AttributeError) as e:
        print(f"{not_found_message} (unexpected file contents: {e!r})")
        return None

    print("\n".join(lines))
    return results


# Always (re)assign both names so results left over from an earlier run of
# this cell can never be plotted by mistake.
gpu_results = load_eval_results(
    './model_gpu_trained/evaluation_results.json',
    "GPU-Trained Model Results:",
    "GPU model results not found",
)
quick_results = load_eval_results(
    './model_quick_test/evaluation_results.json',
    "\nQuick Test Model Results:",
    "Quick test model results not found",
)

# Create comparison visualization (only when both models have results)
if gpu_results is not None and quick_results is not None:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # F1 Scores comparison: grouped bars, one group per model
    models = ['GPU Trained', 'Quick Test']
    micro_f1s = [gpu_results['micro_f1'], quick_results['micro_f1']]
    macro_f1s = [gpu_results['macro_f1'], quick_results['macro_f1']]

    x = list(range(len(models)))
    ax1.bar([i-0.2 for i in x], micro_f1s, 0.4, label='Micro F1', alpha=0.7)
    ax1.bar([i+0.2 for i in x], macro_f1s, 0.4, label='Macro F1', alpha=0.7)
    ax1.set_xlabel('Model')
    ax1.set_ylabel('F1 Score')
    ax1.set_title('F1 Score Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(models)
    ax1.legend()

    # Top-K Accuracy comparison (JSON object keys are strings, hence str(k))
    k_values = [1, 5, 10]
    gpu_topk = [gpu_results['top_k_accuracy'][str(k)] for k in k_values]
    quick_topk = [quick_results['top_k_accuracy'][str(k)] for k in k_values]

    ax2.plot(k_values, gpu_topk, 'o-', label='GPU Trained', linewidth=2)
    ax2.plot(k_values, quick_topk, 's-', label='Quick Test', linewidth=2)
    ax2.set_xlabel('K (Top-K)')
    ax2.set_ylabel('Accuracy')
    ax2.set_title('Top-K Accuracy Comparison')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Quick Experimentation Section

In [None]:
# 🧪 OPTIONAL: Model Architecture Experiments
# Try different BERT variants to see which performs best

models_to_try = [
    ("Clinical BERT", "emilyalsentzer/Bio_ClinicalBERT"),
    ("PubMed BERT", "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"),
    ("General BERT", "bert-base-uncased"),
    ("DistilBERT", "distilbert-base-uncased")
]

experiment_results = {}

print("🧪 Running model comparison experiments...")
print("This will take 30-45 minutes total")

for model_name, model_path in models_to_try:
    print(f"\n🔄 Training {model_name}...")
    output_dir = f"./experiments/{model_name.lower().replace(' ', '_')}"
    
    # Train each model with same settings
    !python train.py \
        --model_name {model_path} \
        --batch_size 64 \
        --epochs 2 \
        --learning_rate 2e-5 \
        --max_length 256 \
        --fp16 \
        --sample_size 3000 \
        --output_dir {output_dir}
    
    # Evaluate
    !python evaluate.py --model_dir {output_dir} --sample_size 500
    
    print(f"✅ {model_name} completed")

print("\n🏆 Experiment completed! Check individual results in ./experiments/")

# Compare all results
print("\n📊 COMPARISON SUMMARY:")
for model_name, _ in models_to_try:
    dir_name = f"./experiments/{model_name.lower().replace(' ', '_')}"
    try:
        with open(f'{dir_name}/evaluation_results.json', 'r') as f:
            results = json.load(f)
        print(f"{model_name:15} - Micro F1: {results['micro_f1']:.4f}, Top-5: {results['top_k_accuracy']['5']:.4f}")
    except:
        print(f"{model_name:15} - Results not found")

In [None]:
# 📦 EXPORT TRAINED MODEL FOR LOCAL USE 

print("🚀 Preparing model for download...")

# Zip the main GPU-trained model
!zip -r gpu_trained_model.zip model_gpu_trained/
print("✅ GPU-trained model zipped")

# Also zip the quick test model for comparison
!zip -r quick_test_model.zip model_quick_test/
print("✅ Quick test model zipped")

# Check file sizes
!ls -lh *.zip

print("\n📋 INSTRUCTIONS FOR LOCAL USE:")
print("1. Download both zip files to your local machine")
print("2. Extract: unzip gpu_trained_model.zip")
print("3. Use locally: python evaluate.py --model_dir ./model_gpu_trained")
print("4. The model will work on CPU for inference (much faster than training)")

# Download files
from google.colab import files
print("\n⬇️ Downloading GPU-trained model...")
files.download('gpu_trained_model.zip')

print("⬇️ Downloading quick test model...")
files.download('quick_test_model.zip')

print("\n🎉 Models ready for local deployment!")

## 🚀 Quick Start Guide for Colab

**RECOMMENDED WORKFLOW:**

1. **Upload Scripts**: Run the upload cell near the top of this notebook to upload `train.py` and `evaluate.py`
2. **Quick Test**: Run the "Quick Training" cell first (5-10 minutes)
3. **Full Training**: If satisfied, run the "GPU-Optimized Training" (15-30 minutes)
4. **Download Model**: Use the export cell to download trained weights
5. **Local Deployment**: Extract and use the model locally for inference

**EXPECTED PERFORMANCE:**
- Quick Test: Micro F1 ~0.01-0.05, Top-5 Accuracy ~0.05-0.15
- Full GPU Training: Micro F1 ~0.05-0.15, Top-5 Accuracy ~0.15-0.35
- Much better than local CPU training!