# MLOps Project - Fashion MNIST Training

This notebook runs 5 experiments on Google Colab with GPU acceleration and logs all metrics to DagsHub MLflow.

## Experiments:
1. **Baseline CNN** - No regularization (observe overfitting)
2. **CNN + Regularization** - BatchNorm + Dropout (reduce overfitting)
3. **CNN + Data Augmentation** - Simulate data enrichment (best generalization)
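4. **Hyperparameter Tuning** - Optimize learning rate, batch size, epochs (the run configured below keeps the learning rate and batch size of Experiment 3 and raises the epoch count from 10 to 15)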
5. **Simple MLP** - Demonstrate underfitting on image data

## 1. Setup Environment

In [None]:
# Install required packages
# -q keeps pip's output quiet. No versions are pinned here, so the releases
# that end up installed may differ from one session to the next.
!pip install -q mlflow torch torchvision scikit-learn pandas matplotlib

In [None]:
# Report the PyTorch build and pick the compute device used by the rest of the notebook
import torch

print(f"PyTorch version: {torch.__version__}")

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

# `device` is read later as a module-level global by the training code.
if not cuda_ready:
    print("Warning: GPU not available, using CPU")
    device = torch.device("cpu")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")

## 2. Configure DagsHub MLflow

**Important**: Replace `YOUR_USERNAME` and `YOUR_TOKEN` with your DagsHub credentials. The token is a secret: clear it again before sharing or committing this notebook.

Get these from: https://dagshub.com/YOUR_USERNAME/MLOps → Remote → MLflow Tracking

In [None]:
import os
from getpass import getpass

import mlflow

# ============================================
# CONFIGURE YOUR DAGSHUB CREDENTIALS HERE
# ============================================
# Either edit the two values below, or leave the placeholders untouched and
# provide the credentials through the MLFLOW_TRACKING_USERNAME /
# MLFLOW_TRACKING_PASSWORD environment variables or the interactive prompt
# (the prompt keeps the token out of the saved notebook).
DAGSHUB_USERNAME = "YOUR_USERNAME"  # <-- CHANGE THIS
DAGSHUB_TOKEN = "YOUR_TOKEN"        # <-- CHANGE THIS
DAGSHUB_REPO_NAME = "MLOps"


def _resolve_credential(value, placeholder, env_var, ask):
    """Return `value` unless it is still the placeholder; then try `env_var`, then prompt via `ask`."""
    if value != placeholder:
        return value.strip()
    from_env = os.environ.get(env_var, "")
    # An earlier run of this cell may have exported the placeholder itself;
    # treat that the same as "not set".
    if from_env and from_env != placeholder:
        return from_env.strip()
    return ask().strip()


DAGSHUB_USERNAME = _resolve_credential(
    DAGSHUB_USERNAME,
    "YOUR_USERNAME",
    "MLFLOW_TRACKING_USERNAME",
    lambda: input("DagsHub username: "),
)
DAGSHUB_TOKEN = _resolve_credential(
    DAGSHUB_TOKEN,
    "YOUR_TOKEN",
    "MLFLOW_TRACKING_PASSWORD",
    lambda: getpass("DagsHub token: "),  # hidden input: the token is never echoed
)

# Fail here, with a clear message, instead of later at the first MLflow call.
if not DAGSHUB_USERNAME or not DAGSHUB_TOKEN:
    raise ValueError("DagsHub username and token are required to log to MLflow.")

# Set up MLflow tracking
MLFLOW_TRACKING_URI = f"https://dagshub.com/{DAGSHUB_USERNAME}/{DAGSHUB_REPO_NAME}.mlflow"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# MLflow reads these two variables for HTTP basic authentication.
os.environ["MLFLOW_TRACKING_USERNAME"] = DAGSHUB_USERNAME
os.environ["MLFLOW_TRACKING_PASSWORD"] = DAGSHUB_TOKEN

print(f"MLflow Tracking URI: {MLFLOW_TRACKING_URI}")

## 3. Define Model Architectures

In [None]:
import torch.nn as nn

class CNN(nn.Module):
    """
    Two-block convolutional classifier for single-channel 28x28 images.

    Layer stack (bracketed layers are optional):
        conv1: Conv2d(1, 32, 3, padding=1) -> [BatchNorm2d] -> ReLU -> MaxPool2d(2)
        conv2: Conv2d(32, 64, 3, padding=1) -> [BatchNorm2d] -> ReLU -> MaxPool2d(2)
        Flatten -> Linear(64*7*7, 128) -> ReLU -> [Dropout] -> Linear(128, 10)

    Args:
        use_batchnorm: when True a BatchNorm2d layer follows each convolution;
            otherwise an nn.Identity placeholder takes its place.
        dropout_rate: dropout probability applied after the first linear
            layer; any value <= 0 swaps the dropout layer for nn.Identity.
    """

    def __init__(self, use_batchnorm=False, dropout_rate=0.0):
        super().__init__()

        self.use_batchnorm = use_batchnorm
        self.dropout_rate = dropout_rate

        def norm_layer(channels):
            # Optional normalisation: a real BatchNorm2d or a pass-through.
            if use_batchnorm:
                return nn.BatchNorm2d(channels)
            return nn.Identity()

        # Block 1: 1 -> 32 channels, spatial size 28 -> 14
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = norm_layer(32)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2)

        # Block 2: 32 -> 64 channels, spatial size 14 -> 7
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = norm_layer(64)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2)

        # Classifier head
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.relu3 = nn.ReLU()
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)
        else:
            self.dropout = nn.Identity()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Run a batch through both conv blocks and the head; returns the fc2 outputs."""
        features = self.pool1(self.relu1(self.bn1(self.conv1(x))))
        features = self.pool2(self.relu2(self.bn2(self.conv2(features))))

        hidden = self.relu3(self.fc1(self.flatten(features)))
        return self.fc2(self.dropout(hidden))


class MLP(nn.Module):
    """
    Fully connected baseline for Fashion MNIST.

    Flattens each input and applies Linear(28*28, 256) -> ReLU ->
    Linear(256, 128) -> ReLU -> Linear(128, 10). The notebook uses it as the
    comparison point against the CNN.
    """

    def __init__(self):
        super().__init__()

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Flatten the batch and pass it through the three linear layers; returns the fc3 outputs."""
        hidden = self.relu1(self.fc1(self.flatten(x)))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)


def get_model(model_type="cnn", use_batchnorm=False, dropout_rate=0.0):
    """
    Build a model by name (case-insensitive).

    Args:
        model_type: "cnn" or "mlp".
        use_batchnorm: forwarded to CNN; ignored for the MLP.
        dropout_rate: forwarded to CNN; ignored for the MLP.

    Returns:
        A new CNN or MLP instance.

    Raises:
        ValueError: if `model_type` names neither architecture.
    """
    kind = model_type.lower()
    if kind == "mlp":
        return MLP()
    if kind == "cnn":
        return CNN(use_batchnorm=use_batchnorm, dropout_rate=dropout_rate)
    raise ValueError(f"Unknown model type: {model_type}")


def count_parameters(model):
    """Return the total number of elements across `model`'s parameters that have requires_grad set."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

print("Model architectures defined!")

## 4. Data Loading and Augmentation

In [None]:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

def get_transforms(use_augmentation=False):
    """
    Build the (train, validation) transform pair.

    The validation pipeline is always ToTensor followed by
    Normalize((0.5,), (0.5,)). With `use_augmentation` the training pipeline
    prepends a random horizontal flip, a rotation of up to 10 degrees and a
    translation of up to 10% per axis; without it, the very same transform
    object is returned for both splits.
    """

    def tensor_steps():
        # Conversion + normalisation shared by every pipeline.
        return [
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ]

    val_transform = transforms.Compose(tensor_steps())

    if not use_augmentation:
        return val_transform, val_transform

    augment_steps = [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    ]
    train_transform = transforms.Compose(augment_steps + tensor_steps())
    return train_transform, val_transform


def get_dataloaders(batch_size=64, use_augmentation=False):
    """
    Create the Fashion MNIST data loaders.

    Args:
        batch_size: number of samples per batch for both loaders.
        use_augmentation: passed to get_transforms to choose the training
            transform; the validation transform is never augmented.

    Returns:
        (train_loader, val_loader). The training loader shuffles and reads the
        `train=True` split; the validation loader does not shuffle and reads
        the `train=False` split. Both datasets are downloaded to ./data when
        not already present.
    """
    train_transform, val_transform = get_transforms(use_augmentation)

    train_dataset = datasets.FashionMNIST(
        root="./data",
        train=True,
        download=True,
        transform=train_transform
    )

    val_dataset = datasets.FashionMNIST(
        root="./data",
        train=False,
        download=True,
        transform=val_transform
    )

    # Pinned host memory only speeds up host-to-GPU copies. The notebook also
    # supports a CPU fallback, where requesting it is useless and can make
    # DataLoader emit a warning, so enable it only when CUDA is present.
    use_pinned_memory = torch.cuda.is_available()

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=use_pinned_memory
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=use_pinned_memory
    )

    return train_loader, val_loader

# Download and verify dataset
train_loader, val_loader = get_dataloaders()
print(f"Training samples: {len(train_loader.dataset)}")
print(f"Validation samples: {len(val_loader.dataset)}")

## 5. Training Functions

In [None]:
import torch.optim as optim

def train_epoch(model, train_loader, criterion, optimizer, device):
    """
    Run one optimisation pass over `train_loader`.

    Puts the model in training mode, performs one optimizer step per batch
    and accumulates loss and accuracy over all samples seen.

    Returns:
        (epoch_loss, epoch_acc): the sample-weighted average loss and the
        fraction of samples whose arg-max prediction equals the label.
    """
    model.train()
    loss_sum, hits, seen = 0.0, 0, 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()
        logits = model(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size (assumes `criterion` returns a
        # per-batch mean) so the final division yields a per-sample average.
        loss_sum += batch_loss.item() * images.size(0)
        predicted = logits.max(1)[1]
        seen += labels.size(0)
        hits += (predicted == labels).sum().item()

    return loss_sum / seen, hits / seen


def validate(model, val_loader, criterion, device):
    """
    Evaluate `model` on `val_loader` without updating it.

    Switches the model to eval mode and disables gradient tracking for the
    whole pass.

    Returns:
        (epoch_loss, epoch_acc): the sample-weighted average loss and the
        fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            batch_loss = criterion(logits, labels)

            # Same sample-weighted bookkeeping as in training.
            loss_sum += batch_loss.item() * images.size(0)
            predicted = logits.max(1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

    return loss_sum / seen, hits / seen

print("Training functions defined!")

In [None]:
import json

def train_model(
    experiment_name,
    model_type="cnn",
    use_batchnorm=False,
    dropout_rate=0.0,
    use_augmentation=False,
    learning_rate=0.001,
    batch_size=64,
    epochs=10,
    description="",
    save_model=True
):
    """
    Train one model configuration and record the run in MLflow.

    Builds the data loaders and the model, trains with Adam and cross-entropy
    loss on the module-level `device`, logs the hyperparameters plus per-epoch
    and final metrics to MLflow, and returns a summary of the run.

    Args:
        experiment_name: used as the MLflow run name and stored in the results.
        model_type: "cnn" or "mlp" (passed to get_model).
        use_batchnorm: passed to get_model.
        dropout_rate: passed to get_model.
        use_augmentation: passed to get_dataloaders.
        learning_rate: Adam learning rate.
        batch_size: batch size for both data loaders.
        epochs: number of training epochs; must be at least 1.
        description: free-text note logged as an MLflow parameter.
        save_model: when True, keep a copy of the weights from the epoch with
            the highest validation accuracy.

    Returns:
        (results, best_model_state): `results` is a JSON-serialisable dict
        with the run id, final/best metrics and hyperparameters;
        `best_model_state` is an independent copy of the best epoch's
        state_dict, or None when `save_model` is False or validation accuracy
        never rose above 0.

    Raises:
        ValueError: if `epochs` is less than 1 (there would be no final
            metrics to report).
    """
    import copy  # local import: only needed for the best-weights snapshot

    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    print(f"\n{'='*60}")
    print(f"Starting: {experiment_name}")
    print(f"{'='*60}")

    # Get data loaders
    train_loader, val_loader = get_dataloaders(
        batch_size=batch_size,
        use_augmentation=use_augmentation
    )

    # Create model
    model = get_model(
        model_type=model_type,
        use_batchnorm=use_batchnorm,
        dropout_rate=dropout_rate
    )
    model = model.to(device)
    num_params = count_parameters(model)
    print(f"Model: {model_type.upper()}, Parameters: {num_params:,}")

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Start MLflow run; the context manager ends the run even if training fails.
    with mlflow.start_run(run_name=experiment_name) as run:
        # Log all parameters in a single call
        mlflow.log_params({
            "experiment_name": experiment_name,
            "model_type": model_type,
            "description": description,
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "epochs": epochs,
            "dropout_rate": dropout_rate,
            "use_batchnorm": use_batchnorm,
            "use_augmentation": use_augmentation,
            "num_parameters": num_params,
            "device": str(device),
        })

        # Training loop
        best_val_acc = 0.0
        best_model_state = None

        for epoch in range(epochs):
            train_loss, train_acc = train_epoch(
                model, train_loader, criterion, optimizer, device
            )
            val_loss, val_acc = validate(model, val_loader, criterion, device)

            # Log this epoch's metrics to MLflow
            mlflow.log_metrics(
                {
                    "train_loss": train_loss,
                    "train_accuracy": train_acc,
                    "val_loss": val_loss,
                    "val_accuracy": val_acc,
                },
                step=epoch,
            )

            # Track best model
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                if save_model:
                    # state_dict() hands out references to the live tensors,
                    # and dict.copy() is shallow, so without a deep copy the
                    # "best" weights would keep changing as training goes on.
                    best_model_state = copy.deepcopy(model.state_dict())

            # Print progress
            print(f"Epoch {epoch+1}/{epochs} - "
                  f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} - "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # "Final" values are those of the last epoch, not of the best epoch.
        final_train_acc = train_acc
        final_val_acc = val_acc
        overfit_gap = final_train_acc - final_val_acc

        mlflow.log_metrics({
            "final_train_accuracy": final_train_acc,
            "final_val_accuracy": final_val_acc,
            "final_train_loss": train_loss,
            "final_val_loss": val_loss,
            "overfit_gap": overfit_gap,
            "best_val_accuracy": best_val_acc,
        })

        run_id = run.info.run_id

    # Prepare results
    results = {
        "run_id": run_id,
        "experiment_name": experiment_name,
        "model_type": model_type,
        "final_train_accuracy": final_train_acc,
        "final_val_accuracy": final_val_acc,
        "final_train_loss": train_loss,
        "final_val_loss": val_loss,
        "best_val_accuracy": best_val_acc,
        "overfit_gap": overfit_gap,
        "hyperparameters": {
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "epochs": epochs,
            "dropout_rate": dropout_rate,
            "use_batchnorm": use_batchnorm,
            "use_augmentation": use_augmentation
        }
    }

    print(f"\nFinal Train Accuracy: {final_train_acc:.4f}")
    print(f"Final Val Accuracy: {final_val_acc:.4f}")
    print(f"Best Val Accuracy: {best_val_acc:.4f}")
    print(f"Overfit Gap: {overfit_gap:.4f}")

    return results, best_model_state

print("Main training function defined!")

## 6. Run All 5 Experiments

In [None]:
# Select (or create) the MLflow experiment that all runs are logged under
mlflow.set_experiment(DAGSHUB_REPO_NAME)

# Bookkeeping updated by each experiment cell: every run's results, plus the
# results and weights of the run with the highest best-validation accuracy.
all_results = []
best_val_acc = 0.0
best_result = best_model_state = None

### Experiment 1: Baseline CNN

In [None]:
# Experiment 1: plain CNN with no BatchNorm, Dropout or augmentation
exp1_config = {
    "experiment_name": "exp1_baseline_cnn",
    "model_type": "cnn",
    "use_batchnorm": False,
    "dropout_rate": 0.0,
    "use_augmentation": False,
    "learning_rate": 0.001,
    "batch_size": 64,
    "epochs": 10,
    "description": "Baseline CNN without regularization - observe overfitting",
    "save_model": True,
}
results1, model_state1 = train_model(**exp1_config)
all_results.append(results1)

# Promote this run to "best so far" when it beats the current record
if results1["best_val_accuracy"] > best_val_acc:
    best_val_acc, best_result, best_model_state = (
        results1["best_val_accuracy"],
        results1,
        model_state1,
    )

### Experiment 2: CNN + Regularization

In [None]:
# Experiment 2: same CNN with BatchNorm and Dropout(0.5) enabled
exp2_config = {
    "experiment_name": "exp2_cnn_regularization",
    "model_type": "cnn",
    "use_batchnorm": True,
    "dropout_rate": 0.5,
    "use_augmentation": False,
    "learning_rate": 0.001,
    "batch_size": 64,
    "epochs": 10,
    "description": "CNN with BatchNorm and Dropout(0.5) - reduce overfitting",
    "save_model": True,
}
results2, model_state2 = train_model(**exp2_config)
all_results.append(results2)

# Promote this run to "best so far" when it beats the current record
if results2["best_val_accuracy"] > best_val_acc:
    best_val_acc, best_result, best_model_state = (
        results2["best_val_accuracy"],
        results2,
        model_state2,
    )

### Experiment 3: CNN + Data Augmentation

In [None]:
# Experiment 3: regularized CNN trained on augmented data
exp3_config = {
    "experiment_name": "exp3_cnn_augmentation",
    "model_type": "cnn",
    "use_batchnorm": True,
    "dropout_rate": 0.5,
    "use_augmentation": True,
    "learning_rate": 0.001,
    "batch_size": 64,
    "epochs": 10,
    "description": "CNN with regularization and data augmentation - best generalization",
    "save_model": True,
}
results3, model_state3 = train_model(**exp3_config)
all_results.append(results3)

# Promote this run to "best so far" when it beats the current record
if results3["best_val_accuracy"] > best_val_acc:
    best_val_acc, best_result, best_model_state = (
        results3["best_val_accuracy"],
        results3,
        model_state3,
    )

### Experiment 4: Hyperparameter Tuning

In [None]:
# Experiment 4: same configuration as Experiment 3, trained for more epochs
exp4_config = {
    "experiment_name": "exp4_hyperparameter_tuning",
    "model_type": "cnn",
    "use_batchnorm": True,
    "dropout_rate": 0.5,
    "use_augmentation": True,
    "learning_rate": 0.001,
    "batch_size": 64,
    "epochs": 15,  # Increased epochs
    "description": "Hyperparameter tuning - optimized settings",
    "save_model": True,
}
results4, model_state4 = train_model(**exp4_config)
all_results.append(results4)

# Promote this run to "best so far" when it beats the current record
if results4["best_val_accuracy"] > best_val_acc:
    best_val_acc, best_result, best_model_state = (
        results4["best_val_accuracy"],
        results4,
        model_state4,
    )

### Experiment 5: Simple MLP (Underfitting)

In [None]:
# Experiment 5: MLP comparison run. Its weights are not kept, and it does not
# take part in the best-model tracking.
exp5_config = {
    "experiment_name": "exp5_mlp_comparison",
    "model_type": "mlp",
    "use_batchnorm": False,
    "dropout_rate": 0.0,
    "use_augmentation": False,
    "learning_rate": 0.001,
    "batch_size": 64,
    "epochs": 10,
    "description": "Simple MLP - demonstrates underfitting on image data",
    "save_model": False,  # Don't save MLP
}
results5, _ = train_model(**exp5_config)
all_results.append(results5)

## 7. Results Summary

In [None]:
import pandas as pd

rule = "=" * 80
print("\n" + rule)
print("EXPERIMENTS SUMMARY")
print(rule)

# One display row per experiment, with every metric formatted to four decimals
data = [
    {
        "Experiment": result["experiment_name"],
        "Model": result["model_type"].upper(),
        "Train Acc": f"{result['final_train_accuracy']:.4f}",
        "Val Acc": f"{result['final_val_accuracy']:.4f}",
        "Best Val Acc": f"{result['best_val_accuracy']:.4f}",
        "Overfit Gap": f"{result['overfit_gap']:.4f}",
    }
    for result in all_results
]

df = pd.DataFrame(data)
print(df.to_string(index=False))
print(rule)

# The winner is whichever run the experiment cells recorded as best so far.
print(f"\nBest Model: {best_result['experiment_name']}")
print(f"Best Validation Accuracy: {best_val_acc:.4f}")

## 8. Save Best Model

In [None]:
import os

# Without this guard, torch.save would write a checkpoint containing None
# (and json.dump a file containing "null") when no run has produced a best
# model yet, e.g. when this cell runs before the experiment cells.
if best_model_state is None or best_result is None:
    raise RuntimeError(
        "No best model is available to save; run the experiment cells "
        "(with save_model=True) first."
    )

# Create models directory
os.makedirs("models", exist_ok=True)

# Save best model weights (a state_dict, not the full model object)
model_path = "models/best_model.pt"
torch.save(best_model_state, model_path)
print(f"Best model saved to {model_path}")

# Save the metrics and hyperparameters of the best run next to the weights
model_info_path = "models/best_model_info.json"
with open(model_info_path, "w") as f:
    json.dump(best_result, f, indent=2)
print(f"Model info saved to {model_info_path}")

In [None]:
# Write one CSV row per experiment: metrics, hyperparameters and MLflow run id
os.makedirs("results", exist_ok=True)

summary_data = []
for result in all_results:
    hparams = result["hyperparameters"]
    summary_data.append({
        "Experiment": result["experiment_name"],
        "Model Type": result["model_type"],
        "Train Accuracy": result["final_train_accuracy"],
        "Val Accuracy": result["final_val_accuracy"],
        "Best Val Accuracy": result["best_val_accuracy"],
        "Overfit Gap": result["overfit_gap"],
        "Learning Rate": hparams["learning_rate"],
        "Batch Size": hparams["batch_size"],
        "Epochs": hparams["epochs"],
        "Dropout": hparams["dropout_rate"],
        "BatchNorm": hparams["use_batchnorm"],
        "Augmentation": hparams["use_augmentation"],
        "Run ID": result["run_id"],
    })

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv("results/experiments_summary.csv", index=False)
print("Results saved to results/experiments_summary.csv")

## 9. Download Files

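Run this cell to download the trained model and results to your local machine. It relies on the `google.colab` package, so it only works when the notebook is running in Google Colab.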

In [None]:
from google.colab import files

# Trigger a browser download for each artifact written by the cells above
for filename, path in (
    ("best_model.pt", "models/best_model.pt"),
    ("best_model_info.json", "models/best_model_info.json"),
    ("experiments_summary.csv", "results/experiments_summary.csv"),
):
    print(f"Downloading {filename}...")
    files.download(path)

print("\nAll files downloaded!")
print("\nNext steps:")
for step in (
    "1. Copy best_model.pt to your local MLOps/models/ folder",
    "2. Copy best_model_info.json to your local MLOps/models/ folder",
    "3. Copy experiments_summary.csv to your local MLOps/results/ folder",
    "4. Git add, commit, and push to GitHub",
    "5. GitHub Actions will build and push Docker image",
):
    print(step)

## 10. View Results in DagsHub

Your experiment results are now available at:

**https://dagshub.com/YOUR_USERNAME/MLOps** → Experiments tab → MLflow UI

You can:
- Compare all 5 experiments
- View training curves (loss and accuracy over epochs)
- See all hyperparameters and metrics
- Share the link with your instructor

In [None]:
# Show where the logged runs can be browsed: the repository page and the MLflow endpoint
print(
    "\nView your results at:",
    f"https://dagshub.com/{DAGSHUB_USERNAME}/{DAGSHUB_REPO_NAME}",
    "\nMLflow UI:",
    MLFLOW_TRACKING_URI,
    sep="\n",
)