# Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/pytorch/ca_housing_regression.ipynb)

## Install dependencies

For torch, it is recommended to follow the installation instructions at https://pytorch.org/get-started/locally/  
If torch is already installed we will use that version; otherwise we will just install the CPU version for now.

In [None]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" scikit-learn shap==0.40.0
! pip install --quiet "torch>=1.2.0,<2.0.0"
! pip install -U "mlfoundry>=0.4.6,<0.5.0"

## Initialize MLFoundry Client

In [None]:
import mlfoundry as mlf

# Client handle; a later cell uses it to create the tracking run.
client = mlf.get_client()

---

## California Housing Price Prediction as a Regression problem

In [None]:
import os
import getpass
import urllib.parse
import random
import math
from timeit import default_timer as timer
import json

import shap
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

import mlfoundry as mlf

### Load the California Housing dataset

In [None]:
# Fetch the dataset with as_frame=True and list the fields of the returned object.
data = datasets.fetch_california_housing(as_frame=True)
print(data.keys())

In [None]:
# Print the description text stored on the dataset object.
print(data.DESCR)

In [None]:
# Preview the first rows of data.frame.
data.frame.head()

### Define the model and dataset utilities

In [None]:
class DNN(nn.Module):
    """Fully connected network: three ReLU hidden layers and a linear head.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of each of the three hidden layers.
        output_size: Number of values produced for each input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        """Run the network on ``x`` and drop a trailing singleton axis.

        For a ``(batch, input_size)`` input the result has shape ``(batch,)``
        when ``output_size`` is 1 and ``(batch, output_size)`` otherwise.
        """
        x = self.layers(x)
        # Squeeze only the last axis. A bare squeeze() would also remove the
        # batch axis for a batch of one row, producing a 0-d tensor that
        # cannot be concatenated with other batches or matched to targets.
        return x.squeeze(-1)

In [None]:
def set_random_seed(seed_value: int, cuda: bool = False):
    """Seed the Python, NumPy and torch random number generators.

    When ``cuda`` is true the CUDA generators are seeded as well, and cuDNN
    is switched to deterministic mode with benchmarking turned off.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if not cuda:
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def make_dataset(X: pd.DataFrame, y: pd.DataFrame):
    """Wrap the values of ``X`` and ``y`` in a ``TensorDataset`` of float32 tensors."""
    feature_tensor = torch.from_numpy(X.values.astype(np.float32))
    target_tensor = torch.from_numpy(y.values.astype(np.float32))
    return TensorDataset(feature_tensor, target_tensor)

    
def make_dataloader(dataset, batch_size, random_sampler=False):
    """Build a single-worker ``DataLoader`` over ``dataset``.

    Batches are drawn through a ``RandomSampler`` when ``random_sampler`` is
    true; otherwise no sampler is passed and ``shuffle`` stays off.
    """
    loader_options = dict(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        sampler=RandomSampler(dataset) if random_sampler else None,
        num_workers=1,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
        timeout=0,
        worker_init_fn=None,
    )
    return DataLoader(**loader_options)



def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Taken from huggingface/transformers

    Returns a ``LambdaLR`` whose multiplier rises linearly from 0 to 1 over
    ``num_warmup_steps`` steps, then falls linearly and is floored at 0 once
    ``num_training_steps`` is reached.
    """
    # Both denominators are clamped to at least 1 to avoid dividing by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step: int):
        if current_step >= num_warmup_steps:
            steps_left = float(num_training_steps - current_step)
            return max(0.0, steps_left / decay_span)
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_optimizer_and_scheduler(model, learning_rate, total_steps, warmup_ratio, weight_decay):
    """Create an AdamW optimizer and a linear warmup/decay scheduler for ``model``.

    Parameters whose name contains ``bias`` or ``LayerNorm.weight`` get a
    weight decay of 0.0; every other parameter uses ``weight_decay``. The
    warmup length is ``ceil(total_steps * warmup_ratio)`` steps.

    Returns:
        An ``(optimizer, scheduler)`` tuple.
    """
    decay_exempt_markers = ('bias', 'LayerNorm.weight')
    warmup_steps = math.ceil(total_steps * warmup_ratio)

    # Single pass over the parameters, routing each into one of two groups.
    decayed_params = []
    exempt_params = []
    for param_name, param in model.named_parameters():
        if any(marker in param_name for marker in decay_exempt_markers):
            exempt_params.append(param)
        else:
            decayed_params.append(param)

    optimizer = AdamW(
        [
            {'params': decayed_params, 'weight_decay': weight_decay},
            {'params': exempt_params, 'weight_decay': 0.0},
        ],
        lr=learning_rate,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )
    return optimizer, scheduler

### Define evaluation utilities

In [None]:
def get_eval_metrics(model, device, dataloader):
    """Compute regression metrics for ``model`` over every batch of ``dataloader``.

    The model is put in eval mode, each batch's inputs are moved to
    ``device``, and predictions and targets are collected as NumPy arrays.

    Returns:
        A dict with keys ``'mae'``, ``'mse'`` and ``'r2'``.
    """
    model.eval()
    y_true = []
    y_pred = []
    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for batch_input, batch_target in dataloader:
            predicted = model(batch_input.to(device))
            # atleast_1d guards against a model that returns a 0-d tensor for
            # a batch of one row; 0-d arrays cannot be concatenated.
            y_true.append(np.atleast_1d(batch_target.cpu().numpy()))
            y_pred.append(np.atleast_1d(predicted.detach().cpu().numpy()))
    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)
    return {
        'mae': mean_absolute_error(y_true=y_true, y_pred=y_pred),
        'mse': mean_squared_error(y_true=y_true, y_pred=y_pred),
        'r2': r2_score(y_true=y_true, y_pred=y_pred),
    }

       
def predict_on_dataframe(model, device, X_df, batch_size=64):
    """Run ``model`` over the rows of ``X_df`` in batches and return the predictions.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        device: Device each batch is moved to before the forward pass.
        X_df: Frame whose values are cast to float32 and fed to the model.
        batch_size: Number of rows per forward pass.

    Returns:
        A NumPy array of the per-batch predictions concatenated along axis 0,
        or an empty float32 array when ``X_df`` has no rows.
    """
    model.eval()
    arr = X_df.values.astype(np.float32)
    y_pred = []
    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for start in range(0, len(arr), batch_size):
            batch_input = torch.from_numpy(arr[start:start + batch_size]).to(device)
            predicted = model(batch_input)
            # A final batch of one row can come back as a 0-d tensor when the
            # model squeezes its output; promote it so concatenate works.
            y_pred.append(np.atleast_1d(predicted.detach().cpu().numpy()))
    if not y_pred:
        # np.concatenate raises on an empty list; return an empty result instead.
        return np.empty((0,), dtype=np.float32)
    return np.concatenate(y_pred, axis=0)

### Finally, the training loop

In [None]:
def train(
    run,
    X_train,
    y_train,
    X_val,
    y_val,
    device,
    hidden_size=100,
    learning_rate=0.01,
    batch_size=64, 
    epochs=1,
    warmup_ratio=0.1,
    weight_decay=0.001,
    seed=2022
):
    """Train a ``DNN`` regressor and log parameters and metrics to ``run``.

    Args:
        run: Tracking run; ``log_params`` and ``log_metrics`` are called on it.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets.
        device: Device the model and every batch are moved to.
        hidden_size: Width of the model's hidden layers.
        learning_rate: Peak learning rate handed to the optimizer.
        batch_size: Batch size for both dataloaders.
        epochs: Number of passes over the training data.
        warmup_ratio: Fraction of total steps used for learning-rate warmup.
        weight_decay: Weight decay for the parameters that are not exempt.
        seed: Only recorded in the logged params; this function does not seed
            any random number generator itself.

    Returns:
        A ``(model, optimizer, scheduler)`` tuple.
    """
    train_dataset = make_dataset(X=X_train, y=y_train)
    val_dataset = make_dataset(X=X_val, y=y_val)
    # Training batches are drawn in random order; validation keeps dataset order.
    train_dataloader = make_dataloader(train_dataset, batch_size=batch_size, random_sampler=True)
    val_dataloader = make_dataloader(val_dataset, batch_size=batch_size, random_sampler=False)
    input_size = X_train.shape[1]
    output_size = 1
    # Move the model to its device before the optimizer is built over its
    # parameters, as the torch.optim documentation recommends.
    model = DNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size).to(device)
    criterion = torch.nn.MSELoss()
    total_steps = epochs * len(train_dataloader)
    optimizer, scheduler = get_optimizer_and_scheduler(
        model=model,
        learning_rate=learning_rate,
        total_steps=total_steps,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay
    )

    run.log_params({
        'input_size': input_size,
        'hidden_size': hidden_size,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'total_steps': total_steps,
        'epochs': epochs,
        'warmup_ratio': warmup_ratio,
        'weight_decay': weight_decay,
        'seed': seed,
        'num_train_samples': len(train_dataset),
        'num_validation_samples': len(val_dataset),
    })

    global_step = 0
    # Guard the per-epoch average against an empty training loader.
    batches_per_epoch = max(1, len(train_dataloader))

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_start_time = timer()
        epoch_loss = 0.0
        for batch_input, batch_target in train_dataloader:
            batch_input = batch_input.to(device)
            batch_target = batch_target.to(device)
            batch_predicted = model(batch_input)
            loss = criterion(batch_predicted, batch_target)
            loss.backward()
            # Read the scalar once; it feeds both the epoch average and step logging.
            loss_value = loss.item()
            epoch_loss += loss_value
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            # Step-level metrics are reported every 20 optimizer steps.
            if global_step % 20 == 0:
                step_metrics = {
                    'step/lr': scheduler.get_last_lr()[0],
                    'step/train_loss': loss_value,
                }
                print(f'step={global_step}', step_metrics)
                run.log_metrics(step_metrics, step=global_step)

        epoch_loss = epoch_loss / batches_per_epoch
        epoch_time = timer() - epoch_start_time

        model.eval()
        epoch_metrics = {
            'epoch/epoch': epoch,
            'epoch/lr': scheduler.get_last_lr()[0],
            'epoch/train_loss': epoch_loss,
            'epoch/time': epoch_time
        }
        # Evaluate on both splits and namespace the metrics by split.
        train_metrics = get_eval_metrics(model=model, device=device, dataloader=train_dataloader)
        for k, v in train_metrics.items():
            epoch_metrics[f'epoch/train_{k}'] = v
        val_metrics = get_eval_metrics(model=model, device=device, dataloader=val_dataloader)
        for k, v in val_metrics.items():
            epoch_metrics[f'epoch/val_{k}'] = v
        print(f'epoch={epoch}', epoch_metrics)
        run.log_metrics(epoch_metrics, step=global_step)

    return model, optimizer, scheduler

### Start an MLFoundry Run

In [None]:
# Create the run under the 'pytorch-ca-housing-example' project; later cells log to it.
run = client.create_run(project_name='pytorch-ca-housing-example')

### Set tags for our run

In [None]:
# CUDA availability is recorded as a tag and later selects the training device.
use_gpu = torch.cuda.is_available()
run.set_tags({'framework': 'pytorch', 'use_gpu': use_gpu, 'task': 'regression'})

### Split Dataset into Training and Validation

In [None]:
# Features as a DataFrame with named columns; the target is taken from the dataset object.
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [None]:
SEED = 2022
# Seed the Python, NumPy and torch RNGs (and CUDA when use_gpu is set) before splitting.
set_random_seed(SEED, cuda=use_gpu)

# 80% of the rows go to training; the rest form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=SEED)
# Remember the column order; it is written out later as the features JSON file.
feature_columns = X_train.columns.tolist()
X_train = X_train[feature_columns]
X_val = X_val[feature_columns]

print('Feature columns:', feature_columns)
print('Train samples:', len(X_train))
print('Validation samples:', len(X_val))

### Train the Model

In [None]:
EPOCHS = 5
# Train on the GPU when CUDA was detected earlier, otherwise on the CPU.
device = torch.device('cuda') if use_gpu else torch.device('cpu')
# SEED is passed so it is recorded with the run's parameters.
model, optimizer, scheduler = train(
    run=run,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    device=device,
    hidden_size=128,
    learning_rate=0.005,
    batch_size=128, 
    epochs=EPOCHS,
    warmup_ratio=0.1,
    weight_decay=0.001,
    seed=SEED,
)

### Save and Log model, optimizer and feature column names

In [None]:
# Common filename prefix for everything saved from this training run.
fname = f'epochs-{EPOCHS}'
# Write through a context manager so the file is flushed and closed before
# it is logged as an artifact; a bare open() leaves the handle dangling.
with open(f'{fname}.features.json', 'w') as features_file:
    json.dump(feature_columns, features_file)
# The model and optimizer objects are serialized whole, not as state dicts.
torch.save(model, f'{fname}.model.pth')
torch.save(optimizer, f'{fname}.optim.pth')

In [None]:
# Log the trained model to the run, then attach the optimizer and
# feature-column files written by the previous cell.
run.log_model(
    name="california-housing-regressor",
    model=model, 
    framework=mlf.ModelFramework.PYTORCH,
    description=f"pytorch model trained for {EPOCHS} epochs" 
)
run.log_artifact(f'{fname}.optim.pth')
run.log_artifact(f'{fname}.features.json')

### Compute predictions and log train dataset stats

In [None]:
# Copy the training features and add the true targets and model predictions as columns.
train_df = X_train.copy()
train_df['targets'] = y_train.values
train_df['predictions'] = predict_on_dataframe(model, device, X_train)

In [None]:
# Preview the first rows of the training frame with targets and predictions.
train_df.head()

### Log the train dataset

In [None]:
# Log the training features with their predictions and actual targets under the name 'train'.
run.log_dataset(
    dataset_name='train',
    features=X_train,
    predictions=train_df['predictions'],
    actuals=train_df['targets'],
)

### Compute predictions on the validation dataset

In [None]:
# Copy the validation features and add the true targets and model predictions as columns.
val_df = X_val.copy()
val_df['targets'] = y_val.values
val_df['predictions'] = predict_on_dataframe(model, device, X_val)

In [None]:
# Preview the first rows of the validation frame with targets and predictions.
val_df.head()

### Log the validation dataset

In [None]:
# Log the validation features with their predictions and actual targets under the name 'validation'.
run.log_dataset(
    dataset_name='validation',
    features=X_val,
    predictions=val_df['predictions'],
    actuals=val_df['targets'],
)

In [None]:
# Close the run now that all logging is done.
run.end()