# Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/pytorch/ca_housing_regression.ipynb)

## Install dependencies

For torch, it is recommended to follow the instructions at https://pytorch.org/get-started/locally/  
We will use the torch version that is already installed if there is one; otherwise we will just install the CPU version for now.

In [1]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" scikit-learn shap==0.40.0
! pip install --quiet "torch>=1.2.0,<2.0.0"
! pip install -U mlfoundry

You should consider upgrading via the '/Users/chiragjn/Library/Caches/pypoetry/virtualenvs/mlfoundry-jYktQAfc-py3.9/bin/python -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/Users/chiragjn/Library/Caches/pypoetry/virtualenvs/mlfoundry-jYktQAfc-py3.9/bin/python -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/Users/chiragjn/Library/Caches/pypoetry/virtualenvs/mlfoundry-jYktQAfc-py3.9/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

## Initialize MLFoundry Client

In [2]:
import os
import getpass
import urllib.parse
import mlfoundry as mlf

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Base URL of the TrueFoundry app; can be overridden with the TFY_URL environment variable.
TFY_URL = os.environ.get('TFY_URL', 'https://app.truefoundry.com/')
# Prefer an API key from the environment; only prompt interactively when none is set.
TFY_API_KEY = os.environ.get('TFY_API_KEY')
if not TFY_API_KEY:
    print(f'Paste your TrueFoundry API key\nYou can find it over at {urllib.parse.urljoin(TFY_URL, "settings")}')
    # getpass keeps the typed key out of the notebook output.
    TFY_API_KEY = getpass.getpass()

In [4]:
# Create the MLFoundry client, authenticating with the API key read or prompted for earlier.
client = mlf.get_client(api_key=TFY_API_KEY)

---

## California Housing Price Prediction as a Regression problem

In [5]:
import random
import math
from timeit import default_timer as timer
import json

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

import mlfoundry as mlf

### Load the California Housing dataset

In [6]:
# Fetch the California Housing dataset; as_frame=True asks scikit-learn for pandas objects
# (the returned bunch then also carries a combined `frame`, see the keys printed below).
data = datasets.fetch_california_housing(as_frame=True)
print(data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])


In [7]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [8]:
data.frame.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


### Define the model and dataset utilities

In [9]:
class DNN(nn.Module):
    """Fully connected network with three hidden ReLU layers of equal width.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of each of the three hidden layers.
        output_size: number of values predicted per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        """Apply the layers to ``x`` and drop a trailing size-1 output dimension.

        Only the last dimension is squeezed. A bare ``squeeze()`` would also
        remove the batch dimension for a batch of one row, returning a 0-d
        tensor that no longer lines up with a ``(1,)`` target and cannot be
        concatenated with the other batches.
        """
        x = self.layers(x)
        return x.squeeze(-1)

In [10]:
def set_random_seed(seed_value: int, cuda: bool = False):
    """Seed the NumPy, torch and Python ``random`` generators with ``seed_value``.

    When ``cuda`` is true the CUDA generators are seeded as well, and cuDNN is
    switched to deterministic mode with benchmarking turned off.
    """
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not cuda:
        return

    for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        cuda_seed_fn(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def make_dataset(X: pd.DataFrame, y: pd.DataFrame):
    """Wrap features ``X`` and targets ``y`` in a ``TensorDataset`` of float32 tensors.

    Both inputs are read through ``.values`` and cast to ``np.float32`` before
    being turned into tensors.
    """
    features, targets = (
        torch.from_numpy(frame.values.astype(np.float32)) for frame in (X, y)
    )
    return TensorDataset(features, targets)

    
def make_dataloader(dataset, batch_size, random_sampler=False):
    """Build a single-worker ``DataLoader`` over ``dataset``.

    Batches are drawn through a ``RandomSampler`` when ``random_sampler`` is
    true; otherwise no sampler is passed and ``shuffle`` stays off. The final
    partial batch is kept (``drop_last=False``).
    """
    loader_options = dict(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        sampler=RandomSampler(dataset) if random_sampler else None,
        num_workers=1,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
        timeout=0,
        worker_init_fn=None,
    )
    return DataLoader(**loader_options)



def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Taken from huggingface/transformers

    Returns a ``LambdaLR`` whose multiplier rises linearly from 0 to 1 over
    ``num_warmup_steps`` and then falls linearly, reaching 0 at
    ``num_training_steps`` (never going below 0).
    """
    # Both denominators are clamped to at least 1 so neither phase divides by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step: int):
        if current_step >= num_warmup_steps:
            steps_left = float(num_training_steps - current_step)
            return max(0.0, steps_left / decay_span)
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda, last_epoch)


def get_optimizer_and_scheduler(model, learning_rate, total_steps, warmup_ratio, weight_decay):
    """Create an ``AdamW`` optimizer and a linear warmup/decay scheduler for ``model``.

    Parameters whose name contains ``'bias'`` or ``'LayerNorm.weight'`` go into
    a group with weight decay 0.0; all others use ``weight_decay``. The number
    of warmup steps is ``ceil(total_steps * warmup_ratio)``.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    no_decay = ('bias', 'LayerNorm.weight')

    decayed_params, exempt_params = [], []
    for param_name, param in model.named_parameters():
        is_exempt = any(marker in param_name for marker in no_decay)
        (exempt_params if is_exempt else decayed_params).append(param)

    optimizer = AdamW(
        [
            {'params': decayed_params, 'weight_decay': weight_decay},
            {'params': exempt_params, 'weight_decay': 0.0},
        ],
        lr=learning_rate,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=math.ceil(total_steps * warmup_ratio),
        num_training_steps=total_steps,
    )
    return optimizer, scheduler

### Define evaluation utilities

In [11]:
def get_eval_metrics(model, device, dataloader):
    """Compute regression metrics for ``model`` over every batch of ``dataloader``.

    The model is put in eval mode and run without gradient tracking, since
    the predictions are only used to compute metrics.

    Args:
        model: torch module called as ``model(batch_input)``.
        device: device the input batches are moved to before the forward pass.
        dataloader: iterable yielding ``(batch_input, batch_target)`` tensor pairs.

    Returns:
        Dict with keys ``'mae'``, ``'mse'`` and ``'r2'``.
    """
    model.eval()
    y_true = []
    y_pred = []
    # No autograd graph is needed for evaluation; skipping it saves memory and time.
    with torch.no_grad():
        for batch_input, batch_target in dataloader:
            predicted = model(batch_input.to(device))
            # atleast_1d guards against a 0-d result (e.g. a model that squeezes a
            # one-row batch), which np.concatenate would otherwise reject.
            y_true.append(np.atleast_1d(batch_target.cpu().numpy()))
            y_pred.append(np.atleast_1d(predicted.detach().cpu().numpy()))
    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)
    return {
        'mae': mean_absolute_error(y_true=y_true, y_pred=y_pred),
        'mse': mean_squared_error(y_true=y_true, y_pred=y_pred),
        'r2': r2_score(y_true=y_true, y_pred=y_pred),
    }

       
def predict_on_dataframe(model, device, X_df, batch_size=64):
    """Run ``model`` over the rows of ``X_df`` in batches and return the predictions.

    Args:
        model: torch module called as ``model(batch_input)``.
        device: device each batch is moved to before the forward pass.
        X_df: pandas DataFrame of features; its values are cast to float32.
        batch_size: number of rows per forward pass.

    Returns:
        NumPy array with the per-batch predictions concatenated along axis 0,
        in row order. An empty float32 array is returned when ``X_df`` has no rows.
    """
    model.eval()
    arr = X_df.values.astype(np.float32)
    y_pred = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, len(arr), batch_size):
            batch_input = torch.from_numpy(arr[start:start + batch_size]).to(device)
            predicted = model(batch_input)
            # atleast_1d guards against a 0-d result (e.g. a model that squeezes a
            # one-row batch), which np.concatenate would otherwise reject.
            y_pred.append(np.atleast_1d(predicted.detach().cpu().numpy()))
    if not y_pred:
        # np.concatenate raises on an empty list; an empty frame yields no predictions.
        return np.empty((0,), dtype=np.float32)
    return np.concatenate(y_pred, axis=0)

### Finally, the training loop

In [12]:
def train(
    run,
    X_train,
    y_train,
    X_val,
    y_val,
    device,
    hidden_size=100,
    learning_rate=0.01,
    batch_size=64,
    epochs=1,
    warmup_ratio=0.1,
    weight_decay=0.001,
    seed=2022
):
    """Train a ``DNN`` regressor with MSE loss and log params and metrics to ``run``.

    Args:
        run: MLFoundry run; receives ``log_params`` once, and ``log_metrics``
            every 20 optimizer steps and after every epoch.
        X_train, y_train: training features and targets (converted by ``make_dataset``).
        X_val, y_val: validation features and targets (converted by ``make_dataset``).
        device: torch device the model and batches are moved to.
        hidden_size: width of the hidden layers of the ``DNN``.
        learning_rate: base learning rate handed to the optimizer; the scheduler
            scales it up during warmup and down afterwards.
        batch_size: batch size used for both the train and validation dataloaders.
        epochs: number of passes over the training data.
        warmup_ratio: fraction of the total steps used for learning-rate warmup.
        weight_decay: weight decay applied to the non-exempt parameter group.
        seed: only recorded with the run parameters; no RNG is seeded here, so
            callers are expected to seed beforehand.

    Returns:
        Tuple ``(model, optimizer, scheduler)`` after the last epoch.
    """
    train_dataset = make_dataset(X=X_train, y=y_train)
    val_dataset = make_dataset(X=X_val, y=y_val)
    # Wrap the datasets in dataloaders; only the training one uses a random sampler
    train_dataloader = make_dataloader(train_dataset, batch_size=batch_size, random_sampler=True)
    val_dataloader = make_dataloader(val_dataset, batch_size=batch_size, random_sampler=False)
    input_size = X_train.shape[1]
    output_size = 1
    model = DNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
    criterion = torch.nn.MSELoss()
    total_steps = epochs * len(train_dataloader)
    optimizer, scheduler = get_optimizer_and_scheduler(
        model=model,
        learning_rate=learning_rate,
        total_steps=total_steps,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay
    )

    run.log_params({
        'input_size': input_size,
        'hidden_size': hidden_size,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'total_steps': total_steps,
        'epochs': epochs,
        'warmup_ratio': warmup_ratio,
        # weight_decay affects training like the other hyperparameters, so record it too
        'weight_decay': weight_decay,
        'seed': seed,
        'num_train_samples': len(train_dataset),
        'num_validation_samples': len(val_dataset),
    })

    model = model.to(device)
    global_step = 0

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_start_time = timer()
        # Accumulate as a plain Python float: the per-step loss is already pulled
        # to the host with .item(), so a device tensor accumulator is not needed.
        epoch_loss = 0.0
        for _step, (batch_input, batch_target) in enumerate(train_dataloader):
            batch_input = batch_input.to(device)
            batch_target = batch_target.to(device)
            batch_predicted = model(batch_input)
            loss = criterion(batch_predicted, batch_target)
            loss.backward()
            # Read the scalar loss once and reuse it for the epoch sum and step metrics
            loss_value = loss.item()
            epoch_loss += loss_value
            # Clip the global gradient norm to 1.0 before the optimizer update
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            step_metrics = {
                # Read after scheduler.step(), so this is the rate for the next update
                'step/lr': scheduler.get_last_lr()[0],
                'step/train_loss': loss_value,
            }
            if global_step % 20 == 0:
                print(f'step={global_step}', step_metrics)
                run.log_metrics(step_metrics, step=global_step)

        # Mean of the per-batch losses seen during this epoch
        epoch_loss = epoch_loss / len(train_dataloader)
        epoch_time = timer() - epoch_start_time

        model.eval()
        epoch_metrics = {
            'epoch/epoch': epoch,
            'epoch/lr': scheduler.get_last_lr()[0],
            'epoch/train_loss': epoch_loss,
            'epoch/time': epoch_time
        }
        train_metrics = get_eval_metrics(model=model, device=device, dataloader=train_dataloader)
        for k, v in train_metrics.items():
            epoch_metrics[f'epoch/train_{k}'] = v
        val_metrics = get_eval_metrics(model=model, device=device, dataloader=val_dataloader)
        for k, v in val_metrics.items():
            epoch_metrics[f'epoch/val_{k}'] = v
        print(f'epoch={epoch}', epoch_metrics)
        run.log_metrics(epoch_metrics, step=global_step)

    return model, optimizer, scheduler

### Start an MLFoundry Run

In [13]:
# Create a run under the given project; later cells log params, metrics and artifacts through it.
run = client.create_run(project_name='pytorch-ca-housing-example')
print('RUN ID:', run.run_id)
print(f'You can track your runs live at {urllib.parse.urljoin(TFY_URL, "mlfoundry")}')

[mlfoundry] 2022-05-16T16:55:22+0530 INFO project pytorch-ca-housing-example does not exist. Creating pytorch-ca-housing-example.
[mlfoundry] 2022-05-16T16:55:32+0530 INFO Run is created with id d9e1b1e572134b088bc7aab94147f06d and name feel-american-case
RUN ID: d9e1b1e572134b088bc7aab94147f06d
You can track your runs live at https://app.truefoundry.com/mlfoundry


### Set tags for our run

In [14]:
# Whether a CUDA device is available; also used later for seeding and to pick the training device.
use_gpu = torch.cuda.is_available()
run.set_tags({'framework': 'pytorch', 'use_gpu': use_gpu, 'task': 'regression'})

### Split Dataset into Training and Validation

In [15]:
# Features as a DataFrame with the dataset's feature names as columns; y is the regression target.
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [16]:
SEED = 2022
# Seed the Python, NumPy and torch RNGs (and CUDA when a GPU is used) before splitting and training.
set_random_seed(SEED, cuda=use_gpu)

# Keep 80% of the rows for training; the remainder becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=SEED)
# Capture the feature column order once; the same list is saved to disk and used in the logged schema later.
feature_columns = X_train.columns.tolist()
X_train = X_train[feature_columns]
X_val = X_val[feature_columns]

print('Feature columns:', feature_columns)
print('Train samples:', len(X_train))
print('Validation samples:', len(X_val))

Feature columns: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Train samples: 16512
Validation samples: 4128


### Train the Model

In [17]:
EPOCHS = 5
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if use_gpu else torch.device('cpu')
# train() returns the fitted model together with the optimizer and LR scheduler it used.
model, optimizer, scheduler = train(
    run=run,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    device=device,
    hidden_size=128,
    learning_rate=0.005,
    batch_size=128, 
    epochs=EPOCHS,
    warmup_ratio=0.1,
    weight_decay=0.001,
    seed=SEED,
)

[mlfoundry] 2022-05-16T16:55:34+0530 INFO Parameters logged successfully
step=20 {'step/lr': 0.0015384615384615387, 'step/train_loss': 25.87055778503418}
[mlfoundry] 2022-05-16T16:55:43+0530 INFO Metrics logged successfully
step=40 {'step/lr': 0.0030769230769230774, 'step/train_loss': 175.3499298095703}
[mlfoundry] 2022-05-16T16:55:50+0530 INFO Metrics logged successfully
step=60 {'step/lr': 0.004615384615384616, 'step/train_loss': 1.731807827949524}
[mlfoundry] 2022-05-16T16:55:53+0530 INFO Metrics logged successfully
step=80 {'step/lr': 0.004870689655172414, 'step/train_loss': 5.2563982009887695}
[mlfoundry] 2022-05-16T16:56:00+0530 INFO Metrics logged successfully
step=100 {'step/lr': 0.004698275862068966, 'step/train_loss': 3.978200674057007}
[mlfoundry] 2022-05-16T16:56:09+0530 INFO Metrics logged successfully
step=120 {'step/lr': 0.004525862068965518, 'step/train_loss': 2.086646556854248}
[mlfoundry] 2022-05-16T16:56:15+0530 INFO Metrics logged successfully
epoch=1 {'epoch/epoch'

### Save and Log model, optimizer and feature column names

In [18]:
fname = f'epochs-{EPOCHS}'
# Write through a context manager so the JSON file is flushed and closed
# before it is logged as an artifact in the next cell.
with open(f'{fname}.features.json', 'w') as features_file:
    json.dump(feature_columns, features_file)
# NOTE: these calls pickle the whole model / optimizer objects (not just their state_dict).
torch.save(model, f'{fname}.model.pth')
torch.save(optimizer, f'{fname}.optim.pth')

In [19]:
# Log the trained model to the run, then attach the saved optimizer and feature-name files as artifacts.
run.log_model(model, framework=mlf.ModelFramework.PYTORCH)
run.log_artifact(f'{fname}.optim.pth')
run.log_artifact(f'{fname}.features.json')

[mlfoundry] 2022-05-16T16:59:54+0530 INFO Model logged successfully


### Compute predictions on the train dataset

In [20]:
# Work on a copy so X_train itself keeps only the feature columns.
train_df = X_train.copy()
# .values drops the index so targets are assigned by position.
train_df['targets'] = y_train.values
train_df['predictions'] = predict_on_dataframe(model, device, X_train)

In [21]:
train_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,targets,predictions
16289,1.7694,28.0,4.594444,1.063889,3201.0,4.445833,37.92,-121.3,0.739,1.191414
3084,2.0694,18.0,6.889855,1.557971,1366.0,1.97971,35.72,-118.47,0.812,1.270548
53,1.2475,52.0,4.075,1.14,1162.0,2.905,37.82,-122.27,1.042,1.296511
15259,3.9826,25.0,5.745981,1.0,1108.0,3.562701,33.03,-117.27,2.158,1.985452
20007,1.7106,24.0,5.718987,1.032911,1169.0,2.959494,36.13,-119.03,0.955,1.024684


### Log the train dataset and stats

In [22]:
# Log the train features together with the model's predictions and the true targets.
run.log_dataset(
    dataset_name='train',
    features=X_train,
    predictions=train_df['predictions'],
    actuals=train_df['targets'],
)

In [23]:
# The schema names which columns of train_df hold the features, the predictions and the actual targets.
run.log_dataset_stats(
    train_df, 
    data_slice='train',
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name='predictions',
        actual_column_name='targets'
    ),
    model_type='regression',
)

WARN: Missing config
[mlfoundry] 2022-05-16T17:00:58+0530 INFO Metrics logged successfully
[mlfoundry] 2022-05-16T17:01:04+0530 INFO Dataset stats have been successfully computed and logged


### Compute predictions on the validation dataset

In [24]:
# Work on a copy so X_val itself keeps only the feature columns.
val_df = X_val.copy()
# .values drops the index so targets are assigned by position.
val_df['targets'] = y_val.values
val_df['predictions'] = predict_on_dataframe(model, device, X_val)

In [25]:
val_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,targets,predictions
11020,10.157,8.0,9.180451,1.107769,2569.0,3.219298,33.83,-117.81,4.771,4.586352
5919,2.5147,38.0,4.0,1.055363,1359.0,4.702422,34.28,-118.44,1.371,1.287483
6050,3.4609,27.0,4.80504,1.072944,3712.0,4.923077,34.03,-117.74,1.233,1.508507
9627,3.3929,31.0,6.951705,1.321023,1313.0,3.730114,37.21,-120.24,0.938,1.744403
6554,5.7051,35.0,5.727273,0.997326,986.0,2.636364,34.17,-118.07,3.26,3.072451


### Log the validation dataset and stats

In [29]:
# Log the validation features together with the model's predictions and the true targets.
run.log_dataset(
    dataset_name='validation',
    features=X_val,
    predictions=val_df['predictions'],
    actuals=val_df['targets'],
)

In [27]:
# The schema names which columns of val_df hold the features, the predictions and the actual targets.
# Note that the slice is called 'validate' here, while the dataset logged above is named 'validation'.
run.log_dataset_stats(
    val_df, 
    data_slice='validate',  # or mlf.DataSlice.VALIDATE
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name='predictions',
        actual_column_name='targets'
    ),
    model_type='regression',
)

[mlfoundry] 2022-05-16T17:01:40+0530 INFO Metrics logged successfully
[mlfoundry] 2022-05-16T17:01:45+0530 INFO Dataset stats have been successfully computed and logged


In [28]:
# End the run; per the log output below, this shuts down background jobs and syncs the run's data.
run.end()

[mlfoundry] 2022-05-16T17:01:46+0530 INFO Shutting down background jobs and syncing data for run with id 'd9e1b1e572134b088bc7aab94147f06d', please don't kill this process...
