# Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/pytorch/sample_regression.ipynb)

# Sample regression with PyTorch

In this notebook we fit a simple PyTorch neural network to randomly generated data for a regression problem. We use MLFoundry (by TrueFoundry) to track the experiment run and log important hyperparameters and metrics, which can later be viewed at https://app.truefoundry.com/mlfoundry

# Install dependencies

For torch, it is recommended to follow the instructions at https://pytorch.org/get-started/locally/  
If torch is already installed we will use that version; otherwise we will install the CPU version for now.

In [None]:
! pip install 'numpy>=1.0.0,<2.0.0' 'pandas>=1.0.0,<2.0.0' 'matplotlib>=3.5.2,<3.6.0' scikit-learn shap==0.40.0
! pip install 'torch>=1.2.0,<2.0.0'
! pip install -U mlfoundry

# Initialize MLFoundry Client

In [None]:
import os
import getpass
import urllib.parse
import mlfoundry as mlf

In [None]:
# Prefer an API key supplied through the environment; when it is missing or
# empty, ask for it interactively without echoing the input.
MLF_API_KEY = os.getenv('MLF_API_KEY')
if MLF_API_KEY is None or MLF_API_KEY == '':
    print("Please get your API key from https://app.truefoundry.com/settings")
    MLF_API_KEY = getpass.getpass("Paste your API key and hit enter:")

In [None]:
# Create the MLFoundry client, passing the API key read in the previous cell.
client = mlf.get_client(api_key=MLF_API_KEY)

---

In [None]:
import random

import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
def set_random_seed(seed_value: int, cuda: bool = False):
    """Seed the NumPy, PyTorch and Python random number generators.

    Args:
        seed_value: Seed applied to every generator.
        cuda: When true, the CUDA generators are seeded as well and cuDNN is
            put in deterministic mode with benchmarking turned off.
    """
    # Same seeding order as always: NumPy, then PyTorch, then Python's random.
    for seeder in (np.random.seed, torch.manual_seed, random.seed):
        seeder(seed_value)

    if not cuda:
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

## Data preprocessing

For this example, we will generate dummy data.
Our input feature is a single float and
our target is the square of the input. We add some noise to the training targets to simulate the noise observed when collecting real-world data.

In short, our regression model should learn the square function $y = x^2$ over the input domain [-1, 1].

In [None]:
set_random_seed(2022)

# Training data: 100 evenly spaced inputs in [-1, 1] with noisy squared targets.
X = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)  # shape=(100, 1)
y = X.pow(2) + 0.2 * torch.rand(X.size())               # noisy y, shape=(100, 1)

# Plain tensors are used directly for training; wrapping them in the
# deprecated torch.autograd.Variable is a no-op on torch>=0.4.

# Test data: random inputs with noise-free targets. The targets must be
# computed from X_test (not from the training inputs X) so that each test
# input is paired with its own true value.
# NOTE: torch.rand samples from [0, 1), i.e. only the non-negative half of
# the training domain.
X_test = torch.rand(100, 1)   # shape=(100, 1)
y_test = X_test.pow(2)        # shape=(100, 1)

# visualize data
plt.figure(figsize=(10, 4))
plt.scatter(X.numpy(), y.numpy(), color='orange')
plt.title('Regression Analysis')
plt.xlabel('Independent variable')
plt.ylabel('Dependent variable')
plt.show()

## PyTorch Model

We will use a simple neural network with one hidden layer and ReLU as the activation function.
We will use mean squared error (MSE) loss, as we are dealing with a regression problem.

In [None]:
class Net(torch.nn.Module):
    """Fully connected network with a single ReLU-activated hidden layer."""

    def __init__(self, n_feature, n_hidden, n_output):
        """Build the two linear layers.

        Args:
            n_feature: Size of each input sample.
            n_hidden: Number of units in the hidden layer.
            n_output: Size of each output sample.
        """
        super().__init__()
        # input -> hidden projection
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        # hidden -> output projection
        self.output = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the linear output layer."""
        activated = F.relu(self.hidden(x))
        return self.output(activated)


# Mean squared error is the training criterion for this regression problem.
loss_func = torch.nn.MSELoss()

## Training loop

Here we pass in an MLFoundry run instance and log our hyperparameters and metrics.

In [None]:
def train(epoch, lr, hidden, run, seed=2022):
    """Train a ``Net`` on the notebook-level ``X``/``y`` data with full-batch SGD.

    After every optimisation step the model is evaluated on the
    notebook-level ``X_test``/``y_test`` data, the resulting metrics are
    logged to ``run`` and the current fit is redrawn on a matplotlib figure.

    Args:
        epoch: Number of optimisation steps (one full-batch step per epoch).
        lr: Learning rate passed to ``torch.optim.SGD``.
        hidden: Number of units in the hidden layer of ``Net``.
        run: MLFoundry run used for ``log_params`` and ``log_metrics``.
        seed: Seed passed to ``set_random_seed`` before the model is built.

    Returns:
        The trained ``Net`` instance.
    """
    set_random_seed(seed)

    # Log important hyperparameters
    run.log_params({
        'epoch': epoch,
        'lr': lr,
        'hidden': hidden,
        'seed': seed,
    })

    net = Net(n_feature=1, n_hidden=hidden, n_output=1)     # define the network
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    fig, ax = plt.subplots(figsize=(12, 7))

    # The test targets do not change between epochs, so flatten them once.
    y_true = y_test.detach().numpy()
    y_true = np.reshape(y_true, y_true.shape[0])

    for t in range(epoch):
        net.train()
        prediction = net(X)     # input x and predict based on x
        loss = loss_func(prediction, y)
        optimizer.zero_grad()   # clear gradients for next train
        loss.backward()         # backpropagation, compute gradients
        optimizer.step()        # apply gradients

        # Get predictions on test data; no gradients are needed for evaluation
        net.eval()
        with torch.no_grad():
            y_pred = net(X_test).numpy()
        y_pred = np.reshape(y_pred, y_pred.shape[0])

        # Compute Metrics
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)

        # Log them with the run. Passing the epoch as the step keeps the
        # per-epoch values distinct instead of all landing on the default step.
        run.log_metrics(
            {'mean absolute error': mae, 'mean squared error': mse},
            step=t,
        )

        # plot and show learning process; clear the axes owned by this figure
        # rather than whichever axes pyplot currently considers active
        ax.cla()
        ax.set_title('Regression Analysis', fontsize=35)
        ax.set_xlabel('Independent variable', fontsize=24)
        ax.set_ylabel('Dependent variable', fontsize=24)
        ax.set_xlim(-1.05, 1.5)     # extra room on the right for the text labels
        ax.set_ylim(-0.25, 1.25)
        ax.scatter(X.detach().numpy(), y.detach().numpy(), color='orange')
        ax.plot(X.detach().numpy(), prediction.detach().numpy(), 'g-', lw=3)
        ax.text(1.0, 0.1, 'Step = %d' % t, fontdict={'size': 24, 'color': 'red'})
        ax.text(1.0, 0, 'Loss = %.4f' % loss.item(), fontdict={'size': 24, 'color': 'red'})

        fig.canvas.draw()
    return net

## Creating a run

In [None]:
# Create a new run inside the 'pytorch-sample-regression' project.
run = client.create_run(project_name='pytorch-sample-regression')
print('RUN ID:', run.run_id)

# Base URL of the TrueFoundry dashboard. It was previously referenced without
# being defined anywhere in the notebook; default to the public deployment and
# allow an override through the environment.
TFY_URL = os.environ.get('TFY_URL', 'https://app.truefoundry.com/')
print(f'You can track your runs live at {urllib.parse.urljoin(TFY_URL, "mlfoundry")}')

# Training the model

In [None]:
# Train for 100 epochs with learning rate 0.2 and 15 hidden units, logging to `run`.
net = train(epoch=100, lr=0.2, hidden=15, run=run, seed=2022)

In [None]:
# Put the trained model in evaluation mode, then flatten the test targets and
# the model's test predictions into 1-D NumPy arrays.
net.eval()
y_true = y_test.detach().numpy()
y_true = y_true.reshape(y_true.shape[0])
y_pred = net(X_test).detach().numpy()
y_pred = y_pred.reshape(y_pred.shape[0])

## Log Test Dataset Stats

In [None]:
# Collect the single feature column together with targets and predictions.
X_test_df = pd.DataFrame(X_test.detach().numpy(), columns=['0'])
X_test_df = X_test_df.assign(targets=y_true, predictions=y_pred)

# shap value computation: the training inputs are passed to the explainer
# alongside the model, and the test inputs are the samples being explained
explainer = shap.DeepExplainer(net, X)
shap_values = explainer.shap_values(X_test)

# Tell MLFoundry which columns hold features, predictions and actual values.
schema = mlf.Schema(
    feature_column_names=['0'],
    prediction_column_name='predictions',
    actual_column_name='targets'
)

run.log_dataset_stats(
    X_test_df,
    data_slice='test',
    data_schema=schema,
    shap_values=shap_values,
    model_type='regression',
)

In [None]:
# End the MLFoundry run now that all parameters, metrics and stats are logged.
run.end()