## Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/sklearn/ca_housing_regression.ipynb)

## Install dependencies

In [None]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" "matplotlib>=3.5.2,<3.6.0" scikit-learn shap==0.40.0
! pip install -U "mlfoundry>=0.4.2,<0.5.0"

## Initialize MLFoundry Client

In [None]:
import mlfoundry as mlf

# MLFoundry client handle; it is used further down to create the tracking run.
client = mlf.get_client()

---

## California Housing Price Prediction as a Regression problem

In [None]:
import os
import getpass
import urllib.parse

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import mlfoundry as mlf

### Load the California Housing dataset

In [None]:
# as_frame=True asks the loader for pandas objects, which the
# `data.frame.head()` preview further down relies on.
data = datasets.fetch_california_housing(as_frame=True)
# List the fields available on the returned object.
print(data.keys())

In [None]:
# Show the description text that ships with the dataset.
print(data.DESCR)

In [None]:
# Preview the first rows of the dataset's DataFrame.
data.frame.head()

### Split Dataset into Training and Test Sets

In [None]:
# Create a Pandas dataframe with all the features
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [None]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Record the column order once and apply it to both splits so they share it.
feature_columns = list(X_train.columns)
X_train, X_test = X_train[feature_columns], X_test[feature_columns]

print(f'Feature columns: {feature_columns}')
print(f'Train samples: {len(X_train)}')
print(f'Test samples: {len(X_test)}')

### Start an MLFoundry Run

In [None]:
# Open a new run under the 'sklearn-ca-housing-example' project; the tags,
# params, model, metrics and datasets in the cells below are logged to it.
run = client.create_run(project_name='sklearn-ca-housing-example')

### Define the model and set tags for our run

In [None]:
# Seed the forest so that Restart & Run All reproduces the same fitted model
# and therefore the same logged metrics. Without random_state the estimator's
# internal randomness makes every run differ, even though the train/test split
# above is already seeded with 42.
rf_reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=30,
    random_state=42,
)

# Attach descriptive tags to the run.
run.set_tags({'framework': 'sklearn', 'task': 'regression'})

### Training Model

In [None]:
# Fit the random forest on the training split only; the test split is used later for evaluation.
rf_reg.fit(X_train, y_train)

### Logging Parameters & Model

In [None]:
# Read the estimator's hyperparameters once so the printed, logged and
# described values cannot disagree.
rf_params = rf_reg.get_params()
print(rf_params)
run.log_params(rf_params)

# Build the description from the estimator itself rather than repeating the
# literals used when it was constructed; the text stays correct if the
# hyperparameters are edited. For the current settings the string is
# identical to the previously hard-coded one.
model_description = (
    "example sklearn random forest regressor - "
    f"n_estimators={rf_params['n_estimators']}, "
    f"max_depth={rf_params['max_depth']}, "
    f"min_samples_leaf={rf_params['min_samples_leaf']}"
)

# Log the fitted estimator to the run as an sklearn model.
run.log_model(
    name="california-housing-regressor",
    model=rf_reg,
    framework=mlf.ModelFramework.SKLEARN,
    description=model_description,
)

### Computing Predictions

In [None]:
# Predict on the training split first, then on the held-out split, using the fitted forest.
y_pred_train, y_pred_test = (
    rf_reg.predict(features) for features in (X_train, X_test)
)

### Logging metrics

In [None]:
# The same three regression scorers are applied to each split.
scorers = {
    'mae': mean_absolute_error,
    'mse': mean_squared_error,
    'r2_score': r2_score,
}
# (actuals, predictions) per split; the key becomes the metric-name prefix.
splits = {
    'train': (y_train, y_pred_train),
    'test': (y_test, y_pred_test),
}

# Keys are '<split>/<metric>', in the order train/* then test/*.
metrics_dict = {
    f'{split}/{metric}': scorer(y_true=actual, y_pred=predicted)
    for split, (actual, predicted) in splits.items()
    for metric, scorer in scorers.items()
}
print(metrics_dict)
run.log_metrics(metrics_dict)

### Log the dataset

In [None]:
# Log the training split to the run under the name 'train': the input
# features together with the model's predictions and the true targets.
run.log_dataset(
    dataset_name='train',
    features=X_train,
    predictions=y_pred_train,
    actuals=y_train,
)

In [None]:
# Log the held-out split to the run under the name 'test': the input
# features together with the model's predictions and the true targets.
run.log_dataset(
    dataset_name='test',
    features=X_test,
    predictions=y_pred_test,
    actuals=y_test,
)

In [None]:
# End the run opened by client.create_run(...); this is the last call made on `run` in this notebook.
run.end()