## Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/sklearn/breast_cancer_classification.ipynb)

## Install dependencies

In [None]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" scikit-learn shap==0.40.0
! pip install -U mlfoundry

## Initialize MLFoundry Client

In [None]:
import os
import getpass
import urllib.parse
import mlfoundry as mlf

In [None]:
# Base URL of the TrueFoundry deployment; can be overridden through the environment.
TFY_URL = os.getenv('TFY_URL', 'https://app.truefoundry.com/')

# Take the API key from the environment when it is set; otherwise tell the user
# where to find it and read it with a hidden prompt so it is not echoed.
TFY_API_KEY = os.getenv('TFY_API_KEY')
if not TFY_API_KEY:
    print(
        'Paste your TrueFoundry API key\n'
        f'You can find it over at {urllib.parse.urljoin(TFY_URL, "settings")}'
    )
    TFY_API_KEY = getpass.getpass()

In [None]:
# Build the MLFoundry client using the API key obtained in the previous cell.
client = mlf.get_client(api_key=TFY_API_KEY)

---

## Breast Cancer Detection as a Classification problem

In [1]:
import shap
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

import mlfoundry as mlf

  from .autonotebook import tqdm as notebook_tqdm


### Loading data and preprocessing

In [None]:
# Load scikit-learn's bundled breast cancer dataset and list the fields it exposes.
data = datasets.load_breast_cancer()
print(data.keys())

In [None]:
# Print the description text that ships with the dataset (its DESCR field).
print(data.DESCR) 

In [None]:
# Build a preview frame: one column per feature plus a trailing `target`
# column holding the labels.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Display the first five rows
df.head()

### Start MLFoundry Run(s)

In [None]:
# Open the first run under the 'cancer-project' project; the logging calls in
# the following cells go through this `run` object.
run = client.create_run(project_name='cancer-project')
print('RUN 1 ID:', run.run_id)
# Show the tracking page URL, built from TFY_URL.
print(f'You can track your runs live at {urllib.parse.urljoin(TFY_URL, "mlfoundry")}')

### Log the dataset

In [None]:
# Labels as provided by the dataset
y = data.target
# Feature matrix as a DataFrame so the column names travel with the data
X = pd.DataFrame(data.data, columns=data.feature_names)

# Attach the full dataset (features + actual labels) to the current run
run.log_dataset(dataset_name='breast_cancer_dataset', features=X, actuals=y)

### Split Dataset into Training and Validation

In [None]:
# Keep 80% of the rows for training and hold out the rest for evaluation.
# Stratifying on y preserves the class balance in both splits, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)
print('Train samples:', X_train.shape[0])
print('Test samples:', X_test.shape[0])

### Setting tags and Logging parameters

In [None]:
# Keep the hyperparameters in a single dict so the values used to build the
# model and the values logged to the run cannot drift apart.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest (and therefore the same metrics) for the same train/test split.
rf_params = {'n_estimators': 50, 'max_depth': 15, 'random_state': 42}
clf = RandomForestClassifier(**rf_params)

# Tag the run and record exactly the parameters the model was built with.
run.set_tags({'framework': 'sklearn', 'task': 'classification'})
run.log_params(rf_params)

### Training model and logging model

In [None]:
# Fit the forest on the training split, then log the fitted model to the run.
clf.fit(X_train, y_train)
run.log_model(clf, framework=mlf.ModelFramework.SKLEARN)

### Computing predictions

In [None]:
# Predict on both splits; nothing is logged in this cell. The predictions are
# used by the metrics and dataset-stats cells that follow.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

### Logging metrics

In [None]:
# Accuracy and F1 for each split, keyed as '<split>/<metric>'.
metrics = {}
metrics['train/accuracy_score'] = accuracy_score(y_train, y_pred_train)
metrics['train/f1'] = f1_score(y_train, y_pred_train)
metrics['test/accuracy_score'] = accuracy_score(y_test, y_pred_test)
metrics['test/f1'] = f1_score(y_test, y_pred_test)

print('Tree 1 metrics:', metrics)
# Record the metrics against the current run.
run.log_metrics(metrics)

### Log test dataset stats

In [None]:
# Assemble the test-slice frame: a copy of the test features plus the actual
# labels, the predicted labels and the per-row class probabilities.
X_test_df = X_test.assign(
    targets=list(y_test),
    predictions=list(y_pred_test),
    prediction_probabilities=list(clf.predict_proba(X_test)),
)

# SHAP values of model 1 on the test set
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Log the test-slice stats; the schema tells MLFoundry which columns hold
# the features, predictions, actuals and probabilities.
run.log_dataset_stats(
    X_test_df,
    data_slice='test',
    data_schema=mlf.Schema(
        feature_column_names=list(data.feature_names),
        prediction_column_name='predictions',
        actual_column_name='targets',
        prediction_probability_column_name='prediction_probabilities',
    ),
    shap_values=shap_values,
    model_type='binary_classification',
)

In [None]:
# End the first run; all logging for it is done at this point.
run.end()

## Training another model with different hyperparameters

In [None]:
# Open a second run in the same project; `run` is rebound, so every logging
# call below goes to this new run.
run = client.create_run(project_name='cancer-project')
print('RUN 2 ID:', run.run_id)

# Log the same full dataset (features X, labels y) against the second run.
run.log_dataset(
    dataset_name='breast_cancer_dataset',
    features=X,
    actuals=y
)

# Second configuration: more, shallower trees. The hyperparameters live in a
# single dict so the model and the logged params always agree, and
# random_state is fixed so a re-run reproduces the same forest and metrics.
rf_params = {'n_estimators': 150, 'max_depth': 10, 'random_state': 42}
clf = RandomForestClassifier(**rf_params)

# Tag the run and record exactly the parameters the model was built with.
run.set_tags({'framework': 'sklearn', 'task': 'classification'})
run.log_params(rf_params)

# Fit on the same training split as the first run, then log the fitted model.
clf.fit(X_train, y_train)
run.log_model(clf, framework=mlf.ModelFramework.SKLEARN)

# Predictions of the second model on both splits
y_pred_train, y_pred_test = clf.predict(X_train), clf.predict(X_test)

# Accuracy and F1 per split, keyed as '<split>/<metric>'
metrics = {}
metrics.update({
    'train/accuracy_score': accuracy_score(y_train, y_pred_train),
    'train/f1': f1_score(y_train, y_pred_train),
})
metrics.update({
    'test/accuracy_score': accuracy_score(y_test, y_pred_test),
    'test/f1': f1_score(y_test, y_pred_test),
})

print('Tree 2 metrics:', metrics)
run.log_metrics(metrics)


# SHAP values of model 2 on the test set
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Test features plus actual labels, predicted labels and class probabilities
X_test_df = X_test.assign(
    targets=list(y_test),
    predictions=list(y_pred_test),
    prediction_probabilities=list(clf.predict_proba(X_test)),
)

# Column roles for the stats call
run.log_dataset_stats(
    X_test_df,
    data_slice='test',
    data_schema=mlf.Schema(
        feature_column_names=list(data.feature_names),
        prediction_column_name='predictions',
        actual_column_name='targets',
        prediction_probability_column_name='prediction_probabilities',
    ),
    shap_values=shap_values,
    model_type='binary_classification',
)

# End the second run; all logging for it is done at this point.
run.end()