## Try this Notebook in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truefoundry/mlfoundry-examples/blob/main/examples/sklearn/iris_classification.ipynb)

## Install dependencies

In [None]:
! pip install --quiet "numpy>=1.0.0,<2.0.0" "pandas>=1.0.0,<2.0.0" "matplotlib>=3.5.2,<3.6.0" scikit-learn shap==0.40.0
! pip install -U "mlfoundry>=0.4.6,<0.5.0"

## Initialize MLFoundry Client

In [None]:
import os
import getpass
import urllib.parse
import mlfoundry as mlf

In [None]:
# Create the MLFoundry client; a later cell uses it to start the tracking run.
client = mlf.get_client()

---

## Iris species identification as a classification problem

In [None]:
import urllib.parse

import shap
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

import mlfoundry as mlf

### Loading data and preprocessing

In [None]:
# Load the Iris dataset through scikit-learn and list the fields it exposes.
data = datasets.load_iris()
print(data.keys())

In [None]:
# Print the description text that ships with the dataset.
print(data.DESCR)

In [None]:
# Build the DataFrame from the feature matrix and attach the labels as a
# 'target' column in a single chained expression.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Preview the first five rows
df.head()

### Split Dataset into Training and Validation

In [None]:
# Feature matrix: one column per entry in data.feature_names
X = pd.DataFrame(data.data, columns=data.feature_names)
# Target vector: the label for each row of X
y = data.target

In [None]:
# Keep 80% of the rows for training and hold out the rest for evaluation.
# The split is stratified on y and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)
print('Train samples:', len(X_train))
print('Test samples:', len(X_test))

### Start MLFoundry Run

In [None]:
# Start a run named 'svm-model' in the 'iris-clf-demo' project. The returned
# `run` handle is what the following cells log tags, params and metrics to.
run = client.create_run(project_name='iris-clf-demo', run_name='svm-model')
print('RUN ID:', run.run_id)

### Creating the classifier and setting tags

In [None]:
# Support vector classifier with an RBF kernel and probability estimates on.
clf = SVC(
    gamma='scale',
    kernel='rbf',
    probability=True,
    C=1.2,
)
# Tag the run with the framework and task type.
run.set_tags(
    {
        'framework': 'sklearn',
        'task': 'classification',
    }
)

### Training model, logging parameters and logging model

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

# Echo the estimator's hyperparameters, then log them to the run.
svc_params = clf.get_params()
print(svc_params)
run.log_params(svc_params)

# Also log the fitted attributes: the class labels seen during fit and the
# number of input features.
run.log_params(
    {
        'classes': clf.classes_,
        'features': clf.n_features_in_,
    }
)

# Log the fitted model to the run and print the `fqn` of the resulting version.
model_version = run.log_model(
    name="iris-model-demo",
    model=clf,
    framework=mlf.ModelFramework.SKLEARN,
    description="sklearn framework used",
)
print(model_version.fqn)

### Computing predictions

In [None]:
# Compute predictions on both splits. Nothing is logged in this cell; the
# predictions are consumed by the metrics and dataset-logging cells.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

### Logging metrics

In [None]:
# Accuracy plus weighted / micro / macro averaged F1 for each split. Keys are
# prefixed with the split name ('train/' or 'test/').
# NOTE: the micro-F1 keys were previously misspelled 'f1_mirco'; runs logged
# before this fix carry the old key.
metrics = {
    'train/accuracy_score': accuracy_score(y_train, y_pred_train),
    'train/f1_weighted': f1_score(y_train, y_pred_train, average='weighted'),
    'train/f1_micro': f1_score(y_train, y_pred_train, average='micro'),
    'train/f1_macro': f1_score(y_train, y_pred_train, average='macro'),
    'test/accuracy_score': accuracy_score(y_test, y_pred_test),
    'test/f1_weighted': f1_score(y_test, y_pred_test, average='weighted'),
    'test/f1_micro': f1_score(y_test, y_pred_test, average='micro'),
    'test/f1_macro': f1_score(y_test, y_pred_test, average='macro'),
}
print('Metrics', metrics)
run.log_metrics(metrics)

### Log the dataset

In [None]:
# Log the training split to the run under the name 'train': the features,
# the model's predictions for them, and the true labels.
run.log_dataset(
    dataset_name='train',
    features=X_train,
    predictions=y_pred_train,
    actuals=y_train,
)

In [None]:
# Log the held-out split to the run under the name 'test': the features,
# the model's predictions for them, and the true labels.
run.log_dataset(
    dataset_name='test',
    features=X_test,
    predictions=y_pred_test,
    actuals=y_test,
)

In [None]:
# End the run now that all logging is done.
# NOTE(review): if an earlier cell raises, this cell is never reached, so the
# run is presumably left open — confirm how mlfoundry treats unfinished runs.
run.end()