In [1]:
import pandas as pd
import numpy as np
import mlfoundry as mlf
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split

# Loading Iris Dataset

In [2]:
# Load the bundled Iris dataset and wrap its feature matrix in a DataFrame
# whose columns carry the dataset's own feature names.
iris = datasets.load_iris()
iris_frame = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Using MlFoundry APIs

## Creating run

In [3]:
# Client obtained with no arguments (per the original note: saves locally).
mlf_api = mlf.get_client()

# Open a new run under the 'sklearn-project' project; every later cell logs to it.
mlf_run = mlf_api.create_run(
    project_name='sklearn-project',
)

## Logging the dataset

In [4]:
# Log the full Iris feature frame under the TRAIN slice.
# No file format is given here; per the original note it is saved as parquet.
mlf_run.log_dataset(
    iris_frame,
    data_slice=mlf.DataSlice.TRAIN,
)

# Log the same frame under the TEST slice, this time explicitly as CSV.
mlf_run.log_dataset(
    iris_frame,
    data_slice=mlf.DataSlice.TEST,
    fileformat=mlf.FileFormat.CSV,
)

## Training the model

In [5]:
# Fixed seed so the split, and every metric derived from it, is reproducible
# across Restart & Run All.
SEED = 42

X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Model Training
# Fit on the training split only. Fitting on the full (X, y) would leak the
# test rows into the model and inflate the test predictions/metrics logged later.
clf = svm.SVC(gamma='scale', kernel='rbf', probability=True)
clf.fit(X_train, y_train)

SVC(probability=True)

## Logging Parameters

In [6]:
# Record the fitted estimator's class labels and its input feature count
# as parameters of the run.
params = dict(
    classes=clf.classes_,
    features=clf.n_features_in_,
)
mlf_run.log_params(params)

## Logging the model

In [7]:
# Log the fitted classifier to the run, declaring scikit-learn as its framework.
mlf_run.log_model(clf, mlf.ModelFramework.SKLEARN)

## Logging Predictions Synchronously

In [8]:
# Predict labels for both splits; the train predictions are reused by the
# dataset-stats cell further down.
y_hat_train, y_hat_test = clf.predict(X_train), clf.predict(X_test)

# Log the test features together with their predicted labels.
test_features = pd.DataFrame(X_test)
mlf_run.log_predictions(test_features, list(y_hat_test))

## Logging Predictions Asynchronously

In [9]:
# Asynchronous variant of the previous cell: same test features and predicted
# labels. The returned handles are kept so completion can be checked afterwards.
responses = mlf_run.log_predictions_async(
    pd.DataFrame(X_test),
    list(y_hat_test),
)

#### To confirm that the log requests completed successfully, wait for the futures to resolve. This is a blocking call.
import concurrent.futures as cf

# Drain the futures in completion order. result() blocks until the future is
# done and re-raises any exception raised while it ran.
for completed in cf.as_completed(responses):
    completed.result()


## Logging the metrics

In [10]:
from sklearn.metrics import accuracy_score, f1_score

# Score the test-split predictions.
# NOTE(review): for single-label multiclass targets, micro-averaged F1 is
# numerically equal to accuracy, so these two entries are expected to match.
metrics_dict = {
    'accuracy_score': accuracy_score(y_test, y_hat_test),
    'f1_score': f1_score(y_test, y_hat_test, average='micro'),
}

mlf_run.log_metrics(metrics_dict)


## Logging the Dataset Stats

In [11]:
import shap

# Class-probability estimates for every training row.
y_train_prob = clf.predict_proba(X_train)

# Training frame: features plus ground truth, predicted label and the
# per-row probability vector (stored as one array per cell).
X_train_df = pd.DataFrame(X_train, columns=iris.feature_names).assign(
    targets=y_train,
    predictions=y_hat_train,
    prediction_probabilities=list(y_train_prob),
)

# Test frame: features plus ground truth and predicted label only.
X_test_df = pd.DataFrame(X_test, columns=iris.feature_names).assign(
    targets=y_test,
    predictions=y_hat_test,
)

# Compute and log stats for the train data, without SHAP values.
train_schema = mlf.Schema(
    feature_column_names=iris.feature_names,
    prediction_column_name="predictions",
    actual_column_name="targets",
    # Included so probability-related metrics can be calculated.
    prediction_probability_column_name="prediction_probabilities",
)
mlf_run.log_dataset_stats(
    X_train_df,
    data_slice=mlf.DataSlice.TRAIN,
    data_schema=train_schema,
    model_type=mlf.ModelType.MULTICLASS_CLASSIFICATION,
)

# SHAP value computation: feature-only frames, with the training rows as the
# explainer's background data and the test rows as the samples to explain.
X_train_df1 = pd.DataFrame(X_train, columns=iris.feature_names)
X_test_df1 = pd.DataFrame(X_test, columns=iris.feature_names)
explainer = shap.KernelExplainer(clf.predict_proba, X_train_df1)
shap_values = explainer.shap_values(X_test_df1)

# Compute and log stats for the test data, this time with SHAP values attached.
test_schema = mlf.Schema(
    feature_column_names=iris.feature_names,
    prediction_column_name="predictions",
    actual_column_name="targets",
)
mlf_run.log_dataset_stats(
    X_test_df,
    data_slice=mlf.DataSlice.TEST,
    data_schema=test_schema,
    model_type=mlf.ModelType.MULTICLASS_CLASSIFICATION,
    shap_values=shap_values,
)

Using 120 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


WARN: Missing config


100%|██████████| 30/30 [00:01<00:00, 25.78it/s]
