In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

import mlfoundry as mlf

2022-03-14 15:44:06.334 INFO    streamlit_gradio.networking: Hashes generated for all static assets.


## Loading and splitting the data

In [2]:
# Load the diabetes dataset that ships with scikit-learn.
data = datasets.load_diabetes()
# Show which fields the returned dict-like object exposes.
print(data.keys())

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])


In [3]:
# Build one frame for inspection: a named column per feature, followed by
# the regression target as the last column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Preview the first five rows
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [4]:
# Feature matrix as a labelled DataFrame; target kept as the raw array
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Hold out a test split; the fixed seed keeps the partition repeatable
splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

## Creating MLF Runs

In [5]:
# Both runs belong to the same project; each model trained below gets its
# own run object to log against.
PROJECT_NAME = 'diabetes-project'

mlf_api = mlf.get_client()
mlf_run = mlf_api.create_run(project_name=PROJECT_NAME)
mlf_run_2 = mlf_api.create_run(project_name=PROJECT_NAME)

2022-03-14 15:44:07.553 INFO    mlfoundry.mlfoundry_api: Run is created with id 7cf79850541f4e50978ab2aa2dea1a8a and name run_2022-03-14_22:44:07_utc
2022-03-14 15:44:07.657 INFO    mlfoundry.mlfoundry_api: Run is created with id 1ba095e63ba1473abfa312cfde10af36 and name run_2022-03-14_22:44:07_utc


## Training models

In [6]:
# Seed both forests so that Restart & Run All reproduces the same models,
# metrics and SHAP values; without random_state every fit draws fresh
# randomness even though the train/test split itself is seeded.

# Model for the first run: 150 trees, depth capped at 10
tree_reg = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=42)
tree_reg.fit(X_train, y_train)

# Model for the second run: 100 trees, depth capped at 15
tree_reg_1 = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42)
tree_reg_1.fit(X_train, y_train)

RandomForestRegressor(max_depth=15)

## Logging parameters and models

In [7]:
# Log each run's hyperparameters straight from the estimator it tracks
# instead of from hand-copied literals, so the logged values cannot drift
# from the model that was actually trained.
mlf_run.log_params(
    {"n_estimators": tree_reg.n_estimators, "max_depth": tree_reg.max_depth}
)
mlf_run_2.log_params(
    {"n_estimators": tree_reg_1.n_estimators, "max_depth": tree_reg_1.max_depth}
)

# Attach the fitted estimators to their runs as scikit-learn models
mlf_run.log_model(tree_reg, mlf.ModelFramework.SKLEARN)
mlf_run_2.log_model(tree_reg_1, mlf.ModelFramework.SKLEARN)

2022-03-14 15:44:08.172 INFO    mlfoundry.mlfoundry_run: Parameters logged successfully
2022-03-14 15:44:08.176 INFO    mlfoundry.mlfoundry_run: Parameters logged successfully
2022-03-14 15:44:10.737 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2022-03-14 15:44:12.661 INFO    mlfoundry.mlfoundry_run: Model logged Successfully


## Computing predictions

In [9]:
# Compute predictions on both splits for each model (nothing is logged in
# this cell; the test predictions feed the metric and dataset-stat cells).

# First run's model
y_hat_train, y_hat_test = (tree_reg.predict(part) for part in (X_train, X_test))

# Second run's model
y_hat_train_tree, y_hat_test_tree = (
    tree_reg_1.predict(part) for part in (X_train, X_test)
)


## Logging Metrics

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score each run's model on the held-out test split and log the result to
# that run. One loop replaces the two copy-pasted blocks so the metric names
# and their computation stay identical for both runs.
for current_run, test_predictions in (
    (mlf_run, y_hat_test),
    (mlf_run_2, y_hat_test_tree),
):
    metrics_dict = {
        "mean absolute error": mean_absolute_error(y_test, test_predictions),
        "mean squared error": mean_squared_error(y_test, test_predictions),
    }
    current_run.log_metrics(metrics_dict)

2022-03-14 16:22:32.015 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2022-03-14 16:22:32.023 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


## Logging dataset stats

In [11]:
import shap

# Log test-set statistics and SHAP values once per run. Each tuple pairs a
# run with the model it tracks and that model's test predictions; a single
# loop body replaces the two copy-pasted blocks so the schema and arguments
# cannot diverge between runs.
for current_run, fitted_model, test_predictions in (
    (mlf_run, tree_reg, y_hat_test),
    (mlf_run_2, tree_reg_1, y_hat_test_tree),
):
    # Test features plus actual and predicted targets, under the column
    # names declared in the schema below
    X_test_df = X_test.copy()
    X_test_df['targets'] = list(y_test)
    X_test_df['predictions'] = list(test_predictions)

    # shap value computation (on the feature columns only)
    explainer = shap.TreeExplainer(fitted_model)
    shap_values = explainer.shap_values(X_test)

    current_run.log_dataset_stats(
        X_test_df,
        data_slice=mlf.DataSlice.TEST,
        data_schema=mlf.Schema(
            feature_column_names=list(data.feature_names),
            prediction_column_name="predictions",
            actual_column_name="targets"
        ),
        shap_values=shap_values,
        model_type=mlf.ModelType.REGRESSION,
    )

Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
2022-03-14 16:22:35.286 INFO    whylogs.app.config: No config file loaded
Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
2022-03-14 16:22:35.378 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
2022-03-14 16:22:35.399 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


WARN: Missing config


Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
2022-03-14 16:22:35.987 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
2022-03-14 16:22:36.013 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged
