In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Load the raw Titanic training data; the relative path is resolved against
# the kernel's current working directory.
train_df = pd.read_csv("../data/titanic_train.csv")

## Preprocess

In [3]:
def _strip_title(row):
    return row.split(",")[1].split(".")[0].replace(" ", "")

def preprocess(df):
    """Return a cleaned copy of the raw passenger frame.

    - adds a ``Title`` column derived from ``Name`` via ``_strip_title``
    - rounds ``Fare`` to two decimals
    - drops the ``Ticket``, ``Cabin``, ``Name`` and ``PassengerId`` columns

    The input frame is not modified.
    """
    titles = df["Name"].apply(_strip_title)
    rounded_fares = df["Fare"].round(2)
    unused_columns = ["Ticket", "Cabin", "Name", "PassengerId"]
    return df.assign(Title=titles, Fare=rounded_fares).drop(columns=unused_columns)


# Apply the cleaning steps to the raw frame; train_df itself is left untouched.
preprocessed_train = preprocess(train_df)

## Train

In [4]:
def _dense_one_hot_encoder():
    """Build a dense-output ``OneHotEncoder`` that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old spelling, so the new keyword is tried first and the
    legacy one is used only on releases that reject it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only knows the legacy keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def init_training(categorical_features, numerical_features):
    """Build the (unfitted) preprocessing + random-forest pipeline.

    Parameters
    ----------
    categorical_features : list of str
        Columns that are imputed with the most frequent value and then
        one-hot encoded.
    numerical_features : list of str
        Columns that are mean-imputed and then min-max scaled.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'processor'`` (a ``ColumnTransformer`` that emits the numeric
        columns first, then the one-hot columns) and ``'regressor'``.
    """
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', _dense_one_hot_encoder())
    ])

    processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])

    pipeline = Pipeline(steps=[
        ('processor', processor),
        # The step holds a classifier; the 'regressor' key is kept because
        # later cells look the estimator up under that name.
        ('regressor', RandomForestClassifier(random_state=88))
    ])
    return pipeline


# config
TARGET = "Survived"
CATEGORICAL_COLS = ["Sex", "Embarked", "Title"]
# Every column that is neither categorical nor the target is treated as numeric.
NUMERICAL_COLS = [col for col in preprocessed_train.columns if col not in CATEGORICAL_COLS + [TARGET]]

# model
training_pipeline = init_training(CATEGORICAL_COLS, NUMERICAL_COLS)
# Fixed random_state keeps the 67/33 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_train[CATEGORICAL_COLS + NUMERICAL_COLS],
    preprocessed_train[TARGET],
    test_size=0.33,
    random_state=88,
)
rf_model = training_pipeline.fit(X_train, y_train)

# save
# The target column is re-attached under TARGET (rather than a second
# hard-coded "Survived") so the saved CSVs follow the config above.
X_train.assign(**{TARGET: y_train}).to_csv("../data/preprocessed_train.csv", index=False)
X_test.assign(**{TARGET: y_test}).to_csv("../data/preprocessed_test.csv", index=False)
joblib.dump(rf_model, '../models/rf_model.pkl')

['../models/rf_model.pkl']

## Evaluate

In [5]:
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but the conventional order keeps this
# line correct if the metric is ever replaced by an asymmetric one.
accuracy = accuracy_score(y_test, rf_model.predict(X_test))
accuracy

0.7966101694915254

## Explain

In [6]:
# Names produced by the fitted encoder. It is fitted on the imputer's array
# output, so the columns come back as "x<i>_<category>" where <i> is the
# position of the source column in CATEGORICAL_COLS.
ohe_cols_raw = (
    rf_model
    .named_steps["processor"]
    .named_transformers_["category"]
    .named_steps["one-hot"]
    .get_feature_names_out()
)


def _rename_ohe_column(raw_name, source_columns):
    """Turn an encoder name such as ``x1_S`` into ``Embarked_S``.

    Only the leading ``x<i>`` prefix is rewritten, using ``source_columns[i]``.
    The category part is never touched, and ``x1`` cannot be confused with
    ``x10``. Names that do not follow the ``x<i>_<category>`` pattern are
    returned unchanged.
    """
    raw_name = str(raw_name)
    prefix, separator, category = raw_name.partition("_")
    if separator and prefix[:1] == "x" and prefix[1:].isdigit():
        position = int(prefix[1:])
        if position < len(source_columns):
            return f"{source_columns[position]}_{category}"
    return raw_name


OHE_COLS = [_rename_ohe_column(col, CATEGORICAL_COLS) for col in ohe_cols_raw]

In [7]:
# Pair each transformed column with the forest's importance score. The name
# order mirrors the ColumnTransformer output: numeric columns first, then the
# one-hot columns.
feature_importance = dict(
    zip(
        NUMERICAL_COLS + OHE_COLS,
        rf_model.named_steps["regressor"].feature_importances_,
    )
)
feature_importance

{'Pclass': 0.08027485982957666,
 'Age': 0.2109814530574326,
 'SibSp': 0.0537446367930372,
 'Parch': 0.036290576979770735,
 'Fare': 0.23012234475361906,
 'Sex_female': 0.0923012684295021,
 'Sex_male': 0.08321571192982573,
 'Embarked_C': 0.01263434518233411,
 'Embarked_Q': 0.007338584769864529,
 'Embarked_S': 0.015720887116309576,
 'Title_Capt': 0.000784360481368511,
 'Title_Col': 0.0012917613604970604,
 'Title_Dr': 0.0017767470075199022,
 'Title_Major': 0.0007732933748357258,
 'Title_Master': 0.009336584354394672,
 'Title_Miss': 0.02496022744830678,
 'Title_Mlle': 0.00014551965218339622,
 'Title_Mme': 8.535494893915394e-05,
 'Title_Mr': 0.11288027520793507,
 'Title_Mrs': 0.021862741975250895,
 'Title_Ms': 0.00027664450523918956,
 'Title_Rev': 0.002315642203376659,
 'Title_Sir': 0.0008861786388808405}

## Init trubrics model

In [8]:
from trubrics.context import DataContext, ModelContext

In [9]:
# Describe the train/test frames for trubrics. The target is re-attached under
# the TARGET constant from the training config so the column name is declared
# in a single place.
data_context = DataContext(
    name="my_datasource",
    training_data=X_train.assign(**{TARGET: y_train}),
    testing_data=X_test.assign(**{TARGET: y_test}),
    target_column=TARGET
)
# Describe the fitted pipeline and the metric used to evaluate it.
model_context = ModelContext(
    name="my_model",
    version="0.1",
    estimator=rf_model,
    evaluation_function=accuracy_score
)

In [10]:
from trubrics.validators.sklearn import Validator

In [11]:
# Bind the data and model contexts together; the validations in the following
# cells are all run through this object.
model_validator = Validator(data=data_context, model=model_context)

## DS example tests

In [12]:
# One hand-written passenger used by the single-edge-case validations below.
# The keys are the model's input columns (categorical first, then numeric).
edge_case = {
    "Sex": "male",
    "Embarked": "S",
    "Title": "Master",
    "Pclass": 2,
    "Age": 28,
    "SibSp": 0,
    "Parch": 0,
    "Fare": 37
}

In [13]:
# Validate the same passenger against both possible labels so the notebook
# shows one failing and one passing result.
robustness = [
    model_validator.validate_single_edge_case(
        edge_case_data=edge_case,
        desired_output=expected_label,
    )
    for expected_label in (0, 1)  # 0 -> example of fail, 1 -> example of pass
]
robustness[0].dict()

{'validation_type': 'validate_single_edge_case',
 'validation_kwargs': {'args': [],
  'kwargs': {'edge_case_data': {'Sex': 'male',
    'Embarked': 'S',
    'Title': 'Master',
    'Pclass': 2,
    'Age': 28,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 37},
   'desired_output': 0}},
 'outcome': 'fail',
 'result': {'prediction': 1}}

In [14]:
# Check the test-set score against a stricter and a looser threshold.
performance = [
    model_validator.validate_performance_against_threshold(threshold=minimum_score)
    for minimum_score in (0.8, 0.75)
]
performance[0].dict()

{'validation_type': 'validate_performance_against_threshold',
 'validation_kwargs': {'args': [], 'kwargs': {'threshold': 0.8}},
 'outcome': 'fail',
 'result': {'performance': 0.7966101694915254}}

In [15]:
# (category column, threshold passed to the bias validation)
fairness = [
    model_validator.validate_biased_performance_across_category(
        category=column,
        threshold=allowed_gap,
    )
    for column, allowed_gap in (("Embarked", 0.1), ("Sex", 0.05))
]
fairness[0].dict()

{'validation_type': 'validate_biased_performance_across_category',
 'validation_kwargs': {'args': [],
  'kwargs': {'category': 'Embarked', 'threshold': 0.1}},
 'outcome': 'fail',
 'result': {'max_performance_difference': 0.15235267245317496}}

In [16]:
# (feature name, size of the "top n" list it is expected to appear in)
explainability = [
    model_validator.validate_feature_in_top_n_important_features(
        feature=feature_name,
        feature_importance=feature_importance,
        top_n_features=top_n,
    )
    for feature_name, top_n in (("Sex_female", 3), ("Age", 2))
]
explainability[0].dict()

{'validation_type': 'validate_feature_in_top_n_important_features',
 'validation_kwargs': {'args': [],
  'kwargs': {'feature': 'Sex_female',
   'feature_importance': {'Pclass': 0.08027485982957666,
    'Age': 0.2109814530574326,
    'SibSp': 0.0537446367930372,
    'Parch': 0.036290576979770735,
    'Fare': 0.23012234475361906,
    'Sex_female': 0.0923012684295021,
    'Sex_male': 0.08321571192982573,
    'Embarked_C': 0.01263434518233411,
    'Embarked_Q': 0.007338584769864529,
    'Embarked_S': 0.015720887116309576,
    'Title_Capt': 0.000784360481368511,
    'Title_Col': 0.0012917613604970604,
    'Title_Dr': 0.0017767470075199022,
    'Title_Major': 0.0007732933748357258,
    'Title_Master': 0.009336584354394672,
    'Title_Miss': 0.02496022744830678,
    'Title_Mlle': 0.00014551965218339622,
    'Title_Mme': 8.535494893915394e-05,
    'Title_Mr': 0.11288027520793507,
    'Title_Mrs': 0.021862741975250895,
    'Title_Ms': 0.00027664450523918956,
    'Title_Rev': 0.00231564220337665

## Business user example test

In [17]:
from trubrics.utils.loader import get_business_feedback_data
# Load an example piece of business-user feedback through the trubrics loader
# and display it. NOTE(review): the effect of tracking=False is defined by
# trubrics and is not visible here; confirm against the library.
data = get_business_feedback_data(tracking=False)
data

{'feedback_type': 'Other',
 'metadata': {'description': 'model seems to have more errors for passengers who paid less - is this normal?',
  'what_if_input': {'Sex': {'0': 'male'},
   'Embarked': {'0': 'S'},
   'Title': {'0': 'Master'},
   'Pclass': {'0': 2},
   'Age': {'0': 28},
   'SibSp': {'0': 0},
   'Parch': {'0': 0},
   'Fare': {'0': 37}}}}

--> **DS response: "That isn't expected: the model should not be noticeably more accurate for some groups of passengers than for others. I'll add a test for this."**

In [18]:
from trubrics.base import BaseClassifier


class CustomValidator(BaseClassifier):
    """Validator extended with a fare-based check on test-set errors."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def test_performance_for_different_fares(self, fare_cutoff: int = 50):
        """Check that errors are split evenly around ``fare_cutoff``.

        Counts the rows of ``self.explore_test_set_errors()`` with
        ``Fare <= fare_cutoff`` and with ``Fare > fare_cutoff``, prints the
        ratio low / high, and returns whether it lies strictly between
        0.5 and 1.5.

        Edge cases (these previously raised ``ZeroDivisionError``):
        - no errors on either side -> ratio is taken as 1.0 (balanced, True)
        - errors only at or below the cutoff -> ratio is ``inf`` (False)

        NOTE(review): this compares raw error counts, not error rates, so the
        outcome also depends on how many test rows fall on each side of the
        cutoff. Presumably ``explore_test_set_errors`` returns only the
        misclassified rows; confirm against trubrics.
        """
        errors_df = self.explore_test_set_errors()
        low_fare_errors = int((errors_df["Fare"] <= fare_cutoff).sum())
        high_fare_errors = int((errors_df["Fare"] > fare_cutoff).sum())

        if high_fare_errors == 0:
            # Avoid dividing by zero when there are no high-fare errors.
            number_of_errors_by_fare_ratio = 1.0 if low_fare_errors == 0 else float("inf")
        else:
            number_of_errors_by_fare_ratio = low_fare_errors / high_fare_errors

        print(number_of_errors_by_fare_ratio)
        return 0.5 < number_of_errors_by_fare_ratio < 1.5

In [19]:
# Wrap the same data and model contexts in the custom validator class.
model_custom_validator = CustomValidator(data=data_context, model=model_context)

In [20]:
# Run the fare check with a cutoff of 25 instead of the method's default of 50.
model_custom_validator.test_performance_for_different_fares(fare_cutoff=25)

1.3076923076923077


True

## Save Trubric

In [22]:
# Build the list in this cell so it works on a fresh kernel (Restart & Run
# All). It previously relied on `validations` having been defined by a cell
# that appears later in the notebook.
validations = robustness + performance + fairness + explainability
validations

[ValidationContext(validation_type='validate_single_edge_case', validation_kwargs={'args': [], 'kwargs': {'edge_case_data': {'Sex': 'male', 'Embarked': 'S', 'Title': 'Master', 'Pclass': 2, 'Age': 28, 'SibSp': 0, 'Parch': 0, 'Fare': 37}, 'desired_output': 0}}, outcome='fail', result={'prediction': 1}),
 ValidationContext(validation_type='validate_single_edge_case', validation_kwargs={'args': [], 'kwargs': {'edge_case_data': {'Sex': 'male', 'Embarked': 'S', 'Title': 'Master', 'Pclass': 2, 'Age': 28, 'SibSp': 0, 'Parch': 0, 'Fare': 37}, 'desired_output': 1}}, outcome='pass', result={'prediction': 1}),
 ValidationContext(validation_type='validate_performance_against_threshold', validation_kwargs={'args': [], 'kwargs': {'threshold': 0.8}}, outcome='fail', result={'performance': 0.7966101694915254}),
 ValidationContext(validation_type='validate_performance_against_threshold', validation_kwargs={'args': [], 'kwargs': {'threshold': 0.75}}, outcome='pass', result={'performance': 0.7966101694915

In [21]:
from trubrics.context import TrubricContext

# Collect every validation result, in the order the groups were produced.
validations = [*robustness, *performance, *fairness, *explainability]

# Bundle the contexts with the results and write the Trubric under ../data.
TrubricContext(
    validations=validations,
    data_context=data_context,
    model_context=model_context,
).save(path="../data")

- metadata field for Trubric JSON
- evaluation function name in performance tests and in Trubric context