In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

In [None]:
# Load the raw training set; the path is relative to the kernel's working directory.
train_df = pd.read_csv("../data/train.csv")

## preprocess

In [None]:
def _strip_title(row):
    return row.split(",")[1].split(".")[0].replace(" ", "")

def preprocess(df):
    """Return a cleaned copy of the raw passenger frame.

    Adds a ``Title`` column derived from ``Name`` via ``_strip_title``,
    rounds ``Fare`` to two decimals, and drops the ``Ticket``, ``Cabin``,
    ``Name`` and ``PassengerId`` columns. ``df`` itself is not modified.
    """
    unused_columns = ["Ticket", "Cabin", "Name", "PassengerId"]
    titles = df["Name"].apply(_strip_title)
    rounded_fares = df["Fare"].round(2)
    return df.assign(Title=titles, Fare=rounded_fares).drop(columns=unused_columns)


# Clean the raw frame, then write the result to disk without the index column.
preprocessed_train = preprocess(train_df)
preprocessed_train.to_csv("../data/preprocessed_train.csv", index=False)

## train

In [None]:
def init_training(categorical_features, numerical_features):
    """Build the unfitted preprocessing + random-forest pipeline.

    Parameters
    ----------
    categorical_features
        Column selection handed to the categorical branch: imputed with the
        most frequent value, then one-hot encoded (unknown categories are
        ignored at transform time).
    numerical_features
        Column selection handed to the numeric branch: imputed with the
        mean, then min-max scaled.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``'processor'`` (the ``ColumnTransformer``) and
        ``'regressor'`` (a ``RandomForestClassifier`` with
        ``random_state=88``).
    """
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output`, and 1.4 removed the old name. Try the current keyword
    # first and fall back so the notebook runs on both old and new releases.
    try:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', one_hot_encoder)
    ])

    # Output column order follows this list: numeric columns first, then the
    # one-hot encoded categorical columns.
    processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])

    # The final step is a classifier, but its name stays 'regressor' because
    # other cells look the step up under that key.
    pipeline = Pipeline(steps=[
        ('processor', processor),
        ('regressor', RandomForestClassifier(random_state=88))
    ])
    return pipeline


#config
TARGET = "Survived"
CATEGORICAL_COLS = ["Sex", "Embarked", "Title"]
# Every remaining column that is neither categorical nor the target is treated as numerical.
NUMERICAL_COLS = [col for col in preprocessed_train.columns if col not in CATEGORICAL_COLS + [TARGET]]
MODEL_PATH = '../models/rf_model.pkl'

#model
training_pipeline = init_training(CATEGORICAL_COLS, NUMERICAL_COLS)
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_train[CATEGORICAL_COLS+NUMERICAL_COLS],
    preprocessed_train[TARGET],
    test_size=0.33,
    random_state=88,
)
rf_model = training_pipeline.fit(X_train, y_train)

#save
# joblib.dump does not create missing folders, so make sure the target
# directory exists (e.g. on a fresh checkout) before writing the model.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(rf_model, MODEL_PATH)

## evaluate

In [None]:
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# The value is the same either way for accuracy, but the documented order keeps
# the call correct if the metric is ever swapped for an asymmetric one.
accuracy = accuracy_score(y_test, rf_model.predict(X_test))
accuracy

## explain

In [None]:
# Fitted one-hot encoder inside the categorical branch of the pipeline.
fitted_one_hot = (
    rf_model
    .named_steps["processor"]
    .named_transformers_["category"]
    .named_steps["one-hot"]
)
# Names as the encoder reports them without input feature names.
ohe_cols_raw = fitted_one_hot.get_feature_names_out()
# Let the encoder prefix each category with its real column name instead of
# patching "x0"/"x1"/"x2" by string replacement, which depends on a hard-coded
# column order and would also rewrite category values containing those tokens.
# Converted to a list so it can be concatenated with NUMERICAL_COLS.
OHE_COLS = list(fitted_one_hot.get_feature_names_out(CATEGORICAL_COLS))

In [None]:
# Names follow the ColumnTransformer layout: numeric columns first, then one-hot columns.
feature_names = NUMERICAL_COLS + OHE_COLS
importances = rf_model.named_steps["regressor"].feature_importances_
# zip() silently truncates to the shorter input, so fail loudly if the names
# and the importances are not aligned one-to-one.
assert len(feature_names) == len(importances), (
    f"{len(feature_names)} feature names for {len(importances)} importances"
)
feature_importance = dict(zip(feature_names, importances))
feature_importance

## Init trubrics model

In [None]:
from trubrics.context import DataContext, ModelContext

In [None]:
# Re-attach the target to each feature split so the context holds complete
# frames. The column name comes from TARGET rather than a repeated literal,
# so it stays in sync with the training configuration.
data_context = DataContext(
    name="my_datasource",
    training_data=X_train.assign(**{TARGET: y_train}),
    testing_data=X_test.assign(**{TARGET: y_test}),
    target_column=TARGET
)
# Wrap the fitted pipeline together with the metric used to evaluate it.
model_context = ModelContext(
    name="my_model",
    version="0.1",
    estimator=rf_model,
    evaluation_function=accuracy_score
)

In [None]:
from trubrics.testers.sklearn import SklearnTester

In [None]:
# Combine the data and model contexts into the tester object used by every test cell that follows.
model_tester = SklearnTester(data=data_context, model=model_context)

In [None]:
# Display the features the tester reports for the model
# (presumably the estimator's input columns; confirm against the trubrics docs).
model_tester.list_model_features()

## DS example tests

In [None]:
# robustness test
(
    model_tester.test_single_edge_case(edge_case_data=data_context.testing_data.tail(1), desired_output=1), # example of fail
    model_tester.test_single_edge_case(edge_case_data=data_context.testing_data.head(1), desired_output=1) # example of pass
)

In [None]:
# performance test
(
    model_tester.test_performance_against_threshold(threshold=0.8),
    model_tester.test_performance_against_threshold(threshold=0.75)
)

In [None]:
# fairness test
(
    model_tester.test_biased_performance_across_category(category="Embarked", threshold=0.1),
    model_tester.test_biased_performance_across_category(category="Sex", threshold=0.05)
)

In [None]:
# explainability test
(
    model_tester.test_feature_in_top_n_important_features(feature="Sex_female", feature_importance=feature_importance, top_n_features=3),
    model_tester.test_feature_in_top_n_important_features(feature="Age", feature_importance=feature_importance, top_n_features=2)
)

## Business user example tests

In [None]:
from trubrics.utils.loader import get_business_test_data
# Read the business-user test case: the input data and the outcome expected for it.
# The edge-case test itself is run in the following cell.
# NOTE(review): the effect of tracking=False is not visible in this notebook; confirm in trubrics.
data, expected_outcome = get_business_test_data(tracking=False)

In [None]:
# Run the single edge-case test on the business-supplied data, using its expected outcome as the desired output.
model_tester.test_single_edge_case(edge_case_data=data, desired_output=expected_outcome)