In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

In [None]:
# Load the raw training data from CSV; the path is relative to the notebook's
# working directory.
train_df = pd.read_csv("../data/train.csv")

## preprocess

In [None]:
def _strip_title(row):
    """Extract the title token from a passenger name string.

    Takes the second comma-separated segment of ``row``, keeps the part
    before its first period, and removes every space, e.g.
    ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.
    """
    after_comma = row.split(",")[1]
    before_period = after_comma.split(".")[0]
    return before_period.replace(" ", "")


def preprocess(df):
    """Return a new frame prepared for modelling.

    Adds a ``Title`` column derived from ``Name``, rounds ``Fare`` to two
    decimals, and drops the ``Ticket``, ``Cabin``, ``Name`` and
    ``PassengerId`` columns. The input frame is not modified.
    """
    titles = df["Name"].apply(_strip_title)
    rounded_fare = df["Fare"].round(2)
    enriched = df.assign(Title=titles, Fare=rounded_fare)
    return enriched.drop(columns=["Ticket", "Cabin", "Name", "PassengerId"])


# Apply the preprocessing steps to the raw training frame.
preprocessed_train = preprocess(train_df)

## train

In [None]:
def init_training(categorical_features, numerical_features):
    """Build the (unfitted) preprocessing + random-forest pipeline.

    Args:
        categorical_features: columns sent through the categorical branch
            (most-frequent imputation, then one-hot encoding).
        numerical_features: columns sent through the numeric branch
            (mean imputation, then min-max scaling).

    Returns:
        A ``Pipeline`` with two steps: ``"processor"`` (a
        ``ColumnTransformer``) and ``"regressor"`` (a
        ``RandomForestClassifier`` with ``random_state=88``).
    """
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])

    # The encoder is deliberately left at its default output format rather
    # than being given `sparse=False`: that keyword was renamed to
    # `sparse_output` in scikit-learn 1.2 and removed in 1.4, so passing it
    # raises TypeError on current releases (and `sparse_output` does not
    # exist on older ones). Dense output is enforced version-independently
    # by the ColumnTransformer below.
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Output columns follow the order of this list: numeric block first,
    # then the one-hot block. `sparse_threshold=0` makes the transformer
    # always return a dense array, whatever the individual branches emit.
    processor = ColumnTransformer(
        transformers=[
            ('number', numeric_pipeline, numerical_features),
            ('category', categorical_pipeline, categorical_features)
        ],
        sparse_threshold=0,
    )

    # The final step is named 'regressor' although it is a classifier; the
    # name is kept because other cells look it up via named_steps["regressor"].
    pipeline = Pipeline(steps=[
        ('processor', processor),
        ('regressor', RandomForestClassifier(random_state=88))
    ])
    return pipeline


def get_features(df, target):
    """Return the column names of ``df``, in order, excluding ``target``."""
    feature_names = []
    for name in df.columns:
        if name != target:
            feature_names.append(name)
    return feature_names


# config
TARGET = "Survived"
CATEGORICAL_COLS = ["Sex", "Embarked", "Title"]
# Every remaining column that is neither categorical nor the target is
# treated as numerical.
NUMERICAL_COLS = [
    col
    for col in preprocessed_train.columns
    if col not in (*CATEGORICAL_COLS, TARGET)
]

# model
model_inputs = preprocessed_train[CATEGORICAL_COLS + NUMERICAL_COLS]
model_target = preprocessed_train[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    model_target,
    test_size=0.33,
    random_state=88,
)
training_pipeline = init_training(CATEGORICAL_COLS, NUMERICAL_COLS)
# rf_model is whatever fit() returns; for a scikit-learn Pipeline that is the
# fitted pipeline itself.
rf_model = training_pipeline.fit(X_train, y_train)

## evaluate

In [None]:
# accuracy_score's signature is (y_true, y_pred): ground truth first, then
# predictions. Accuracy is symmetric so the value is the same either way, but
# keeping the documented order avoids a silent error if the metric is swapped
# for an asymmetric one (precision, recall, ...).
accuracy = accuracy_score(y_test, rf_model.predict(X_test))
accuracy

## explain

In [None]:
# Fitted one-hot encoder from the categorical branch of the pipeline.
one_hot_encoder = (
    rf_model
    .named_steps["processor"]
    .named_transformers_["category"]
    .named_steps["one-hot"]
)
# Without input names the encoder reports generic prefixes (x0_, x1_, x2_);
# kept under its existing name for reference.
ohe_cols_raw = one_hot_encoder.get_feature_names_out()
# Passing the input column names makes the encoder emit "Sex_female",
# "Embarked_C", ... directly. This avoids rewriting "x0"/"x1"/"x2" with
# str.replace, which would also alter any category value that happens to
# contain one of those substrings. `.tolist()` keeps OHE_COLS a plain list so
# it can be concatenated with NUMERICAL_COLS.
OHE_COLS = one_hot_encoder.get_feature_names_out(CATEGORICAL_COLS).tolist()

In [None]:
# Map each transformed column name (numeric columns first, then the one-hot
# columns) to the importance the fitted forest assigns to it.
feature_names = NUMERICAL_COLS + OHE_COLS
importances = rf_model.named_steps["regressor"].feature_importances_
feature_importance = dict(zip(feature_names, importances))
feature_importance

## trubrics tests

In [None]:
from trubrics.utils.loader import get_business_test_data, save_test_to_json
from trubrics.mltests.robustness import test_single_edge_case
from trubrics.mltests.fairness import test_biased_performance_across_category
from trubrics.mltests.explainability import test_feature_in_top_n_important_features
from trubrics.mltests.performance import test_performance_against_threshold

In [None]:
# Value passed as the `runner` argument to the trubrics test calls in the
# cells below.
RUNNER = "notebook"

In [None]:
# read test data and run single outlier test
# NOTE(review): the contents of `data` / `expected_outcome` are defined by
# trubrics' loader and are not visible here - presumably one edge-case input
# and the prediction expected for it; confirm against the trubrics docs.
data, expected_outcome = get_business_test_data(tracking=False)

test_single_edge_case(rf_model, data, expected_outcome, runner=RUNNER)

In [None]:
# Fairness tests on the held-out split, once per category column.
# The labels are attached under the TARGET name (instead of a hard-coded
# "Survived") so the frame always agrees with the `target=` argument, and
# both calls use the shared RUNNER setting rather than a repeated literal.
# Each call gets its own freshly assembled frame.
test_biased_performance_across_category(
    rf_model,
    X_test.assign(**{TARGET: y_test}),
    category="Sex",
    target=TARGET,
    runner=RUNNER,
    threshold=0.05,
)
test_biased_performance_across_category(
    rf_model,
    X_test.assign(**{TARGET: y_test}),
    category="Embarked",
    target=TARGET,
    runner=RUNNER,
    threshold=0.1,
)

In [None]:
# Explainability tests against the `feature_importance` mapping built in the
# "explain" section.
# First check: "Age" is tested with top_n_features=2.
test_feature_in_top_n_important_features(
    feature="Age",
    feature_importance=feature_importance,
    top_n_features=2,
    runner=RUNNER
)
# Second check: "Sex_female" is a one-hot column name from OHE_COLS. No
# top_n_features is passed, so the function's own default applies (not
# visible in this notebook).
test_feature_in_top_n_important_features(
    feature="Sex_female",
    feature_importance=feature_importance,
    runner=RUNNER
)

In [None]:
# Performance tests: the same accuracy check on the held-out split at two
# thresholds (0.75 and 0.8).
# The labels are attached under the TARGET name (instead of a hard-coded
# "Survived") so the frame always agrees with the `target=` argument.
# Each call gets its own freshly assembled frame.
test_performance_against_threshold(
    model=rf_model,
    test_data=X_test.assign(**{TARGET: y_test}),
    evaluation_function=accuracy_score,
    target=TARGET,
    runner=RUNNER,
    threshold=0.75,
)
test_performance_against_threshold(
    model=rf_model,
    test_data=X_test.assign(**{TARGET: y_test}),
    evaluation_function=accuracy_score,
    target=TARGET,
    runner=RUNNER,
    threshold=0.8,
)

## save models

In [None]:
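# Save and load models (commented out; uncomment to persist or restore the fitted pipeline).
# NOTE(review): the dump and load paths below differ ('../models' vs '../assets/models') - confirm which is intended.
# joblib.load unpickles its input, so only load files from a trusted source.
#joblib.dump(rf_model, '../models/rf_model.pkl')
#rf_model = joblib.load('../assets/models/rf_model.pkl')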