In [None]:
# Set the MLflow tracking URI for this kernel process before any MLflow call is made.
# NOTE(review): the `sqlite:///../data/...` form looks like a path relative to the kernel's
# working directory, so the notebook presumably has to be launched from its own folder - confirm.
# Keep this comment on its own line: text after the value on the %env line would become part of it.
%env MLFLOW_TRACKING_URI=sqlite:///../data/mlruns.db

In [None]:
import mlflow
import sys
sys.path.append('../')

import pandas as pd

from pathlib import Path

from lib.data_processing import FeatureEngineer
from lib.load_config import BaseConfig
from lib.model import FraudDetectionModel

In [None]:
# Project root is the parent of the directory the kernel was started in;
# all data files live in its `data` sub-folder.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath('data')

In [None]:
# Load the project configuration from <root>/config/config.yaml.
config_path = ROOT_DIR.joinpath('config', 'config.yaml')
config = BaseConfig.load_config(config_path)

# Read the three input tables from the data directory: the list of known fraud
# cases plus the training and test claim histories.
fraud_cases = pd.read_csv(DATA_DIR.joinpath('fraud_cases.csv'))
claim_history_train = pd.read_csv(DATA_DIR.joinpath('FRISS_ClaimHistory_training.csv'))
claim_history_test = pd.read_csv(DATA_DIR.joinpath('FRISS_ClaimHistory_test.csv'))

In [None]:
def match_fraud_cases(training_data: pd.DataFrame, fraud_cases: pd.DataFrame) -> pd.DataFrame:
    """Add a binary ``fraud_label`` column marking claims that are known fraud cases.

    A row is labelled ``1`` when its ``claim_id_mapping`` value, compared as a
    string, equals one of the ``ClaimID`` values in ``fraud_cases``; otherwise
    it is labelled ``0``.

    Args:
        training_data: Claims to label. Must contain a ``claim_id_mapping``
            column. The frame passed in is NOT modified.
        fraud_cases: Known fraud cases. Must contain a ``ClaimID`` column.
            Missing ``ClaimID`` values are ignored.

    Returns:
        A new DataFrame with the columns of ``training_data`` plus an integer
        ``fraud_label`` column (an existing ``fraud_label`` column is replaced).

    Raises:
        KeyError: If ``claim_id_mapping`` or ``ClaimID`` is missing.
    """
    # Drop missing ids first: casting them to str would produce literal text such
    # as "nan"/"None", which could then "match" a training row whose id is missing.
    known_fraud_ids = fraud_cases['ClaimID'].dropna().astype(str)

    # Compare as strings so numeric ids in one table still match textual ids in the other.
    is_fraud = training_data['claim_id_mapping'].astype(str).isin(known_fraud_ids)

    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    return training_data.assign(fraud_label=is_fraud.astype(int))

In [None]:
# Derive the id used for matching: the piece of `sys_claimid` that sits between
# the first and second "-" (index 1 of the split). A value without any "-"
# raises IndexError here.
raw_claim_ids = claim_history_train["sys_claimid"].astype(str)
claim_history_train["claim_id_mapping"] = raw_claim_ids.map(lambda claim_id: claim_id.split("-")[1])

# Attach the fraud label to the training claims.
claim_history_train = match_fraud_cases(
    training_data=claim_history_train,
    fraud_cases=fraud_cases,
)

In [None]:
# Build the feature engineer from the `preprocessing` section of the loaded config.
preprocessing_cfg = config.preprocessing
fe = FeatureEngineer(
    cols_to_drop=preprocessing_cfg.cols_to_drop,
    categorical_cols=preprocessing_cfg.categorical_cols,
    claim_occured_col=preprocessing_cfg.claim_occured_col,
    claim_reported_col=preprocessing_cfg.claim_reported_col,
    types_mapping=preprocessing_cfg.types_mapping,
)

In [None]:
# Fit the feature engineer on the training claims only; the test set is not passed to `fit`.
fe.fit(claim_history_train)

In [None]:
# Apply the fitted feature engineer to both splits.
# NOTE: both names are rebound to the transformed frames, so re-running this cell
# would feed already-transformed data back into `transform`; re-run from the
# data-loading cell instead.
claim_history_train = fe.transform(claim_history_train)
claim_history_test = fe.transform(claim_history_test)

In [None]:
# Share of each `fraud_label` value among all training rows.
claim_history_train["fraud_label"].value_counts().div(len(claim_history_train))

In [None]:
# Share of each `sys_fraud` value among all test rows.
claim_history_test["sys_fraud"].value_counts().div(len(claim_history_test))

In [None]:
# Class weight handed to the model: the square root of
# (number of rows labelled 0) / (number of rows labelled 1) in the training set.
fraud_label_counts = claim_history_train["fraud_label"].value_counts()
scale_pos_weight = (fraud_label_counts[0] / fraud_label_counts[1]) ** 0.5

In [None]:
# Train, evaluate and persist the fraud model inside a single MLflow run so that
# parameters, the AUC-ROC metric, the model and the pickled artefacts all end up
# under the same run id.
mlflow.set_experiment("fraud_detection_experiment")

with mlflow.start_run() as run:
    # Take the id from the run handle returned by `start_run` rather than
    # looking the active run up again.
    run_id = run.info.run_id

    # NOTE(review): `dict(config)` is logged as-is; how nested config sections are
    # rendered as parameter values depends on BaseConfig - confirm in the MLflow UI.
    mlflow.log_params(dict(config))
    mlflow.log_param("scale_pos_weight", scale_pos_weight)

    model = FraudDetectionModel(
        scale_pos_weight=scale_pos_weight,
        model_params=config.model.params,
        hp_config=config.model.hp_tuning,
    )

    # The label column is named differently in the two splits
    # (`fraud_label` for training, `sys_fraud` for test).
    X_train = claim_history_train.drop(columns=["fraud_label"])
    y_train = claim_history_train["fraud_label"]
    X_test = claim_history_test.drop(columns=["sys_fraud"])
    y_test = claim_history_test["sys_fraud"]

    model.train(X=X_train, y=y_train)
    evaluation = model.evaluate(X_test=X_test, y_test=y_test)

    mlflow.log_metric("auc_roc", evaluation["auc_roc"])
    mlflow.xgboost.log_model(model.model, "model")

    # Local pickle copies are keyed by run id so repeated runs never overwrite each other.
    fe_filepath = ROOT_DIR / 'models' / 'feature_engineering' / f'feature_engineer_{run_id}.pkl'
    model_filepath = ROOT_DIR / 'models' / 'fraud_detection' / f'fraud_detection_model_{run_id}.pkl'

    # Make sure the target folders exist: whether `save_model` creates them is not
    # visible from this notebook, and failing here would waste the training above.
    fe_filepath.parent.mkdir(parents=True, exist_ok=True)
    model_filepath.parent.mkdir(parents=True, exist_ok=True)

    fe.save_model(fe_filepath)
    model.save_model(model_filepath)

    # `log_artifact` documents `local_path` as a string, so pass plain str paths.
    mlflow.log_artifact(str(fe_filepath), artifact_path="feature_engineering")
    mlflow.log_artifact(str(model_filepath), artifact_path="fraud_detection")
