In [None]:
import sys
sys.path.append('../')

import pandas as pd

from pathlib import Path

from lib.data_processing import FeatureEngineer
from lib.load_config import BaseConfig

In [None]:
# The project root is taken to be the parent of the current working directory;
# the data directory sits directly beneath it.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath('data')

In [None]:
# Read the project configuration and the two raw CSV inputs from disk.
config = BaseConfig.load_config(ROOT_DIR.joinpath('config', 'config.yaml'))
fraud_cases = pd.read_csv(DATA_DIR.joinpath('fraud_cases.csv'))
claim_history_train = pd.read_csv(DATA_DIR.joinpath('FRISS_ClaimHistory_training.csv'))

In [None]:
# Derive the id used to look claims up in the fraud list: the second
# "-"-separated segment of sys_claimid.
# The vectorised .str accessor is used instead of a per-row lambda so that an
# id without a "-" yields NaN rather than aborting the cell with IndexError.
claim_history_train["claim_id_mapping"] = (
    claim_history_train["sys_claimid"].astype(str).str.split("-").str[1]
)

In [None]:
def match_fraud_cases(training_data: pd.DataFrame, fraud_cases: pd.DataFrame) -> pd.DataFrame:
    """Label each training claim as fraudulent (1) or not (0).

    A claim is labelled 1 when its ``claim_id_mapping`` value, compared as a
    string, appears among the values of ``fraud_cases['ClaimID']``. Missing
    identifiers on either side never produce a match.

    Args:
        training_data: Claims frame containing a ``claim_id_mapping`` column.
        fraud_cases: Frame containing a ``ClaimID`` column of known fraud ids.

    Returns:
        A copy of ``training_data`` with an integer ``fraud_label`` column
        added (or replaced); the input frame itself is left unmodified.
    """
    # Drop missing ids before stringifying: otherwise a missing value on both
    # sides turns into the same text and is reported as a fraud match.
    known_fraud_ids = set(fraud_cases['ClaimID'].dropna().astype(str))
    claim_ids = training_data['claim_id_mapping']
    is_fraud = claim_ids.notna() & claim_ids.astype(str).isin(known_fraud_ids)
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    return training_data.assign(fraud_label=is_fraud.astype(int))

In [None]:
# Attach the fraud_label column produced by match_fraud_cases.
claim_history_train = match_fraud_cases(
    training_data=claim_history_train,
    fraud_cases=fraud_cases,
)

In [None]:
# Build the feature engineer from the preprocessing section of the config.
preprocessing_cfg = config.preprocessing
fe = FeatureEngineer(
    cols_to_drop=preprocessing_cfg.cols_to_drop,
    categorical_cols=preprocessing_cfg.categorical_cols,
    claim_occured_col=preprocessing_cfg.claim_occured_col,
    claim_reported_col=preprocessing_cfg.claim_reported_col,
    types_mapping=preprocessing_cfg.types_mapping,
)

In [None]:
# Fit the feature engineer on the training claims.
# NOTE(review): what fit() learns is defined in lib.data_processing — confirm there.
fe.fit(claim_history_train)

In [None]:
# Replace the training frame with the output of the fitted feature engineer.
# NOTE(review): the result overwrites claim_history_train, so re-running this
# cell alone passes already-transformed data back into transform().
claim_history_train = fe.transform(claim_history_train)

In [None]:
# Show the per-column dtypes of the training frame for inspection.
claim_history_train.dtypes