### Consider:
* Engineer new features related to high-importance features
* Group by user and produce per-user features: (mean amount) and (amount)/(mean amount). These must be computed before `preprocess` drops the `user` column.
* SHAP
* Branch with polars

In [None]:
from kagglehub import dataset_load, KaggleDatasetAdapter
import pandas as pd
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def preprocess(df:pd.DataFrame) -> pd.DataFrame:
    """Clean a raw transactions frame into model-ready columns.

    Steps, in order:
      1. Column names are lowercased and stripped of every non-alphabetic
         character (e.g. ``'Is Fraud?'`` becomes ``'isfraud'``).
      2. ``hour`` and ``minute`` integer columns are sliced out of the
         ``time`` strings (characters 0-1 and 3-4 respectively).
      3. ``amount`` has its literal ``'$'`` removed and is cast to float.
      4. ``isfraud`` becomes 1 where the value equals ``'Yes'``, else 0.
      5. The ``user``, ``card`` and ``time`` columns are dropped.

    Args:
        df: Raw frame. After name normalization it must contain the columns
            ``time``, ``amount``, ``isfraud``, ``user`` and ``card``.

    Returns:
        A new DataFrame; the columns of ``df`` itself are not modified
        because all work happens on the frame returned by ``rename``.
    """
    # Format column names: lowercase all characters and remove non-alphabetics
    df = df.rename(columns={
        col: ''.join(char.lower() for char in col if char.isalpha())
        for col in df.columns
    })

    # Parse hour and minute data from time column
    df['hour'] = df['time'].str[0:2].astype('int64')
    df['minute'] = df['time'].str[3:5].astype('int64')

    # Convert amount to float. regex=False is required: as a regular
    # expression '$' is the end-of-string anchor, so on pandas versions where
    # str.replace defaults to regex=True the dollar sign would never be
    # stripped and the float cast would fail.
    df['amount'] = df['amount'].str.replace('$', '', regex=False).astype('float')

    # Convert isfraud to binary (vectorized; anything other than 'Yes' maps to 0)
    df['isfraud'] = df['isfraud'].eq('Yes').astype('int64')

    return df.drop(columns=['user', 'card', 'time'])


def balance(df:pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Undersample so fraud and non-fraud rows appear in equal numbers.

    Rows are split on ``isfraud`` (1 vs 0). The larger class is randomly
    downsampled, without replacement, to the size of the smaller one.

    Args:
        df: Frame containing a binary ``isfraud`` column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection is reproducible. ``None`` (the default) keeps the
            original unseeded behaviour.

    Returns:
        The fraud rows followed by the non-fraud rows, with their original
        index labels preserved.
    """
    fraud = df.loc[df['isfraud']==1]
    not_fraud = df.loc[df['isfraud']==0]

    # Target size is the minority class, so sampling without replacement can
    # never ask for more rows than exist (previously a ValueError when fraud
    # rows outnumbered non-fraud rows).
    n_per_class = min(len(fraud), len(not_fraud))

    if len(fraud) > n_per_class:
        fraud = fraud.sample(n_per_class, random_state=random_state)
    # Always sampled, even when already the right size, so the non-fraud rows
    # come back in random order as before.
    not_fraud = not_fraud.sample(n_per_class, random_state=random_state)

    return pd.concat([fraud, not_fraud])

In [None]:
def encode_train(model, X:pd.DataFrame, y:pd.Series, n_splits:int, encode_cols:list[str]) -> list[tuple]:
    """Cross-validate ``model`` with per-fold target encoding.

    For each of ``n_splits`` K-fold splits, the columns in ``encode_cols``
    are replaced by ``target_encoded_<col>`` columns (appended after the
    remaining features), the model is fitted on the training part and scored
    on the held-out part.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and
            ``feature_importances_``. The same object is refitted in every
            fold, so after the call it holds the fit from the last fold.
        X: Feature frame containing every column in ``encode_cols``.
        y: Binary target, positionally aligned with ``X``.
        n_splits: Number of folds. ``KFold`` is used without shuffling, so
            the caller is responsible for the row order of ``X``.
        encode_cols: Names of the columns to target-encode.

    Returns:
        One ``(precision, recall, importances)`` tuple per fold, where
        ``importances`` is a list of ``(feature_name, importance)`` pairs
        for features with importance greater than zero.
    """
    model_scores = []

    # Output column names do not depend on the fold; build them once.
    encoded_col_names = [f'target_encoded_{col}' for col in encode_cols]

    kf = KFold(n_splits)
    for train_ind, test_ind in kf.split(X):

        # Separate X and y
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, y_test = X.iloc[test_ind], y.iloc[test_ind]

        # --- Target Encoding ---
        encoder = TargetEncoder(target_type='binary')

        # Training rows must go through fit_transform, which cross-fits
        # internally so that no row is encoded using its own label. Calling
        # fit() and then transform() on the same rows leaks the target into
        # the training features and inflates the importance of the encoded
        # columns.
        X_train_encoded = encoder.fit_transform(X_train[encode_cols], y_train)

        # Held-out rows use the encoding learned from the full training fold.
        X_test_encoded = encoder.transform(X_test[encode_cols])

        # Swap the raw categorical columns for their encoded counterparts.
        # drop() returns a fresh frame, so the column assignment below never
        # writes into a slice of the caller's X.
        X_train = X_train.drop(columns=encode_cols)
        X_train[encoded_col_names] = X_train_encoded
        X_test = X_test.drop(columns=encode_cols)
        X_test[encoded_col_names] = X_test_encoded
        # --- End Encoding ---

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        importances = [
            (name, imp)
            for name, imp in zip(X_train.columns, model.feature_importances_)
            if imp > 0
        ]

        model_scores.append(
            (precision_score(y_test, y_pred, average='binary'),
            recall_score(y_test, y_pred, average='binary'),
            importances)
        )

    return model_scores

In [None]:
# Load the file 'credit_card_transactions-ibm_v2.csv' from the Kaggle dataset
# 'ealtman2019/credit-card-transactions' through the kagglehub pandas adapter.
# Despite its name, X is the raw table: it still holds the fraud label, which
# is only separated from the features after preprocessing and balancing.
X = dataset_load(
    KaggleDatasetAdapter.PANDAS,
    'ealtman2019/credit-card-transactions',
    'credit_card_transactions-ibm_v2.csv',
)

In [None]:
# Display the normalized column names produced by preprocess().
# Note: this runs the full preprocessing pass on X only to show the columns;
# the resulting frame is discarded.
preprocess(X).columns

In [None]:
# Clean the raw table, then undersample to equal fraud / non-fraud counts.
X_bal = balance(preprocess(X))

# Split features from the 'isfraud' label and hold out a validation set.
# No random_state is passed here, so the split differs between runs.
X_train, X_val, y_train, y_val = train_test_split(X_bal.drop(columns='isfraud'), X_bal['isfraud'])

# Categorical columns that encode_train() replaces with target-encoded versions.
encode_cols = ['usechip', 'merchantname', 'merchantcity', 'merchantstate', 'zip', 'mcc', 'errors']

In [None]:
# Gradient boosting tree model (sklearn GradientBoostingClassifier, not the
# histogram-based variant), constructed with its default hyperparameters.
# Run through 5-fold cross-validation on the training split to examine
# feature importances; each entry of gb_scores is one fold's
# (precision, recall, importances) tuple.
gb_scores = encode_train(
    GradientBoostingClassifier(),
    X_train,
    y_train,
    n_splits=5,
    encode_cols=encode_cols
)

In [None]:
# Show the (feature, importance) pairs from the first fold only
# (index 0 = first fold, index 2 = the importances element of its tuple).
# target_encoded_zip and target_encoded_merchantname are by far the most important features
# NOTE(review): both are target-encoded columns — confirm the ranking is not
# an artifact of how the training rows were encoded before acting on it.
gb_scores[0][2]