### Consider:
* Engineer location-related features
* Group by user and add per-user features: the user's mean amount, and each transaction's (amount)/(user's mean amount)
* SHAP
* Branch with polars

In [None]:
from kagglehub import dataset_load, KaggleDatasetAdapter

import numpy as np
import pandas as pd
import seaborn as sns
import statistics as st

from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Kaggle dataset handle and the CSV file inside it that holds the transactions.
DATASET_HANDLE = 'ealtman2019/credit-card-transactions'
DATASET_FILE = 'credit_card_transactions-ibm_v2.csv'

# Load the CSV through kagglehub's pandas adapter.
transactions = dataset_load(KaggleDatasetAdapter.PANDAS, DATASET_HANDLE, DATASET_FILE)

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw transactions frame and derive basic features.

    Steps, in order:

    * Column names are reduced to their lowercase alphabetic characters
      (for example ``'Is Fraud?'`` becomes ``'isfraud'``).
    * ``time`` strings are split into integer ``hour`` (characters 0-1) and
      ``minute`` (characters 3-4) columns.
    * The literal ``'$'`` is stripped from ``amount`` and the result is cast
      to float.
    * ``isfraud`` becomes 1 where the value equals ``'Yes'`` and 0 otherwise.
    * The ``user``, ``card`` and ``time`` columns are dropped.

    Args:
        df: Raw transactions. After name normalisation it must contain the
            columns ``time``, ``amount``, ``isfraud``, ``user`` and ``card``;
            ``time`` and ``amount`` must hold strings.

    Returns:
        A new DataFrame with the cleaned columns. ``df`` is left untouched
        because all assignments happen on the renamed copy.
    """
    # Format column names: lowercase all characters and remove non-alphabetics
    clean_names = {
        col: ''.join(char.lower() for char in col if char.isalpha())
        for col in df.columns
    }
    out = df.rename(columns=clean_names)

    # Parse hour and minute data from time column
    out['hour'] = out['time'].str[0:2].astype('int64')
    out['minute'] = out['time'].str[3:5].astype('int64')

    # Convert amount to float. regex=False makes '$' a literal on every pandas
    # version; as a regular expression it would be the end-of-string anchor.
    out['amount'] = out['amount'].str.replace('$', '', regex=False).astype('float')

    # Convert isfraud to binary with a vectorised comparison instead of a
    # per-row Python lambda.
    out['isfraud'] = (out['isfraud'] == 'Yes').astype('int64')

    return out.drop(columns=['user', 'card', 'time'])

In [None]:
def _swap_in_encoded(frame: pd.DataFrame, encoded, encode_cols: list[str], encoded_col_names: list[str]) -> pd.DataFrame:
    """Return a copy of ``frame`` with ``encode_cols`` replaced by ``encoded``."""
    # np.asarray keeps the values positional even if the encoder was
    # configured to return a DataFrame with its own column labels.
    encoded_frame = pd.DataFrame(
        np.asarray(encoded),
        columns=encoded_col_names,
        index=frame.index,
    )
    return pd.concat([frame.drop(columns=encode_cols), encoded_frame], axis=1)


def _positive_class_scores(fitted_model, X: pd.DataFrame):
    """Return continuous scores for the positive class, if the model has any."""
    if hasattr(fitted_model, 'predict_proba'):
        # Column 1 is the second entry of classes_, i.e. the positive class
        # of a binary problem.
        return fitted_model.predict_proba(X)[:, 1]
    if hasattr(fitted_model, 'decision_function'):
        return fitted_model.decision_function(X)
    # Last resort: hard labels (yields a single-point ROC curve).
    return fitted_model.predict(X)


def cv_fit_model(model, X: pd.DataFrame, y: pd.Series, n_splits: int, encode_cols: list[str]) -> pd.DataFrame:
    """Cross-validate ``model`` with per-fold target encoding.

    For every ``KFold`` split the columns in ``encode_cols`` are replaced by
    ``target_encoded_<col>`` columns, a fresh clone of ``model`` is fitted on
    the training part and evaluated on the held-out part.

    Args:
        model: Unfitted scikit-learn classifier. It is cloned for every fold,
            so the instance passed in is never fitted.
        X: Feature frame; must contain every column in ``encode_cols``.
        y: Binary target aligned positionally with ``X``. Both classes are
            assumed to occur in every training fold.
        n_splits: Number of folds handed to ``KFold`` (no shuffling is done
            here).
        encode_cols: Categorical columns to target-encode. If empty, the
            encoding step is skipped.

    Returns:
        DataFrame with one row per fold and the columns ``model`` (the fitted
        clone), ``c_report`` (text classification report), ``con_matrix``
        (confusion matrix, rows = true labels) and ``roc_curve`` (the
        ``(fpr, tpr, thresholds)`` tuple).
    """
    # Imported locally so this cell does not depend on an edit to the import
    # cell; scikit-learn is already a dependency of this notebook.
    from sklearn.base import clone

    encoded_col_names = [f'target_encoded_{col}' for col in encode_cols]
    fold_results = []

    kf = KFold(n_splits=n_splits)
    for train_ind, test_ind in kf.split(X):

        # Separate X and y
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, y_test = X.iloc[test_ind], y.iloc[test_ind]

        # --- Target Encoding ---
        if encode_cols:
            encoder = TargetEncoder(target_type='binary')

            # fit_transform cross-fits internally, so a training row is never
            # encoded with its own target value; fit() followed by
            # transform() on the same rows would leak the target.
            train_encoded = encoder.fit_transform(X_train[encode_cols], y_train)
            # Held-out rows use the encoding learned from the full train fold.
            test_encoded = encoder.transform(X_test[encode_cols])

            # Build new frames rather than assigning into the iloc slices.
            X_train = _swap_in_encoded(X_train, train_encoded, encode_cols, encoded_col_names)
            X_test = _swap_in_encoded(X_test, test_encoded, encode_cols, encoded_col_names)
        # --- End Encoding ---

        # A fresh clone per fold keeps each stored model distinct.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)

        # scikit-learn metrics take the true labels first.
        fold_results.append({
            'model': fold_model,
            'c_report': classification_report(y_test, y_pred),
            'con_matrix': confusion_matrix(y_test, y_pred),
            'roc_curve': roc_curve(y_test, _positive_class_scores(fold_model, X_test)),
        })

    # A list of per-fold records gives one row per fold; array-valued
    # entries are stored as single object cells.
    return pd.DataFrame(fold_results)

In [None]:
# NOTE: this helper is disabled by wrapping it in a string literal, so
# `median_model` is never defined when the cell runs.
# NOTE(review): before re-enabling, reconcile it with its input. The annotation
# declares a 2-tuple, but the body reads indices [1] and [2], and
# `cv_fit_model` is annotated as returning a DataFrame rather than a tuple.
'''
def median_model(model_scores: tuple[list, list]):
    # Return model with median performance
    median_score = st.median(model_scores[2])
    model_ind = model_scores[2].index(median_score)
    return model_scores[1][model_ind]
'''

In [None]:
# Columns to target encode
encode_cols = [
    'usechip',
    'merchantname',
    'merchantcity',
    'merchantstate',
    'zip',
    'mcc',
    'errors',
]

# Clean the raw frame, then shuffle every row (frac=1 keeps all of them).
# NOTE: no random_state is passed, so the row order differs between runs.
transactions_processed = preprocess(transactions).sample(frac=1)

In [None]:
# Decision tree model
tree_model_scores = cv_fit_model(
    DecisionTreeClassifier(class_weight='balanced', ccp_alpha=0.01),
    transactions_processed.drop(columns='isfraud'),
    transactions_processed['isfraud'],
    n_splits=5,
    encode_cols=encode_cols
)

In [None]:
# Most significant splits:
# x[11]: target_encoded_zip
# x[8]: target_encoded_merchantname
# x[17]: target_encoded_merchantcity

# plot_tree(median_model(tree_model_scores))

In [None]:
# NOTE: this experiment is disabled by wrapping it in a string literal, so
# nothing below is executed.
# NOTE(review): the call no longer matches `cv_fit_model(model, X, y, n_splits,
# encode_cols)`: it passes `target=` instead of separate X / y and omits
# `encode_cols`. `transactions_processed_balanced` is also not defined in the
# visible part of this notebook -- confirm where it should come from.
'''
# Histogram gradient boosting tree model
hgb_model_scores = cv_fit_model(
    HistGradientBoostingClassifier(validation_fraction = None),
    transactions_processed_balanced,
    target = 'isfraud',
    n_splits = 5
)
'''