### Consider:
* Balance the data for faster training
* Remove features with low correlation to isfraud
* Group by user and derive per-user features: (mean amount) and (amount)/(mean amount)
* Refactor preprocessing
* Evaluate on the holdout set (confusion matrix, ROC-AUC)
* SHAP
* Create a branch that uses polars and compare its compute performance with the pandas version

In [None]:
import statistics as st

import numpy as np
import pandas as pd
import seaborn as sns
from kagglehub import dataset_load, KaggleDatasetAdapter

from sklearn.base import clone
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split, KFold
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay
# from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
# from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Load 'credit_card_transactions-ibm_v2.csv' from the Kaggle dataset
# 'ealtman2019/credit-card-transactions' through kagglehub's PANDAS adapter.
# NOTE(review): presumably this yields a pandas DataFrame, since the result is
# later passed to preprocess(); the call may hit the network on first use.
transactions = dataset_load(
    KaggleDatasetAdapter.PANDAS,
    'ealtman2019/credit-card-transactions',
    'credit_card_transactions-ibm_v2.csv',
)

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw transactions frame and derive model-ready columns.

    Column names are normalised to lowercase letters only (for example
    ``'Is Fraud?'`` becomes ``'isfraud'``). ``time`` is split into integer
    ``hour`` and ``minute`` columns, ``amount`` is parsed from a
    ``$``-prefixed string into a float, and ``isfraud`` becomes 1 for
    ``'Yes'`` and 0 for anything else.

    Args:
        df: Raw transactions. After name normalisation it must contain the
            columns ``time``, ``amount``, ``isfraud``, ``user`` and ``card``.

    Returns:
        A new DataFrame without the ``user``, ``card`` and ``time`` columns.
        The frame passed in is not modified.
    """
    # Format column names: lowercase all characters and remove non-alphabetics.
    # rename() returns a new frame, so the assignments below never touch the
    # caller's DataFrame.
    df = df.rename(
        columns={
            col: ''.join(char.lower() for char in col if char.isalpha())
            for col in df.columns
        }
    )

    # Parse hour and minute from the 'HH:MM' time strings.
    df['hour'] = df['time'].str[0:2].astype('int64')
    df['minute'] = df['time'].str[3:5].astype('int64')

    # Convert amount to float. regex=False makes '$' a literal character on
    # every pandas version instead of relying on the default of `regex`
    # (as a regular expression '$' is the end-of-string anchor).
    df['amount'] = df['amount'].str.replace('$', '', regex=False).astype('float')

    # Convert isfraud to binary with a vectorised comparison instead of a
    # per-row Python lambda.
    df['isfraud'] = df['isfraud'].eq('Yes').astype('int64')

    return df.drop(columns=['user', 'card', 'time'])

'''
def balance_data(df: pd.DataFrame) -> pd.DataFrame:
    # Sample the data, producing a dataset balanced on isfraud
    num_fraud = df.loc[df['isfraud'] == 1].shape[0]
    fraudulent = df.loc[df['isfraud'] == 1].sample(num_fraud)
    not_fraudulent = df.loc[df['isfraud'] == 0].sample(num_fraud)
    return pd.concat([fraudulent, not_fraudulent])
'''

In [None]:
transactions_processed = preprocess(transactions)

# Hold out 20% of the rows for final evaluation.
# - stratify keeps the fraud rate identical in the training and holdout parts,
#   which matters because the positive class is rare.
# - random_state pins the shuffle so that re-running the notebook reproduces
#   the same split (and therefore comparable scores).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    transactions_processed.drop(columns='isfraud'),
    transactions_processed['isfraud'],
    test_size=0.2,
    stratify=transactions_processed['isfraud'],
    random_state=42,
)

In [None]:
def cv_fit_model(model, X: pd.DataFrame, y: pd.Series, n_splits: int) -> tuple[list, list]:
    """Fit ``model`` with K-fold cross-validation and per-fold target encoding.

    For every fold a fresh ``TargetEncoder`` is fitted on that fold's training
    rows only, the categorical columns are replaced by
    ``target_encoded_<col>`` columns, and an independent clone of ``model`` is
    trained on the training rows and scored on the fold's test rows.

    Args:
        model: Scikit-learn estimator used as a template. It is cloned for
            each fold, so the object passed in is left unfitted.
        X: Feature frame; must contain every column listed in ``encode_cols``
            below.
        y: Binary target, positionally aligned with ``X``.
        n_splits: Number of folds handed to ``KFold``.

    Returns:
        ``(models, scores)`` in fold order: the estimator fitted on each fold
        and the value of its ``score`` method on that fold's test rows.
    """
    models, scores = [], []
    kf = KFold(n_splits=n_splits)

    # Columns to target encode, and the names of the columns that replace them.
    encode_cols = ['usechip', 'errors', 'merchantname', 'merchantcity', 'merchantstate', 'zip', 'mcc']
    encoded_col_names = [f'target_encoded_{col}' for col in encode_cols]

    for train_ind, test_ind in kf.split(X):

        # Separate X and y for this fold.
        fold_X_train, fold_y_train = X.iloc[train_ind], y.iloc[train_ind]
        fold_X_test, fold_y_test = X.iloc[test_ind], y.iloc[test_ind]

        # --- Target Encoding ---
        encoder = TargetEncoder(target_type='binary')

        # fit_transform (not fit followed by transform) is required for the
        # training rows: it cross-fits internally so that a row's encoding is
        # never computed from that row's own target, which would leak the
        # label into the features.
        train_encoded = pd.DataFrame(
            np.asarray(encoder.fit_transform(fold_X_train[encode_cols], fold_y_train)),
            index=fold_X_train.index,
            columns=encoded_col_names,
        )
        # Test rows are encoded with the statistics learned on the full
        # training part of the fold.
        test_encoded = pd.DataFrame(
            np.asarray(encoder.transform(fold_X_test[encode_cols])),
            index=fold_X_test.index,
            columns=encoded_col_names,
        )

        # Swap the raw categorical columns for their encoded counterparts.
        # Building new frames avoids assigning into a slice of X.
        fold_X_train = pd.concat(
            [fold_X_train.drop(columns=encode_cols), train_encoded], axis=1
        )
        fold_X_test = pd.concat(
            [fold_X_test.drop(columns=encode_cols), test_encoded], axis=1
        )
        # --- End Encoding ---

        # Clone so each fold keeps its own fitted estimator; refitting the
        # same object would leave every list entry pointing at the last fit.
        fold_model = clone(model)
        fold_model.fit(fold_X_train, fold_y_train)
        models.append(fold_model)
        scores.append(fold_model.score(fold_X_test, fold_y_test))

    return (models, scores)

In [None]:
'''
def median_model_ind(model_scores: tuple[list, list]) -> int:
    # Return model with median performance
    median_score = st.median(model_scores[1])
    return model_scores[1].index(median_score)
'''

In [None]:
# Decision tree model
# Cross-validate a decision tree (ccp_alpha=0.01) on the training split using
# 5 folds. The result is the (models, scores) tuple returned by cv_fit_model.
tree_model_scores = cv_fit_model(
    DecisionTreeClassifier(ccp_alpha=0.01),
    X_train,
    y_train,
    n_splits=5
)

In [None]:
# Show the per-fold scores: index 1 of the (models, scores) tuple.
tree_model_scores[1]

In [None]:
# Most significant splits:
# x[19]: target_encoded_zip
# x[17]: target_encoded_merchantcity
# x[16]: target_encoded_merchantname
# x[20]: target_encoded_mcc
# target_encoding(transactions_processed_balanced, target = 'isfraud')[0].columns[19]
# plot_tree(median_model(tree_model_scores))

In [None]:
# Logistic regression model
'''
logistic_model_scores = cv_fit_model(
    LogisticRegression(class_weight='balanced', solver='newton-cholesky'),
    transactions_processed.drop(columns='isfraud'),
    transactions_processed['isfraud'],
    n_splits = 5
)
'''

In [None]:
'''
# Histogram gradient boosting tree model
hgb_model_scores = cv_fit_model(
    HistGradientBoostingClassifier(validation_fraction = None),
    transactions_processed_balanced,
    target = 'isfraud',
    n_splits = 5
)
'''