In [1]:
import pandas as pd
import numpy as np

from fcalc.classifier import PatternBinaryClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

from tqdm import tqdm

In [2]:
def cross_val_score(method, alpha, X, y, n_splits, scoring):
    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    scores = np.zeros((len(scoring), skf.get_n_splits()))

    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        categorical = np.arange(X_test.shape[1])[X_test.dtypes=="object"]
        model = PatternBinaryClassifier(
            X_train.values, y_train.values,
            categorical=categorical,
            method=method, alpha=alpha
        )
        model.predict(X_test.values)

        # we always classify non prediction as false prediction
        tmp_predict = np.copy(model.predictions)
        tmp_predict[tmp_predict == -1] = np.abs(y_test.values[tmp_predict == -1] - 1)

        for i, score in enumerate(scoring):
            scores[i] = score(y_test.values, tmp_predict)
    
    return scores.mean(axis = 1).round(3)

Reading data

In [9]:
all_data = pd.read_csv('creditcard_2023.csv')
all_data.drop(columns = ['id'], inplace = True)
all_data, tmp = train_test_split(all_data, shuffle=True, random_state=42, stratify=all_data['Class'], train_size=1e-3)
X, y = all_data.drop(columns = 'Class'), all_data['Class']

CV grid search

In [None]:
GS = {}
best_f1 = [-np.inf, [-1, -1]]
best_acc = [-np.inf, [-1, -1]]
for method in ("standard", "standard-support", "ratio-support"):
    for alpha in np.linspace(0, 1, 11):
        tqdm.write(method + ", alpha = " + str(alpha))
        scores = cross_val_score(method, alpha, X, y, 5, [f1_score, accuracy_score])
        GS.setdefault(method, {})
        GS[method].setdefault(alpha, scores)

        if scores[0] >= best_f1[0]:
            best_f1[1] = [method, alpha]
            best_f1[0] = scores[0]

        if scores[1] >= best_acc[0]:
            best_acc[1] = [method, alpha]
            best_acc[0] = scores[1]

ratio-support, alpha = 1:  
accuracy = 0.496  
f1 = 0.663

Recalculate cv scores on all data

In [8]:
all_data = pd.read_csv('creditcard_2023.csv')
all_data.drop(columns = ['id'], inplace = True)
all_data, tmp = train_test_split(all_data, shuffle=True, random_state=42, stratify=all_data['Class'], train_size=3e-3)
X, y = all_data.drop(columns = 'Class'), all_data['Class']

method, alpha = "ratio-support", 1

In [9]:
scores = cross_val_score(method, alpha, X, y, 5, [f1_score, accuracy_score])

                                                         

In [10]:
scores

array([0.668, 0.501])

standard, alpha = 0  
f1 = 0.668 
accuracy = 0.501