In [None]:
import pandas as pd
from tqdm import tqdm
import math
import os
import numpy as np

In [None]:
# Load the raw transactions table from a path relative to the working
# directory. Later cells rely on its "Amount" and "Class" columns.
credit_data = pd.read_csv(
    filepath_or_buffer="../data/creditcard/creditcard.csv",
)

In [None]:
import pandas as pd
import numpy as np
import os
from math import floor

# Relabel the data set: the top 1% of rows by "Amount" become Class 1 and all
# remaining rows become Class 0 (the original "Class" column is overwritten).
# NOTE(review): the new label is derived from "Amount" and "Amount" is left in
# the frame -- confirm downstream code does not use it as a feature.
credit_data = credit_data.sort_values("Amount", ascending=False)
credit_data['Class'] = np.where(
    np.arange(len(credit_data)) < floor(len(credit_data) * 0.01), 1, 0
)

group_sizes = [20, 30, 50, 100, 200]
random_seeds = [42, 2024, 2025]


def _build_groups(split_df, group_size, random_seed):
    """Partition one split into fixed-size groups tagged with a ``qid`` column.

    If the split holds at least one Class-1 row per group, every group is
    built from exactly one Class-1 row plus ``group_size - 1`` Class-0 rows
    and then shuffled internally. Otherwise the split is shuffled and cut
    into consecutive chunks of ``group_size`` rows with no guarantee about
    Class-1 membership. Leftover rows that do not fill a group are dropped.

    Args:
        split_df: DataFrame with a binary ``Class`` column.
        group_size: Number of rows per group; must be >= 1.
        random_seed: Integer seed controlling all sampling in this call.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with an added integer ``qid``
        column; empty (header only) when no complete group can be formed.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be >= 1, got {group_size}")

    positives = split_df[split_df["Class"] == 1]
    negatives = split_df[split_df["Class"] == 0]
    n_groups = len(split_df) // group_size
    negatives_per_group = group_size - 1
    groups = []

    if len(positives) >= n_groups:
        # Never promise more groups than the Class-0 pool can fill completely;
        # slicing past the end of the pool would silently yield short groups.
        if negatives_per_group > 0:
            n_groups = min(n_groups, len(negatives) // negatives_per_group)

        chosen_positives = positives.sample(
            n=n_groups, random_state=random_seed
        ).reset_index(drop=True)
        shuffled_negatives = negatives.sample(
            frac=1, random_state=random_seed
        ).reset_index(drop=True)

        # One shared RandomState: each .sample() call advances it, so every
        # group gets a different permutation. Re-seeding with the same integer
        # per group would put the Class-1 row at the same position everywhere.
        rng = np.random.RandomState(random_seed)
        for i in range(n_groups):
            start_idx = i * negatives_per_group
            end_idx = start_idx + negatives_per_group
            group = pd.concat([
                chosen_positives.iloc[[i]],
                shuffled_negatives.iloc[start_idx:end_idx],
            ])
            group["qid"] = i
            groups.append(group.sample(frac=1, random_state=rng))
    else:
        # Not enough Class 1 to guarantee one per group: random grouping.
        data_shuffled = split_df.sample(
            frac=1, random_state=random_seed
        ).reset_index(drop=True)
        for i in range(n_groups):
            start_idx = i * group_size
            # .assign returns a new frame, so no write happens on a slice.
            groups.append(
                data_shuffled.iloc[start_idx:start_idx + group_size].assign(qid=i)
            )

    if not groups:
        # pd.concat([]) raises; return an empty frame with the same columns.
        return split_df.iloc[0:0].assign(qid=np.array([], dtype=np.int64))
    return pd.concat(groups).reset_index(drop=True)


for fold, random_seed in enumerate(random_seeds, start=1):
    # Consistent split per fold: one shuffle, then a 70/10/20 cut by position.
    credit_data_shuffled = credit_data.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    total_samples = len(credit_data_shuffled)
    n_train = floor(total_samples * 0.7)
    n_val = floor(total_samples * 0.1)
    n_test = total_samples - n_train - n_val

    split_dict = {
        "train": credit_data_shuffled.iloc[:n_train].reset_index(drop=True),
        "val": credit_data_shuffled.iloc[n_train:n_train + n_val].reset_index(drop=True),
        "test": credit_data_shuffled.iloc[n_train + n_val:].reset_index(drop=True)
    }

    for group_size in group_sizes:
        # The output directory depends only on fold and group size.
        out_dir = f"../data/creditcard/fold{fold}/{group_size}"
        os.makedirs(out_dir, exist_ok=True)

        for split_name, split_df in split_dict.items():
            grouped_split_df = _build_groups(split_df, group_size, random_seed)

            print(f"\nFold: {fold}, Group Size: {group_size}, Split: {split_name}")
            print(f" Samples: {len(grouped_split_df)}")
            print(" Class distribution:", grouped_split_df["Class"].value_counts().to_dict())

            grouped_split_df.to_csv(f"{out_dir}/{split_name}.csv", index=False)
