# In [None]:
import os
import pandas as pd
import numpy as np

# ==============================================================================
#   USER CONFIGURATION SECTION
# ==============================================================================
# Module-level settings read by main(); edit these before running the script.

# 1. PATH SETTINGS
# ------------------------------------------------------------------------------
# CSV file to load; main() raises FileNotFoundError if this path does not exist.
INPUT_FILE = "/path/to/creditcard.csv"
# Directory that receives the output CSVs; main() creates it if it is missing.
SAVE_DIR = "/path/to/output_directory"

# 2. COLUMN SETTINGS
# ------------------------------------------------------------------------------
# Both columns must be present in the input, otherwise main() raises ValueError.
LABEL_COL = "Class"  # 1 = Fraud, 0 = Genuine
TIME_COL = "Time"    # Numerical timestamp column (rows are sorted by it)

# 3. SPLIT STRATEGY
# ------------------------------------------------------------------------------
# Fraction of fraud samples to allocate to the Test set (e.g., 0.2 = 20%).
# main() takes the last int(n_fraud * ratio) fraud rows in time order (at least
# one), uses the earliest timestamp among them as the cutoff, and sends every
# row -- fraud or genuine -- with a timestamp >= cutoff to the Test set.
# If the data contains no fraud rows, the same ratio is applied to the total
# row count instead (plain sequential split).
TEST_FRAUD_RATIO = 0.2

# 4. OUTPUT FILENAMES
# ------------------------------------------------------------------------------
# File names (not full paths); each is joined onto SAVE_DIR when saving.
OUT_TRAIN = "train_creditcard_timesplit.csv"
OUT_TEST = "test_creditcard_timesplit.csv"

# ==============================================================================
#   MAIN PIPELINE
# ==============================================================================

def _time_anchored_split(df, label_col, time_col, test_fraud_ratio):
    """Split a time-sorted frame into (train, test) at a fraud-anchored cutoff.

    Args:
        df: DataFrame already sorted ascending by ``time_col``.
        label_col: Name of the label column; rows equal to 1 count as fraud.
        time_col: Name of the numeric timestamp column.
        test_fraud_ratio: Fraction (0..1) of fraud rows targeted for the test
            set.

    Returns:
        A ``(train_df, test_df)`` tuple. With fraud rows present, the cutoff
        is the earliest timestamp among the last ``int(n_fraud * ratio)``
        (minimum 1) fraud rows; rows with a timestamp strictly below the
        cutoff go to train, all others to test. Without fraud rows, the frame
        is cut positionally at ``int(len(df) * (1 - ratio))``.
    """
    fraud_df = df[df[label_col] == 1]
    total_frauds = len(fraud_df)
    print(f"Total Fraud Samples: {total_frauds}")

    if total_frauds == 0:
        # Fallback if dataset has no fraud labels (rare edge case)
        print("Warning: No fraud labels found. Doing simple sequential split.")
        split_idx = int(len(df) * (1 - test_fraud_ratio))
        return df.iloc[:split_idx], df.iloc[split_idx:]

    # Keep at least one fraud sample in the test set, even for tiny ratios.
    n_test_fraud = max(1, int(total_frauds * test_fraud_ratio))

    # The frame is time-sorted, so the last N fraud rows are the latest ones;
    # the earliest timestamp among them is the cutoff for the whole dataset.
    split_timestamp = fraud_df[time_col].tail(n_test_fraud).min()

    print(f"Targeting {n_test_fraud} fraud samples for Test set.")
    print(f"Calculated Split Timestamp: {split_timestamp}")

    # Rows tied with the cutoff timestamp all land in the test set, so the
    # test set may hold more fraud rows than the targeted count.
    in_test = df[time_col] >= split_timestamp
    return df[~in_test], df[in_test]


def main():
    """Load the input CSV, split it by time into train/test, and save both.

    Reads the module-level settings (INPUT_FILE, SAVE_DIR, LABEL_COL,
    TIME_COL, TEST_FRAUD_RATIO, OUT_TRAIN, OUT_TEST), prints progress and
    per-split fraud statistics, and writes two CSV files into SAVE_DIR.

    Raises:
        ValueError: If TEST_FRAUD_RATIO is outside [0, 1], or if LABEL_COL or
            TIME_COL is missing from the input.
        FileNotFoundError: If INPUT_FILE does not exist.
    """
    # Fail fast, before touching the filesystem: a negative ratio would make
    # ``tail()`` select "all but the first k" rows and silently mis-split.
    if not 0 <= TEST_FRAUD_RATIO <= 1:
        raise ValueError(
            f"TEST_FRAUD_RATIO must be between 0 and 1, got {TEST_FRAUD_RATIO}"
        )

    # --- 1. Load Data ---
    print("=== Loading Data ===")
    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"File not found: {INPUT_FILE}")

    os.makedirs(SAVE_DIR, exist_ok=True)
    df = pd.read_csv(INPUT_FILE)
    print(f"Initial Rows: {len(df):,}, Cols: {len(df.columns)}")

    # --- 2. Validation & Cleaning ---
    if LABEL_COL not in df.columns or TIME_COL not in df.columns:
        raise ValueError(f"Missing required columns: {LABEL_COL} or {TIME_COL}")

    # Fill NaNs with 0 (Standard for this dataset if any exist)
    # NOTE(review): this also zero-fills NaNs in the label and time columns,
    # i.e. an unlabeled row becomes class 0 at time 0 -- confirm intended.
    n_null = df.isnull().sum().sum()
    if n_null > 0:
        print(f"Warning: Filled {n_null} NaN values with 0.")
        df = df.fillna(0)

    # --- 3. Time-Based Stratified Splitting ---
    print("\n=== Executing Time-Anchored Split ===")

    # Stable sort: rows sharing a timestamp keep their input order, so the
    # output (and the positional fallback split) is reproducible across runs.
    df = df.sort_values(by=TIME_COL, kind="mergesort").reset_index(drop=True)

    train_df, test_df = _time_anchored_split(
        df, LABEL_COL, TIME_COL, TEST_FRAUD_RATIO
    )

    # --- 4. Statistics & Save ---
    print(f"\nTrain set: {len(train_df):,} rows")
    print(f"Test set:  {len(test_df):,} rows")

    # An empty partition (e.g. the cutoff equals the earliest timestamp) has
    # an undefined fraud rate; say so instead of just printing "nan%".
    for split_name, part in (("Train", train_df), ("Test", test_df)):
        if part.empty:
            print(f"Warning: {split_name} set is empty; its fraud rate is undefined.")

    # Calculate Fraud Ratios
    train_fraud_rate = train_df[LABEL_COL].mean()
    test_fraud_rate = test_df[LABEL_COL].mean()

    print(f"Fraud Rate (Train): {train_fraud_rate:.4%} ({train_df[LABEL_COL].sum()} cases)")
    print(f"Fraud Rate (Test):  {test_fraud_rate:.4%} ({test_df[LABEL_COL].sum()} cases)")

    train_path = os.path.join(SAVE_DIR, OUT_TRAIN)
    test_path = os.path.join(SAVE_DIR, OUT_TEST)

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print("\n=== Processing Complete ===")
    print(f"Saved Train: {train_path}")
    print(f"Saved Test:  {test_path}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()