In [18]:
import pandas as pd

# Load the full transaction log and split it by the fraud label.
df = pd.read_csv('../../Data/transactions/transaction_logs.csv')
unique_steps = df[['step']].drop_duplicates()
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Size each class so the sample keeps the original fraud / non-fraud ratio.
total_samples = len(df) // 10  # Downsample to 1/10th of the rows
fraud_ratio = len(fraud_df) / len(df)
fraud_samples = int(total_samples * fraud_ratio)
non_fraud_samples = total_samples - fraud_samples

# Randomly sample from each class (seeded so the selection is reproducible).
fraud_downsampled = fraud_df.sample(n=fraud_samples, random_state=1)
non_fraud_downsampled = non_fraud_df.sample(n=non_fraud_samples, random_state=1)
downsampled_df = pd.concat([fraud_downsampled, non_fraud_downsampled])

# Ensure that each 'step' is represented at least once.
# Concatenating the step-only `unique_steps` frame would append rows that are
# NaN in every other column (and upcast integer columns to float), so instead
# take one real transaction for every step the random sample missed.
# No drop_duplicates() is needed afterwards: the fillers come only from steps
# that are absent from the sample.
missing_step_rows = df[~df['step'].isin(downsampled_df['step'])]
step_fillers = missing_step_rows.drop_duplicates(subset='step')
downsampled_df = pd.concat([downsampled_df, step_fillers])

# Shuffle with a fixed seed, then order by 'step'. The stable sort preserves
# the shuffled order inside each step, so the written file is identical on
# every run.
downsampled_df = (
    downsampled_df
    .sample(frac=1, random_state=1)
    .sort_values(by='step', kind='stable')
    .reset_index(drop=True)
)
downsampled_df.to_csv('downsampled_transaction_logs.csv', index=False)