# Loading data and requirements

In [18]:
!pip install -r requirements.txt



In [19]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [20]:
# Read the full transaction table from the CSV one directory above the working directory.
df = pd.read_csv(filepath_or_buffer='../paysim.csv')

In [21]:
# Down-sample `df` to a fixed-size working set with a chosen isFraud class balance.

# Target share of each class, in percent.
percentage_isFraud_0 = 99.87
percentage_isFraud_1 = 0.13

total_sample_size = 20000  # total number of rows to keep; adjust as needed
random_seed = 0  # fixed seed so a fresh "Restart & Run All" draws the same rows

# Row counts per class; the fraud count is the remainder so both always sum to total_sample_size.
num_isFraud_0 = int(total_sample_size * percentage_isFraud_0 / 100)
num_isFraud_1 = total_sample_size - num_isFraud_0

non_fraud_pool = df[df['isFraud'] == 0]
fraud_pool = df[df['isFraud'] == 1]

# Sample without replacement whenever the pool is large enough: rows drawn with
# replacement can be duplicated, and duplicates may later end up in different
# train/val/test splits. Replacement is only used when a pool is too small.
sample_isFraud_0 = non_fraud_pool.sample(
    n=num_isFraud_0,
    replace=len(non_fraud_pool) < num_isFraud_0,
    random_state=random_seed,
)
sample_isFraud_1 = fraud_pool.sample(
    n=num_isFraud_1,
    replace=len(fraud_pool) < num_isFraud_1,
    random_state=random_seed,
)

# Combine both classes into one frame
sample = pd.concat([sample_isFraud_0, sample_isFraud_1])

# Shuffle the rows (reproducibly) and renumber the index 0..n-1
df = sample.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# Data preparation

In [22]:
# Rename 'oldbalanceOrg' to 'oldbalanceOrig' so it matches the spelling of 'newbalanceOrig'.
df = df.rename({'oldbalanceOrg': 'oldbalanceOrig'}, axis='columns')

In [23]:
# Rebuild two balance columns from the transaction amount, as decided in the EDA:
#   - share of rows with a balance error on the account giving money:    85.0 %
#   - share of rows with a balance error on the account receiving money: 100.0 %
# The recorded values of both columns are overwritten.

# Destination balance after the transfer = balance before + amount received
df['newbalanceDest'] = df['oldbalanceDest'].add(df['amount'])
# Origin balance before the transfer = balance after + amount sent
df['oldbalanceOrig'] = df['newbalanceOrig'].add(df['amount'])

In [24]:
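# Disabled: flag for destination accounts whose old and new balance are both 0 (only 6 rows were true)
# df['externalDest'] = ((df['oldbalanceDest'] == 0) & (df['newbalanceDest'] == 0)).astype(int)
# Disabled: flag for origin accounts whose old and new balance are both 0 (only 16 rows were true)
# df['externalOrig'] = ((df['oldbalanceOrig'] == 0) & (df['newbalanceOrig'] == 0)).astype(int)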

# Feature engineering

In [25]:
# Calendar-style features derived from `step`, which is treated as an hour counter
# (24 steps = one day) throughout this cell.

# Hour of the day, 0..23
df['hour'] = df['step'] % 24

# Position inside a 7-day cycle, 1..7, counted from the first day of the data.
# NOTE(review): no offset is applied, so 1 is not guaranteed to be Monday; the
# numbers are only consistent relative to each other.
df['weekday'] = (df['step'] // 24) % 7 + 1

# Cycle days 4 and 5 have the fewest transactions and are therefore treated as
# the weekend; every other day is a workday (1 = workday, 0 = weekend).
df['is_workday'] = (~df['weekday'].isin([4, 5])).astype(int)

# Position inside a 30-day cycle, 1..30. `step` is converted to whole days first:
# taking `step % 30` directly would cycle every 30 hours, not every 30 days.
df['monthday'] = (df['step'] // 24) % 30 + 1

In [26]:
# Rolling mean and maximum of `amount` over the last 3 and 7 transactions of each
# recipient (`nameDest`). The window includes the current row, and min_periods=1
# means recipients with a shorter history still get a value.
#
# The rows are ordered by `step` before grouping so that "last N transactions"
# follows time order rather than the current (shuffled) row order. The results keep
# the original index labels, so assigning them back aligns each value with its row
# and leaves the row order of `df` untouched.
amount_by_dest = df.sort_values('step', kind='stable').groupby('nameDest')['amount']

# Outer loop over the statistic keeps the column order meanDest3, meanDest7, maxDest3, maxDest7.
for stat in ('mean', 'max'):
    for window in (3, 7):
        rolled = getattr(amount_by_dest.rolling(window=window, min_periods=1), stat)()
        # Drop the group level so only the original row labels remain for alignment.
        df[f'{stat}Dest{window}'] = rolled.reset_index(level=0, drop=True)

In [27]:
# Derive `type2`, a two-letter code for who sent to whom: the first letter describes
# the origin account, the second the destination account (C = customer, M = merchant).
# A letter applies when the account name contains it; rows matching none of the four
# combinations get None.

choices = ['CC', 'CM', 'MC', 'MM']

# One boolean mask per code, in the same order as `choices` (earlier codes win on overlap).
conditions = [
    df['nameOrig'].str.contains(code[0]) & df['nameDest'].str.contains(code[1])
    for code in choices
]

df['type2'] = np.select(conditions, choices, default=None)

In [28]:
# One-hot encode the two transaction-type columns.
# dtype=int pins the indicator columns to 0/1: newer pandas versions default to
# bool, which would be written to the CSV files as True/False instead of numbers.
df = pd.get_dummies(df, columns=['type', 'type2'], dtype=int)

# Transformation

In [29]:
# Log-scale the transaction amount.
# log1p(x) = log(1 + x) stays finite for a zero amount, whereas log(0) is -inf
# and would put non-finite values into the exported feature table.
df['log_amount'] = np.log1p(df['amount'])

# ML preparation

In [30]:
# Discard the columns that are not used as model inputs
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

In [31]:
# Make 'isFraud' the last column so it serves as the Y column:
# pop() removes it from the frame, the assignment re-adds it at the end.
df['isFraud'] = df.pop('isFraud')

In [32]:
# Split into train / test / validation sets with a 0.70 / 0.15 / 0.15 ratio.
# Both steps are stratified on 'isFraud': the positive class is so rare that a
# purely random split can leave a set with hardly any fraud rows.

# Step 1: 70 % train, 30 % hold-out
train_df, holdout_df = train_test_split(
    df, test_size=0.3, random_state=0, shuffle=True, stratify=df['isFraud']
)
# Step 2: halve the hold-out into test and validation (15 % of the data each)
test_df, val_df = train_test_split(
    holdout_df, test_size=0.5, random_state=0, shuffle=True, stratify=holdout_df['isFraud']
)

In [33]:
# Renumber the rows of every split from 0, discarding the index inherited from `df`
train_df, test_df, val_df = (
    split.reset_index(drop=True) for split in (train_df, test_df, val_df)
)

In [34]:
save_directory = "../deep-symbolic-optimization/dso/dso/task/regression/data"

# Create the output directory together with any missing parents; exist_ok avoids
# the separate existence check and does not fail when the directory is already there.
os.makedirs(save_directory, exist_ok=True)

# Write every split as a headerless CSV without the index column. Because no
# header is written, the column order (label 'isFraud' last) is the only schema.
for file_name, split_df in (
    ("train_df.csv", train_df),
    ("test_df.csv", test_df),
    ("val_df.csv", val_df),
):
    split_df.to_csv(os.path.join(save_directory, file_name), header=False, index=False)