# Loading data and requirements

In [66]:
!pip install -r requirements.txt



In [8]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [68]:
df = pd.read_csv('../data/paysim.csv')

# Data preparation

In [70]:
df = df.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})

In [71]:
# Set new balance and original balance based on transaction amount based on EDA
# Percentage of observations with balance errors in the account giving money:  85.0
# Percentage of observations with balance errors in the account receiving money:  100.0

df['newbalanceDest'] = df['oldbalanceDest'] + df['amount']
df['oldbalanceOrig'] = df['newbalanceOrig'] + df['amount']

# Feature engineering

In [7]:
transactions = df.shape[0]

def add_noise(df, column, noise_level=0.05):
    noise = np.random.normal(0, noise_level * df[column].std(), size=df[column].shape)
    df[column] += noise

add_noise(df, 'amount')
add_noise(df, 'oldbalanceOrig')
add_noise(df, 'newbalanceOrig')
add_noise(df, 'oldbalanceDest')
add_noise(df, 'newbalanceDest')

Unnamed: 0,step,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,...,monthday,meanDest3,meanDest7,maxDest3,maxDest7,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,9839.64,0,0,...,2,9.839640e+03,9.839640e+03,9839.64,9839.64,0,0,0,1,0
1,1,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,1864.28,0,0,...,2,1.864280e+03,1.864280e+03,1864.28,1864.28,0,0,0,1,0
2,1,181.00,C1305486145,181.00,0.00,C553264065,0.00,181.00,1,0,...,2,1.810000e+02,1.810000e+02,181.00,181.00,0,0,0,0,1
3,1,181.00,C840083671,181.00,0.00,C38997010,21182.00,21363.00,1,0,...,2,1.810000e+02,1.810000e+02,181.00,181.00,0,1,0,0,0
4,1,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,11668.14,0,0,...,2,1.166814e+04,1.166814e+04,11668.14,11668.14,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0,...,24,3.323914e+05,3.323914e+05,339682.13,339682.13,0,1,0,0,0
6362616,743,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,6311409.28,1,0,...,24,6.311409e+06,6.311409e+06,6311409.28,6311409.28,0,0,0,0,1
6362617,743,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.12,1,0,...,24,2.212471e+06,2.212471e+06,6311409.28,6311409.28,0,1,0,0,0
6362618,743,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,850002.52,1,0,...,24,8.500025e+05,8.500025e+05,850002.52,850002.52,0,0,0,0,1


In [73]:
# Extracting hour of the day from the 'step' column
df['hour'] = df['step']% 24

# Extracting day of the week as integers, add 3 to convert it to correct days of the week (1 = monday, 7 = sunday)
df['weekday'] = (df['step'] // 24) % 7 + 1

# Create is_workday feature based on the 2 least transaction dates being the weekend
df['is_workday'] = df['weekday'].apply(lambda x: 0 if x == 4 or x == 5 else 1)

# Extracting day of the week as integers
df['monthday'] = (df['step'] % 30) + 1

In [None]:
# calculate the rolling average of last 3 and 7 transactions for each recipient
df['meanDest3'] = df.groupby('nameDest')['amount'].rolling(window=3, min_periods=1).mean().reset_index(0, drop=True)
df['meanDest7'] = df.groupby('nameDest')['amount'].rolling(window=7, min_periods=1).mean().reset_index(0, drop=True)

# calculate the rolling maximum of last 3 and 7 transactions for each recipient
df['maxDest3'] = df.groupby('nameDest')['amount'].rolling(window=3, min_periods=1).max().reset_index(0, drop=True)
df['maxDest7'] = df.groupby('nameDest')['amount'].rolling(window=7, min_periods=1).max().reset_index(0, drop=True)

# Transformation

In [23]:
# # One hot encode type columns
df = pd.get_dummies(df, columns=['type'])

In [24]:
columns = ['oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'amount', 'meanDest3', 'meanDest7', 'maxDest3', 'maxDest7']
def create_fuzzy_sets_percentile(column_data, percentiles):
    fuzzy_sets = []
    for value in column_data:
        if value <= percentiles[0.2]:
            fuzzy_sets.append(0.2)
        elif value <= percentiles[0.4]:
            fuzzy_sets.append(0.4)
        elif value <= percentiles[0.6]:
            fuzzy_sets.append(0.6)
        elif value <= percentiles[0.8]:
            fuzzy_sets.append(0.8)
        else:
            fuzzy_sets.append(1.0)
    return fuzzy_sets

# Calculate percentiles for each column
percentiles = df.quantile([0.2, 0.4, 0.6, 0.8]).to_dict()

# Create fuzzy sets for each column
fuzzy_sets_percentile = {}
for column in columns:
    fuzzy_sets_percentile[column] = create_fuzzy_sets_percentile(df[column], percentiles[column])

# Overwrite the values in the DataFrame with the new fuzzy sets values
for column in columns:
    df[column] = fuzzy_sets_percentile[column]

Unnamed: 0,step,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,...,monthday,meanDest3,meanDest7,maxDest3,maxDest7,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,0.2,C1231006815,0.6,0.8,M1979787155,0.2,0.2,0,0,...,2,0.2,0.2,0.2,0.2,0,0,0,1,0
1,1,0.2,C1666544295,0.2,0.8,M2044282225,0.2,0.2,0,0,...,2,0.2,0.2,0.2,0.2,0,0,0,1,0
2,1,0.2,C1305486145,0.2,0.2,C553264065,0.2,0.2,1,0,...,2,0.2,0.2,0.2,0.2,0,0,0,0,1
3,1,0.2,C840083671,0.2,0.2,C38997010,0.6,0.4,1,0,...,2,0.2,0.2,0.2,0.2,0,1,0,0,0
4,1,0.4,C2048537720,0.4,0.8,M1230701703,0.2,0.4,0,0,...,2,0.4,0.4,0.4,0.4,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,1.0,C786484425,0.8,0.2,C776919290,0.2,0.6,1,0,...,24,1.0,1.0,0.8,0.8,0,1,0,0,0
6362616,743,1.0,C1529008245,1.0,0.2,C1881841831,0.2,1.0,1,0,...,24,1.0,1.0,1.0,1.0,0,0,0,0,1
6362617,743,1.0,C1162922333,1.0,0.2,C1365125890,0.6,1.0,1,0,...,24,1.0,1.0,1.0,1.0,0,1,0,0,0
6362618,743,1.0,C1685995037,1.0,0.2,C2080388513,0.2,0.8,1,0,...,24,1.0,1.0,1.0,1.0,0,0,0,0,1


# ML preparation

In [25]:
# Remove unused columns
df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud', 'amount', 'hour', 'weekday', 'monthday', 'step'], axis=1, inplace=True)

In [26]:
#Relocate fraud column to back of df
fraud_col = df.pop('isFraud')
df['isFraud'] = fraud_col

In [31]:
df = pd.read_csv('../data/final_df.csv')
df

Unnamed: 0,oldbalanceOrig,newbalanceOrig,oldbalanceDest,newbalanceDest,externalDest,externalOrig,is_workday,meanDest3,meanDest7,maxDest3,maxDest7,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud
0,0.6,0.8,0.2,0.2,0,0,1,0.2,0.2,0.2,0.2,0,0,0,1,0,0
1,0.2,0.8,0.2,0.2,0,0,1,0.2,0.2,0.2,0.2,0,0,0,1,0,0
2,0.2,0.2,0.2,0.2,0,0,1,0.2,0.2,0.2,0.2,0,0,0,0,1,1
3,0.2,0.2,0.6,0.4,0,0,1,0.2,0.2,0.2,0.2,0,1,0,0,0,1
4,0.4,0.8,0.2,0.4,0,0,1,0.4,0.4,0.4,0.4,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,0.8,0.2,0.2,0.6,0,0,1,1.0,1.0,0.8,0.8,0,1,0,0,0,1
6362616,1.0,0.2,0.2,1.0,0,0,1,1.0,1.0,1.0,1.0,0,0,0,0,1,1
6362617,1.0,0.2,0.6,1.0,0,0,1,1.0,1.0,1.0,1.0,0,1,0,0,0,1
6362618,1.0,0.2,0.2,0.8,0,0,1,1.0,1.0,1.0,1.0,0,0,0,0,1,1


In [48]:
# Define the distribution percentages
percentage_isFraud_0 = 99.87
percentage_isFraud_1 = 0.13

# Calculate the number of instances for each category based on the desired percentages
total_sample_size = 1000000  # You can adjust this as per your requirement
num_isFraud_0 = int(total_sample_size * percentage_isFraud_0 / 100)
num_isFraud_1 = total_sample_size - num_isFraud_0

# Generate the sample DataFrame with the desired distribution
sample_isFraud_0 = df[df['isFraud'] == 0].sample(n=num_isFraud_0, replace=True)
sample_isFraud_1 = df[df['isFraud'] == 1].sample(n=num_isFraud_1, replace=True)

# Concatenate the samples to form the final sample DataFrame
sample = pd.concat([sample_isFraud_0, sample_isFraud_1])

# Shuffle the rows to randomize the order
df = sample.sample(frac=1).reset_index(drop=True)

In [49]:
from sklearn.model_selection import train_test_split

# Create train/val/test set following 0.8/0.1/0.1 split with balanced classes
train_df, test_df = train_test_split(df, test_size=0.3, stratify=df['isFraud'], random_state=42, shuffle=True)

In [50]:
# reset index
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [51]:
import os
# save_directory = "../deep-symbolic-optimization/dso/dso/task/regression/data"

save_directory = "../data"

# Check if the directory exists, if not, create it
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

# Save DataFrames to CSV files
train_df.to_csv(os.path.join(save_directory, "1m_train.csv"), header = False, index=False)
test_df.to_csv(os.path.join(save_directory, "1m_test.csv"), header = False, index=False)