# Loading data and requirements

In [20]:
# Install pinned dependencies into the environment of the running kernel.
# %pip (rather than !pip) guarantees the packages land in the interpreter this notebook uses.
%pip install -r requirements.txt



In [21]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [22]:
# Load the transactions CSV that sits one directory above the notebook
paysim_path = '../paysim.csv'
df = pd.read_csv(paysim_path)

In [23]:
# # Assuming your DataFrame is named df
# # Define the distribution percentages
# percentage_isFraud_0 = 99.87
# percentage_isFraud_1 = 0.13

# # Calculate the number of instances for each category based on the desired percentages
# total_sample_size = 5000  # You can adjust this as per your requirement
# num_isFraud_0 = int(total_sample_size * percentage_isFraud_0 / 100)
# num_isFraud_1 = total_sample_size - num_isFraud_0

# # Generate the sample DataFrame with the desired distribution
# sample_isFraud_0 = df[df['isFraud'] == 0].sample(n=num_isFraud_0, replace=True)
# sample_isFraud_1 = df[df['isFraud'] == 1].sample(n=num_isFraud_1, replace=True)

# # Concatenate the samples to form the final sample DataFrame
# sample = pd.concat([sample_isFraud_0, sample_isFraud_1])

# # Shuffle the rows to randomize the order
# df = sample.sample(frac=1).reset_index(drop=True)

# Data preparation

In [24]:
# Normalise the origin-balance column name so it matches 'newbalanceOrig'
column_renames = {'oldbalanceOrg': 'oldbalanceOrig'}
df = df.rename(columns=column_renames)

In [25]:
# Rebuild the balance columns from the transaction amount, motivated by the EDA:
#   - share of observations with balance errors in the account giving money:    85.0 %
#   - share of observations with balance errors in the account receiving money: 100.0 %
# The two new columns are computed from the untouched originals and then written back in one step.
df = df.assign(
    newbalanceDest=df['oldbalanceDest'] + df['amount'],
    oldbalanceOrig=df['newbalanceOrig'] + df['amount'],
)

In [26]:
# # Only 6 true
# df['externalDest'] = ((df['oldbalanceDest'] == 0) & (df['newbalanceDest'] == 0)).astype(int)
# # Only 16 true
# df['externalOrig'] = ((df['oldbalanceOrig'] == 0) & (df['newbalanceOrig'] == 0)).astype(int)

# Feature engineering

In [27]:
# Hour of the day (0-23); 'step' is treated as an hourly counter throughout this cell
df['hour'] = df['step'] % 24

# Day-of-week index in 1..7, counted in whole days from the start of the data.
# NOTE(review): no calendar offset is applied, so index 1 is not necessarily Monday - confirm against the EDA.
df['weekday'] = (df['step'] // 24) % 7 + 1

# is_workday: weekday indices 4 and 5 (the two days with the fewest transactions) are treated as the weekend.
# Vectorised membership test instead of a row-by-row apply.
df['is_workday'] = (~df['weekday'].isin([4, 5])).astype('int64')

# Day of the month in 1..30: convert the hourly step to whole days first, then wrap at 30.
# (Taking step % 30 directly would cycle every 30 hours instead of every 30 days.)
df['monthday'] = (df['step'] // 24) % 30 + 1

In [28]:
# Rolling statistics of 'amount' per recipient ('nameDest'), over the last 3 and 7 rows
# of each recipient's group (fewer while a group has not yet reached the window size).
# Column creation order is kept as: meanDest3, meanDest7, maxDest3, maxDest7.
amount_by_dest = df.groupby('nameDest')['amount']

for stat_name in ('mean', 'max'):
    for window_size in (3, 7):
        rolled = amount_by_dest.rolling(window=window_size, min_periods=1)
        per_row_stat = getattr(rolled, stat_name)()
        # Drop the 'nameDest' index level so the result aligns with df's own index
        df[f'{stat_name}Dest{window_size}'] = per_row_stat.reset_index(0, drop=True)

In [29]:
# Create a new type column indicating whether the transaction went from a Customer (C)
# to a Merchant (M) or any other combination of the two.

# Party-kind masks, computed once per name column
orig_is_customer = df['nameOrig'].str.contains('C')
orig_is_merchant = df['nameOrig'].str.contains('M')
dest_is_customer = df['nameDest'].str.contains('C')
dest_is_merchant = df['nameDest'].str.contains('M')

conditions = [
    orig_is_customer & dest_is_customer,
    orig_is_customer & dest_is_merchant,
    orig_is_merchant & dest_is_customer,
    orig_is_merchant & dest_is_merchant,
]

choices = ['CC', 'CM', 'MC', 'MM']

# The first matching condition wins; rows matching none of them get None
df['type2'] = np.select(conditions, choices, default=None)

In [30]:
# One-hot encode the two transaction-type columns.
# dtype=int pins the indicator columns to 0/1 integers: newer pandas releases default to
# boolean dummies, which would otherwise be written to CSV as True/False.
df = pd.get_dummies(df, columns=['type', 'type2'], dtype=int)

# Transformation

In [31]:
# Log-scale the transaction amount.
# log1p (= log(1 + amount)) keeps any zero-amount transaction finite (0.0) instead of -inf.
df['log_amount'] = np.log1p(df['amount'])

# Move 'log_amount' to the front so it becomes the first column of the frame
df = df[['log_amount'] + [col for col in df.columns if col != 'log_amount']]

In [32]:
# Columns whose raw values are replaced by percentile-based membership grades.
# NOTE(review): 'is_workday' is a 0/1 indicator; percentile bucketing may collapse it
# to very few distinct grades - confirm this is intended.
columns = ['log_amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'is_workday', 'meanDest3', 'meanDest7', 'maxDest3', 'maxDest7']


def create_fuzzy_sets_percentile(column_data, percentiles):
    """Map every value to one of five fixed grades using percentile cut-offs.

    Args:
        column_data: Sequence of numeric values (list, array or pandas Series).
        percentiles: Mapping indexed by 0.2, 0.4, 0.6 and 0.8 that holds the
            cut-off value of the corresponding percentile.

    Returns:
        A list of floats, one per input value and in the same order:
        0.0 if value <= 20th-percentile cut-off, 0.3 if <= 40th, 0.5 if <= 60th,
        0.7 if <= 80th, and 1.0 otherwise. A NaN value fails every comparison
        and therefore also maps to 1.0.
    """
    values = np.asarray(column_data)
    # Look the cut-offs up once; np.select takes the first matching condition,
    # which reproduces the semantics of an if/elif chain without a Python-level loop.
    within_cutoff = [values <= percentiles[level] for level in (0.2, 0.4, 0.6, 0.8)]
    grades = np.select(within_cutoff, [0.0, 0.3, 0.5, 0.7], default=1.0)
    return grades.tolist()

# Calculate the 20/40/60/80th percentiles, restricted to the columns that get fuzzified.
# Running quantile() over the whole frame would also hit the string columns
# ('nameOrig', 'nameDest'), which fails on pandas versions where numeric_only defaults to False.
percentiles = df[columns].quantile([0.2, 0.4, 0.6, 0.8]).to_dict()

# Create the fuzzy-set grades for each selected column (all computed from the raw values first)
fuzzy_sets_percentile = {
    column: create_fuzzy_sets_percentile(df[column], percentiles[column])
    for column in columns
}

# Overwrite the values in the DataFrame with the new fuzzy-set grades
for column in columns:
    df[column] = fuzzy_sets_percentile[column]

# Display the DataFrame with fuzzy-set values
df

Unnamed: 0,log_amount,step,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,meanDest7,maxDest3,maxDest7,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,type2_CC,type2_CM
0,0.7,283,244722.58,C202779559,0.7,0.0,C1800321051,1.0,1.0,0,...,0.7,0.7,0.7,0,1,0,0,0,1,0
1,1.0,41,412584.38,C213527060,0.7,0.0,C1324591205,0.5,0.7,0,...,1.0,1.0,1.0,0,1,0,0,0,1,0
2,1.0,376,494649.54,C514416015,0.7,0.0,C742436742,1.0,1.0,0,...,1.0,1.0,1.0,0,1,0,0,0,1,0
3,0.7,159,136776.42,C1189920647,1.0,1.0,C299807793,1.0,1.0,0,...,0.7,0.7,0.7,1,0,0,0,0,1,0
4,1.0,211,316716.87,C1946560521,1.0,1.0,C18994171,0.7,0.7,0,...,1.0,1.0,1.0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.5,210,75719.80,C523870076,0.3,0.0,C1589636578,0.5,0.5,0,...,0.5,0.5,0.5,0,1,0,0,0,1,0
4996,0.0,251,8908.60,C1467020963,0.0,0.5,M1161291806,0.0,0.0,0,...,0.0,0.0,0.0,0,0,0,1,0,0,1
4997,0.5,299,45141.99,C1083256837,0.5,0.7,M1020699530,0.0,0.3,0,...,0.5,0.5,0.5,0,0,0,1,0,0,1
4998,0.5,139,48699.70,C1626707703,0.3,0.0,C256674477,0.0,0.3,0,...,0.5,0.5,0.5,0,1,0,0,0,1,0


# ML preparation

In [33]:
# Remove the columns that are not used as model inputs
unused_columns = ['nameOrig', 'nameDest', 'isFlaggedFraud', 'amount', 'hour', 'weekday', 'monthday']
df = df.drop(columns=unused_columns)

In [34]:
# Reorder the columns so that 'isFraud' is the last one and acts as the Y column
feature_columns = [name for name in df.columns if name != 'isFraud']
df = df[feature_columns + ['isFraud']]

In [35]:
# Create train/val/test sets following a 0.7/0.15/0.15 split:
# first hold out 30% of the rows, then cut that holdout in half.
train_df, holdout_df = train_test_split(df, test_size=0.3, random_state=0, shuffle=True)
test_df, val_df = train_test_split(holdout_df, test_size=0.5, random_state=0, shuffle=True)

In [36]:
# Give every split a fresh 0..n-1 index, discarding the row labels carried over from df
train_df, test_df, val_df = (
    split.reset_index(drop=True) for split in (train_df, test_df, val_df)
)

In [37]:
save_directory = "../deep-symbolic-optimization/dso/dso/task/regression/data"

# Create the output directory (including missing parents) if it is not there yet.
# exist_ok=True replaces the separate exists() check and its check-then-create gap.
os.makedirs(save_directory, exist_ok=True)

# Save each split as a CSV file without header row and without index column
for file_name, split_df in (
    ("train_df.csv", train_df),
    ("test_df.csv", test_df),
    ("val_df.csv", val_df),
):
    split_df.to_csv(os.path.join(save_directory, file_name), header=False, index=False)

In [None]:
# [00:00:00:00.25] Training iteration 21, current best R: 0.0385

#         ** New best
#         Reward: 0.038461538461538464
#         Count Off-policy: 0
#         Count On-policy: 1
#         Originally on Policy: True
#         Invalid: False
#         Traversal: product_reichenbach,sub,x4,sub,x1,x2,sub,x1,x5
#         Expression:
#           product_reichenbach(-x₁ + x₂ + x₄, x₁ - x₅)


In [None]:
# x1 = str("-" + df.columns[0] + " + " + df.columns[1] + " + " + df.columns[3])
# x2 = str(df.columns[0] + " - " + df.columns[4])

# print(x1)
# print(x2)
# print("1 - (" + x1 + ") + (" + x1 + ") * (" + x2 + ")")

In [None]:
# -- ANALYZING LOG START --------------
# Task_____________regression
# Source path______./log/dso_task_regression_data_train_df_2024-04-24-112653
# Runs_____________50
# Max Samples/run__20000
# Success_rate_____0.0
# Hall of Fame (Top 5 of 5000)____
#     0: S=015 R=0.166667 <-- product_reichenbach(x2 + x7, (x11 - x2)*(x11 - x8)*(x13 + x5))
#     1: S=010 R=0.166667 <-- product_reichenbach(x17 + x18, x2*(-x10 + x9))
#     2: S=012 R=0.133333 <-- product_reichenbach(x2, x10*(x1 - x10)*(-x13*x8 + x2))
#     3: S=037 R=0.105263 <-- product_reichenbach(x1*x5 + x15 + x6 - x7, x2*(-x11 - x4 + x7 + x8))
#     4: S=036 R=0.076923 <-- product_reichenbach(x2, x14 + x6 - x9 + (x13 - x2)*(-x1 + x10 + x14 + x16))
#   Saving Hall of Fame plot to ./log/dso_task_regression_data_train_df_2024-04-24-112653/dso_dso_task_regression_data_train_df_plot_hof.png
# Pareto Front (5 of 6)____
#     0: S=015 R=0.166667 C=15.00 <-- product_reichenbach(x2 + x7, (x11 - x2)*(x11 - x8)*(x13 + x5))
#     1: S=012 R=0.133333 C=13.00 <-- product_reichenbach(x2, x10*(x1 - x10)*(-x13*x8 + x2))
#     2: S=031 R=0.052632 C=11.00 <-- product_reichenbach(x2, -x15 + x16 - x4 - x5 + x7)
#     3: S=021 R=0.051282 C=9.00 <-- product_reichenbach(x1 - x16 + x2, x16 - x5)
#     4: S=041 R=0.038462 C=7.00 <-- product_reichenbach(x2, x8*(-x5 + x9))
#   Saving Pareto Front plot to ./log/dso_task_regression_data_train_df_2024-04-24-112653/dso_dso_task_regression_data_train_df_plot_pf.png
# -- ANALYZING LOG END ----------------
# == POST-PROCESS END ===================