# Loading data and requirements

In [66]:
!pip install -r requirements.txt



In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [68]:
df = pd.read_csv('../data/paysim.csv')

In [69]:
# # Assuming your DataFrame is named df
# # Define the distribution percentages
# percentage_isFraud_0 = 99.87
# percentage_isFraud_1 = 0.13

# # Calculate the number of instances for each category based on the desired percentages
# total_sample_size = 10000  # You can adjust this as per your requirement
# num_isFraud_0 = int(total_sample_size * percentage_isFraud_0 / 100)
# num_isFraud_1 = total_sample_size - num_isFraud_0

# # Generate the sample DataFrame with the desired distribution
# sample_isFraud_0 = df[df['isFraud'] == 0].sample(n=num_isFraud_0, replace=True)
# sample_isFraud_1 = df[df['isFraud'] == 1].sample(n=num_isFraud_1, replace=True)

# # Concatenate the samples to form the final sample DataFrame
# sample = pd.concat([sample_isFraud_0, sample_isFraud_1])

# # Shuffle the rows to randomize the order
# df = sample.sample(frac=1).reset_index(drop=True)

# Data preparation

In [70]:
df = df.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})

In [71]:
# Set new balance and original balance based on transaction amount based on EDA
# Percentage of observations with balance errors in the account giving money:  85.0
# Percentage of observations with balance errors in the account receiving money:  100.0

df['newbalanceDest'] = df['oldbalanceDest'] + df['amount']
df['oldbalanceOrig'] = df['newbalanceOrig'] + df['amount']

In [72]:
# Only 6 true
df['externalDest'] = ((df['oldbalanceDest'] == 0) & (df['newbalanceDest'] == 0)).astype(int)
# Only 16 true
df['externalOrig'] = ((df['oldbalanceOrig'] == 0) & (df['newbalanceOrig'] == 0)).astype(int)

# Feature engineering

In [73]:
# Extracting hour of the day from the 'step' column
df['hour'] = df['step']% 24

# Extracting day of the week as integers, add 3 to convert it to correct days of the week (1 = monday, 7 = sunday)
df['weekday'] = (df['step'] // 24) % 7 + 1

# Create is_workday feature based on the 2 least transaction dates being the weekend
df['is_workday'] = df['weekday'].apply(lambda x: 0 if x == 4 or x == 5 else 1)

# Extracting day of the week as integers
df['monthday'] = (df['step'] % 30) + 1

In [None]:
# calculate the rolling average of last 3 and 7 transactions for each recipient
df['meanDest3'] = df.groupby('nameDest')['amount'].rolling(window=3, min_periods=1).mean().reset_index(0, drop=True)
df['meanDest7'] = df.groupby('nameDest')['amount'].rolling(window=7, min_periods=1).mean().reset_index(0, drop=True)

# calculate the rolling maximum of last 3 and 7 transactions for each recipient
df['maxDest3'] = df.groupby('nameDest')['amount'].rolling(window=3, min_periods=1).max().reset_index(0, drop=True)
df['maxDest7'] = df.groupby('nameDest')['amount'].rolling(window=7, min_periods=1).max().reset_index(0, drop=True)

# Transformation

In [1]:
df = pd.read_csv("../data/paysim_rolling.csv")

NameError: name 'pd' is not defined

In [20]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrig',
       'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest',
       'isFraud', 'isFlaggedFraud', 'externalDest', 'externalOrig', 'hour',
       'weekday', 'is_workday', 'monthday', 'meanDest3', 'meanDest7',
       'maxDest3', 'maxDest7'],
      dtype='object')

In [21]:
# # One hot encode type columns
df = pd.get_dummies(df, columns=['type'])

In [22]:
# Log scale amount
df['log_amount'] = np.log(df['amount'])
df['log_meanDest3'] = np.log(df['meanDest3'])
df['log_maxDest3'] = np.log(df['maxDest3'])
df['log_meanDest7'] = np.log(df['meanDest7'])
df['log_maxDest7'] = np.log(df['maxDest7'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
columns = ['log_amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'log_meanDest3', 'log_meanDest7', 'log_maxDest3', 'log_maxDest7', 'amount', 'meanDest3', 'meanDest7', 'maxDest3', 'maxDest7']
def create_fuzzy_sets_percentile(column_data, percentiles):
    fuzzy_sets = []
    for value in column_data:
        if value <= percentiles[0.2]:
            fuzzy_sets.append(0.2)
        elif value <= percentiles[0.4]:
            fuzzy_sets.append(0.4)
        elif value <= percentiles[0.6]:
            fuzzy_sets.append(0.6)
        elif value <= percentiles[0.8]:
            fuzzy_sets.append(0.8)
        else:
            fuzzy_sets.append(1.0)
    return fuzzy_sets

# Calculate percentiles for each column
percentiles = df.quantile([0.2, 0.4, 0.6, 0.8]).to_dict()

# Create fuzzy sets for each column
fuzzy_sets_percentile = {}
for column in columns:
    fuzzy_sets_percentile[column] = create_fuzzy_sets_percentile(df[column], percentiles[column])

# Overwrite the values in the DataFrame with the new fuzzy sets values
for column in columns:
    df[column] = fuzzy_sets_percentile[column]

# Print or use the DataFrame with fuzzy sets values
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,monthday,meanDest3,meanDest7,maxDest3,maxDest7,log_amount,log_meanDest3,log_maxDest3,log_meanDest7,log_maxDest7
0,1,PAYMENT,0.2,C1231006815,0.6,0.8,M1979787155,0.2,0.2,0,...,2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
1,1,PAYMENT,0.2,C1666544295,0.2,0.8,M2044282225,0.2,0.2,0,...,2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
2,1,TRANSFER,0.2,C1305486145,0.2,0.2,C553264065,0.2,0.2,1,...,2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
3,1,CASH_OUT,0.2,C840083671,0.2,0.2,C38997010,0.6,0.4,1,...,2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
4,1,PAYMENT,0.4,C2048537720,0.4,0.8,M1230701703,0.2,0.4,0,...,2,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,1.0,C786484425,0.8,0.2,C776919290,0.2,0.6,1,...,24,1.0,1.0,0.8,0.8,1.0,1.0,0.8,1.0,0.8
6362616,743,TRANSFER,1.0,C1529008245,1.0,0.2,C1881841831,0.2,1.0,1,...,24,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6362617,743,CASH_OUT,1.0,C1162922333,1.0,0.2,C1365125890,0.6,1.0,1,...,24,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6362618,743,TRANSFER,1.0,C1685995037,1.0,0.2,C2080388513,0.2,0.8,1,...,24,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [2]:
import pandas as pd
df = pd.read_csv("../data/paysim_fuzzy.csv")

# ML preparation

In [3]:
# Remove unused columns
df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud', 'amount', 'hour', 'weekday', 'monthday', 'step'], axis=1, inplace=True)

In [4]:
fraud_col = df.pop('isFraud')
df['isFraud'] = fraud_col

In [6]:
# # Assuming your DataFrame is named df
# # Define the distribution percentages
# percentage_isFraud_0 = 99.87
# percentage_isFraud_1 = 0.13

# # Calculate the number of instances for each category based on the desired percentages
# total_sample_size = 500000  # You can adjust this as per your requirement
# num_isFraud_0 = int(total_sample_size * percentage_isFraud_0 / 100)
# num_isFraud_1 = total_sample_size - num_isFraud_0

# # Generate the sample DataFrame with the desired distribution
# sample_isFraud_0 = df[df['isFraud'] == 0].sample(n=num_isFraud_0, replace=True)
# sample_isFraud_1 = df[df['isFraud'] == 1].sample(n=num_isFraud_1, replace=True)

# # Concatenate the samples to form the final sample DataFrame
# sample = pd.concat([sample_isFraud_0, sample_isFraud_1])

# # Shuffle the rows to randomize the order
# df = sample.sample(frac=1).reset_index(drop=True)

In [5]:
from sklearn.model_selection import train_test_split

# Create train/val/test set following 0.8/0.1/0.1 split with balanced classes
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['isFraud'], random_state=0, shuffle=True)
# test_df, val_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['isFraud'], random_state=0, shuffle=True)


In [6]:
# reset index
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
# val_df = val_df.reset_index(drop=True)

In [7]:
# save_directory = "../deep-symbolic-optimization/dso/dso/task/regression/data"

save_directory = "../data"

# Check if the directory exists, if not, create it
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

# Save DataFrames to CSV files
train_df.to_csv(os.path.join(save_directory, "train_df.csv"), header = False, index=False)
test_df.to_csv(os.path.join(save_directory, "test_df.csv"), header = False, index=False)
# val_df.to_csv(os.path.join(save_directory, "val_df.csv"), header = False, index=False)

In [12]:
# Calculate and print the balance in the train dataset
train_balance = train_df['isFraud'].value_counts(normalize=True)
print("Train set balance:\n", train_balance, train_df.shape)

# Calculate and print the balance in the test dataset
test_balance = test_df['isFraud'].value_counts(normalize=True)
print("Test set balance:\n", test_balance, test_df.shape)

Train set balance:
 0    0.9987
1    0.0013
Name: isFraud, dtype: float64 (400000, 22)
Test set balance:
 0    0.9987
1    0.0013
Name: isFraud, dtype: float64 (100000, 22)
