In [1]:
# Name of the folder that holds the Python material; it is only used to build the
# Google Drive path (drive/MyDrive/<folder name>) when the notebook runs in Google Colab.
python_material_folder_name = "python-material"



In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random


# Never truncate DataFrame displays: show every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Check if in Google Colab environment.
# Only the import is used as the detection signal: a failed import means "not in Colab".
# Any other problem (e.g. the drive mount failing, or the folder-name variable not being
# defined) is raised instead of being silently treated as a local run, which would make
# later cells read/write under the wrong folder.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    # If working locally on Jupyter Notebook, parent folder is one folder up (assuming you are
    # using the folder structure shared at the beginning of the course)
    path_python_material = ".."
else:
    # Mount drive
    drive.mount('/content/drive')
    # Set up path to Python material parent folder. The path is relative, so it is resolved
    # against the current working directory (presumably /content, where the drive is mounted).
    # If unsure, print current directory path by executing the following in a new cell:
    # !pwd
    path_python_material = rf"drive/MyDrive/{python_material_folder_name}"
    IN_COLAB = True

In [3]:
# Seed NumPy's global generator so the synthetic dataset is identical on every run
np.random.seed(42)

# Sample size: how many synthetic transactions to simulate
n = 100000

# Simulate one array per feature.
# NOTE: every draw below consumes the same global random stream, so the order of these
# statements determines the generated values - keep it unchanged.

# Transaction amounts: exponential distribution with mean 100
transaction_amount = np.random.exponential(scale=100, size=n)

# Customer ages: normal(mean=35, sd=10), limited to [18, 80] and truncated to whole years
customer_age = np.clip(np.random.normal(loc=35, scale=10, size=n), 18, 80).astype(int)

# Customer balances: normal(mean=5000, sd=2000), floored at 0 (no upper bound)
customer_balance = np.clip(np.random.normal(loc=5000, scale=2000, size=n), 0, None)

# Time of day of each transaction, in seconds since midnight (0..86399)
transaction_time = np.random.randint(low=0, high=86400, size=n)

# Transaction dates: 1 Jan 2023 plus a random offset of 0..364 days, drawn one at a time
first_day = datetime(2023, 1, 1)
transaction_date = [first_day + timedelta(days=np.random.randint(0, 365)) for _ in range(n)]


In [5]:

# Feature engineering: build a per-transaction fraud probability as a constant base rate
# plus three additive risk terms (higher amount, younger age, lower balance => more risk).
amount_risk = 0.00001 * transaction_amount          # grows with the transaction amount
age_risk = 0.0001 * (80 - customer_age)             # grows as the customer gets younger
balance_risk = 0.00001 * (5000 - customer_balance)  # grows as the balance drops; negative above 5000

# The sum can go negative (balances well above 5000), so it is clipped into [0, 1].
fraud_probability = np.clip(0.001 + amount_risk + age_risk + balance_risk, 0, 1)
fraud_probability

array([0.        , 0.02474385, 0.01384838, ..., 0.01714096, 0.00693694,
       0.        ])

In [6]:
# Add noise: scale every probability by its own random factor drawn uniformly from [0.8, 1.2).
# NOTE: this rebinds `fraud_probability`, so re-running the cell applies the noise again
# on top of the already-noisy values.
noise_factor = 0.8 + 0.4 * np.random.rand(n)
fraud_probability = fraud_probability * noise_factor

# Draw the fraud flag of each transaction as a single Bernoulli trial with its own probability
is_fraud = np.random.binomial(n=1, p=fraud_probability)

# Overall share of transactions flagged as fraud
is_fraud.mean()

0.01159

In [15]:
# Print two percentile summaries of the fraud probabilities, each preceded by a "****" line:
#   1) a coarse view: 0th, 20th, 40th, 60th and 80th percentiles
#   2) a detailed view of the upper tail: 90th to 99th percentiles
percentile_ranges = (range(0, 100, 20), range(90, 100, 1))
for percentiles in percentile_ranges:
    print("****")
    for i in percentiles:
        print(np.percentile(fraud_probability, i))

****
0.0
0.0
0.0014648250893397213
0.011574452177355539
0.02335527608054668
****
0.032267770486618594
0.033494606780467076
0.03481957900605098
0.03625396445658421
0.03795343424219789
0.03982395768933808
0.04197956172268464
0.044708299292144225
0.04820605279304463
0.053632974116227465


In [8]:
# Gather the simulated features and the fraud label into a single table
# (one row per transaction, columns in the order listed here)
df = pd.DataFrame(dict(
    transaction_amount=transaction_amount,
    transaction_date=transaction_date,
    transaction_time=transaction_time,
    customer_age=customer_age,
    customer_balance=customer_balance,
    is_fraud=is_fraud,
))

# Preview the first five rows
df.head()


Unnamed: 0,transaction_amount,transaction_date,transaction_time,customer_age,customer_balance,is_fraud
0,46.926809,2023-02-09,55817,43,9143.802446,0
1,301.012143,2023-01-28,9356,60,3126.627558,0
2,131.674569,2023-11-13,33099,33,4316.836831,0
3,91.294255,2023-03-26,3190,18,4235.945356,0
4,16.962487,2023-12-07,13332,49,5491.237144,0


In [9]:
# Data export: save the dataset as CSV (row index omitted) under the material folder
output_csv_path = f"{path_python_material}/data/1-raw/dsif11-fraud-detection/synthetic_transaction_data.csv"
df.to_csv(output_csv_path, index=False)


In [10]:
# Inspect the data type pandas assigned to each column of the assembled DataFrame
df.dtypes

transaction_amount           float64
transaction_date      datetime64[ns]
transaction_time               int64
customer_age                   int64
customer_balance             float64
is_fraud                       int64
dtype: object

In [11]:
# Check the dataset dimensions as (number of rows, number of columns)
df.shape

(100000, 6)