In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Read the cleaned transactions CSV into the working frame `df`.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm the
# drive is mounted before running this cell.
data_path = '/content/drive/MyDrive/Fraud Detection/bank_transactions_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Progress marker, printed only after the load succeeded.
print("Starting Feature Engineering...")

Starting Feature Engineering...


#  Transaction Amount Features


In [4]:
# Flag amounts strictly above the dataset-wide 95th percentile
# (1 = large transaction, 0 = otherwise).
amount_p95 = df['TransactionAmount'].quantile(0.95)
df['is_large_transaction'] = (df['TransactionAmount'] > amount_p95).astype(int)

# log(1 + amount): compresses the long right tail of the amount distribution.
df['log_transaction_amount'] = np.log1p(df['TransactionAmount'])

In [5]:
# Inspect the log1p-transformed amount column.
df['log_transaction_amount']

Unnamed: 0,log_transaction_amount
0,2.714032
1,5.932882
2,4.846468
3,5.223055
4,2.670694
...,...
2367,6.753683
2368,5.531570
2369,3.388787
2370,5.230948


# Time-Based Features (For Real-Time Monitoring)

In [7]:
# Parse the raw 'dd/mm/yyyy HH:MM' strings into pandas timestamps; the explicit
# format keeps day and month from being swapped during parsing.
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], format='%d/%m/%Y %H:%M')

# Calendar components derived from the parsed timestamp.
timestamps = df['TransactionDate'].dt
df['transaction_hour'] = timestamps.hour
df['transaction_day_of_week'] = timestamps.dayofweek  # Monday = 0 ... Sunday = 6

# Odd-hour flag: 1 when the hour is 0 through 5 inclusive (00:00-05:59), else 0.
df['odd_hour_transaction'] = df['transaction_hour'].between(0, 5).astype(int)

In [8]:
# Inspect the TransactionDate column.
df['TransactionDate']

Unnamed: 0,TransactionDate
0,2023-04-11 16:29:00
1,2023-06-27 16:44:00
2,2023-07-10 18:16:00
3,2023-05-05 16:32:00
4,2023-10-16 17:51:00
...,...
2367,2023-04-26 17:09:00
2368,2023-03-22 17:36:00
2369,2023-08-21 17:08:00
2370,2023-02-24 16:24:00


In [9]:
# Inspect the extracted hour-of-day feature.
df['transaction_hour']

Unnamed: 0,transaction_hour
0,16
1,16
2,18
3,16
4,17
...,...
2367,17
2368,17
2369,17
2370,16


In [10]:
# Inspect the extracted day-of-week feature.
df['transaction_day_of_week']

Unnamed: 0,transaction_day_of_week
0,1
1,1
2,0
3,4
4,0
...,...
2367,2
2368,2
2369,0
2370,4


In [11]:
# Inspect the odd-hour flag column.
df['odd_hour_transaction']

Unnamed: 0,odd_hour_transaction
0,0
1,0
2,0
3,0
4,0
...,...
2367,0
2368,0
2369,0
2370,0


# Behavioral Profiling Features

In [12]:
# Per-account behavioural aggregates, broadcast back onto every transaction row.
# The whole step is skipped when the frame has no AccountID column.
if 'AccountID' in df.columns:
    accounts = df.groupby('AccountID')

    # Number of non-null TransactionID values recorded for the row's account.
    txn_count = accounts['TransactionID'].transform('count')
    # Mean transaction amount of the row's account.
    mean_amount = accounts['TransactionAmount'].transform('mean')

    df['user_transaction_count'] = txn_count
    df['user_avg_transaction_amount'] = mean_amount

    # Signed distance of this transaction's amount from the account's own mean
    # (positive = larger than usual for this account).
    df['deviation_from_user_avg'] = df['TransactionAmount'] - mean_amount

In [13]:
# Inspect the per-account transaction count feature.
df['user_transaction_count']

Unnamed: 0,user_transaction_count
0,6
1,7
2,4
3,7
4,6
...,...
2367,10
2368,9
2369,7
2370,4


In [14]:
# Inspect the per-account mean transaction amount feature.
df['user_avg_transaction_amount']

Unnamed: 0,user_avg_transaction_amount
0,304.550000
1,304.622857
2,237.047500
3,249.954286
4,280.796667
...,...
2367,273.373000
2368,340.741111
2369,249.978571
2370,312.650000


In [15]:
# Inspect each transaction's deviation from its account's mean amount.
df['deviation_from_user_avg']

Unnamed: 0,deviation_from_user_avg
0,-290.460000
1,71.617143
2,-110.757500
3,-65.454286
4,-267.346667
...,...
2367,582.837000
2368,-89.201111
2369,-221.348571
2370,-126.680000


# Geographical Behavior Features

In [18]:
# Geographic profile: each account's most frequent location, plus a flag for
# transactions made anywhere else.
def _primary_location(locations):
    """Return the most frequent value in `locations`, or NaN when there is none.

    `Series.mode()` ignores missing values and returns tied modes in sorted
    order, so a tie resolves to the first value in sort order.
    """
    modes = locations.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else np.nan


# Both columns are required: 'Location' is the feature source and 'AccountID'
# is the grouping key. Checking both avoids a KeyError when AccountID is absent.
if 'Location' not in df.columns:
    print("Location column does not exist in the DataFrame")
elif 'AccountID' not in df.columns:
    print("AccountID column does not exist in the DataFrame")
else:
    # Most frequent location of the row's account, broadcast to every row.
    user_most_common_location = df.groupby('AccountID')['Location'].transform(_primary_location)
    df['user_primary_location'] = user_most_common_location

    # 1 when the transaction's location differs from the account's primary one.
    # Note: a missing Location compares unequal and is therefore flagged 1.
    df['is_unusual_location'] = np.where(df['Location'] != df['user_primary_location'], 1, 0)

In [19]:
# Inspect each row's per-account primary location.
df['user_primary_location']

Unnamed: 0,user_primary_location
0,Chicago
1,Baltimore
2,Louisville
3,Charlotte
4,Atlanta
...,...
2367,Chicago
2368,Albuquerque
2369,Atlanta
2370,Charlotte


In [20]:
# Inspect the unusual-location flag column.
df['is_unusual_location']

Unnamed: 0,is_unusual_location
0,1
1,1
2,1
3,1
4,0
...,...
2367,1
2368,1
2369,1
2370,1


# Regulatory Compliance Reporting Features

In [21]:
# High-risk country flag: 1 when the transaction's country is on the list
# below, else 0. The list is an example that simulates an FCA high-risk country
# list, and the flag is built only if the dataset contains a 'country' column.
# NOTE(review): the other columns used in this notebook are capitalised
# ('Location', 'AccountID'); confirm the column really is lowercase 'country',
# otherwise this step is silently skipped.
high_risk_countries = ['Nigeria', 'Russia', 'Iran', 'North Korea']  # Example
if 'country' in df.columns:
    df['high_risk_country'] = df['country'].isin(high_risk_countries).astype(int)



# Final Steps: Save Enhanced Dataset

In [23]:
# Write the engineered frame to CSV, creating the target folder first if it
# does not exist yet.
enhanced_data_path = '/content/drive/MyDrive/Fraud Detection/bank_transactions_featured.csv'
output_dir = os.path.dirname(enhanced_data_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(enhanced_data_path, index=False)  # index=False: the row index is not written

print(f"Feature engineering completed. Enhanced data saved to {enhanced_data_path}")

Feature engineering completed. Enhanced data saved to /content/drive/MyDrive/Fraud Detection/bank_transactions_featured.csv
