In [1]:
# -------------------------------------------------------------
# TRANSACTION-BASED PREPROCESSING & FEATURE ENGINEERING
# -------------------------------------------------------------

import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# -------------------------------------------------------------
# 1. LOAD DATA
# -------------------------------------------------------------

# Location of the raw transactions CSV. The TRANSACTIONS_CSV environment
# variable overrides the default, so the notebook can run on machines where
# the hard-coded desktop path below does not exist.
DATA_PATH = os.environ.get(
    "TRANSACTIONS_CSV",
    "C:\\Users\\user\\Desktop\\Anomaly Detection\\Linklock\\bank_transactions_data_2.csv",
)

# Read the raw transactions and show the first rows as a sanity check.
df = pd.read_csv(DATA_PATH)
print("Dataset Loaded:")
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\user\\Desktop\\Anomaly Detection\\Linklock\\bank_transactions_data_2.csv'

In [3]:
# -------------------------------------------------------------
# 2. CONVERT DATE COLUMNS
# -------------------------------------------------------------

# Parse both timestamp columns; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
for date_col in ("TransactionDate", "PreviousTransactionDate"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

# Order each account's transactions chronologically.
df = df.sort_values(by=["AccountID", "TransactionDate"])

In [4]:
# -------------------------------------------------------------
# 3. TIME-BASED FEATURES
# -------------------------------------------------------------

# Calendar components of the transaction timestamp.
tx_datetime = df["TransactionDate"].dt
df["hour"] = tx_datetime.hour
df["day_of_week"] = tx_datetime.dayofweek
df["day"] = tx_datetime.day

# Seconds elapsed since the previous row of the same account.
gap_seconds = df.groupby("AccountID")["TransactionDate"].diff().dt.total_seconds()

# Rows without a computable gap (NaN) receive the median gap over all rows.
df["time_since_last"] = gap_seconds.fillna(gap_seconds.median())


In [5]:
# -------------------------------------------------------------
# 4. TRANSACTION FREQUENCY FEATURES
# -------------------------------------------------------------

# Order rows by account and then by time, and give the frame a fresh 0..n-1
# index so positional row numbers and index labels coincide.
df = df.sort_values(["AccountID", "TransactionDate"]).reset_index(drop=True)


def _trailing_window_counts(timestamps, window):
    """Count, for each timestamp, the timestamps inside its trailing window.

    Parameters
    ----------
    timestamps : numpy.ndarray of datetime64
        Transaction times of a single account, in any order. NaT is allowed.
    window : numpy.timedelta64
        Length of the look-back window.

    Returns
    -------
    numpy.ndarray of int64
        For each timestamp ``t``, the number of entries ``s`` in ``timestamps``
        with ``t - window < s <= t``. The transaction itself is included, so
        every valid timestamp gets at least 1; NaT entries get 0.
    """
    counts = np.zeros(len(timestamps), dtype="int64")
    known = ~np.isnat(timestamps)
    known_times = timestamps[known]
    ordered = np.sort(known_times)

    # Entries <= t, minus entries <= t - window, leaves those in (t - window, t].
    upto_now = np.searchsorted(ordered, known_times, side="right")
    upto_window_start = np.searchsorted(ordered, known_times - window, side="right")
    counts[known] = upto_now - upto_window_start
    return counts


tx_times = df["TransactionDate"].to_numpy(dtype="datetime64[ns]")

# Positional row numbers of every account. Rows with a missing AccountID are
# not part of any group and keep a count of 0.
rows_by_account = df.groupby("AccountID", sort=False).indices

for column, window in (
    ("tx_last_1hr", np.timedelta64(1, "h")),
    ("tx_last_24hr", np.timedelta64(24, "h")),
):
    window_counts = np.zeros(len(df), dtype="int64")
    for positions in rows_by_account.values():
        # Written by position, so no index alignment is involved.
        window_counts[positions] = _trailing_window_counts(tx_times[positions], window)
    df[column] = window_counts


In [6]:

# -------------------------------------------------------------
# 5. AMOUNT-BASED FEATURES
# -------------------------------------------------------------

def _standardize_amounts(amounts):
    """Centre one account's amounts on their mean and divide by their standard deviation."""
    return (amounts - amounts.mean()) / amounts.std()


# Per-account z-score of the transaction amount. Where the score is undefined
# (NaN, e.g. the standard deviation cannot be computed) it is set to 0.
account_amounts = df.groupby("AccountID")["TransactionAmount"]
df["amount_zscore"] = account_amounts.transform(_standardize_amounts).fillna(0)

# Amount relative to the account balance; the +1 keeps a zero balance from
# producing a division by zero.
df["amount_to_balance"] = df["TransactionAmount"].div(df["AccountBalance"].add(1))


In [7]:

# -------------------------------------------------------------
# 6. DEVICE/IP/LOCATION FEATURES
# -------------------------------------------------------------

# For device, IP address and location: store the value seen on the previous
# row of the same account, and flag rows where the current value differs.
#
# A row with no known previous value (the first transaction of an account, or
# a missing previous value) is flagged 0. Comparing against NaN directly would
# mark every such row as "changed", because `value != NaN` is always True.
for source_col, prev_col, flag_col in (
    ("DeviceID", "prev_device", "device_changed"),
    ("IP Address", "prev_ip", "ip_changed"),
    ("Location", "prev_loc", "location_changed"),
):
    previous = df.groupby("AccountID")[source_col].shift(1)
    df[prev_col] = previous
    df[flag_col] = (previous.notna() & (df[source_col] != previous)).astype(int)

In [8]:

# -------------------------------------------------------------
# 7. CATEGORY ENCODING
# -------------------------------------------------------------

# Columns whose values are replaced in place by integer codes.
categorical_cols = [
    "TransactionType",
    "Location",
    "DeviceID",
    "IP Address",
    "MerchantID",
    "Channel",
    "CustomerOccupation",
]

# One fitted LabelEncoder per column, keyed by column name.
# NOTE(review): this cell overwrites the original values, so running it twice
# re-encodes the codes themselves. Re-run from the load cell instead.
encoders = {}
for column_name in categorical_cols:
    column_encoder = LabelEncoder()
    df[column_name] = column_encoder.fit_transform(df[column_name])
    encoders[column_name] = column_encoder

# Persist the fitted encoders for later reuse.
joblib.dump(encoders, "transaction_encoders.pkl")


['transaction_encoders.pkl']

In [9]:
# -------------------------------------------------------------
# 8. SELECT FINAL TRANSACTION FEATURES
# -------------------------------------------------------------

# Model input columns, in the order they appear in the saved feature file.
feature_cols = [
    # raw amount
    "TransactionAmount",
    # calendar features
    "hour",
    "day_of_week",
    "day",
    # recency and frequency
    "time_since_last",
    "tx_last_1hr",
    "tx_last_24hr",
    # amount relative to the account
    "amount_zscore",
    "amount_to_balance",
    # change flags
    "device_changed",
    "ip_changed",
    "location_changed",
    # label-encoded categoricals
    "TransactionType",
    "Location",
    "DeviceID",
    "IP Address",
    "MerchantID",
    "Channel",
    # remaining numeric columns
    "CustomerAge",
    "LoginAttempts",
    "AccountBalance",
]

# Feature matrix restricted to the selected columns.
X = df.loc[:, feature_cols]

In [10]:
# -------------------------------------------------------------
# 9. SCALING
# -------------------------------------------------------------

# Fit the scaler on the selected features, then apply it to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Persist the fitted scaler for later reuse.
joblib.dump(scaler, "transaction_scaler.pkl")

['transaction_scaler.pkl']

In [17]:
# -------------------------------------------------------------
# 10. SAVE FINAL PREPROCESSED DATA
# -------------------------------------------------------------

# Re-attach the column names to the scaled array and write it out without the
# row index.
final_df = pd.DataFrame(data=X_scaled, columns=feature_cols)
final_df.to_csv("transaction_features.csv", index=False)

print(
    "\nTransaction-based preprocessing complete!",
    "Saved: transaction_features.csv, transaction_scaler.pkl, transaction_encoders.pkl",
    sep="\n",
)


Transaction-based preprocessing complete!
Saved: transaction_features.csv, transaction_scaler.pkl, transaction_encoders.pkl
