In [1]:
# ----------------------------------------------------------
# USER-BASED PREPROCESSING + FEATURE ENGINEERING
# ----------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib


In [2]:
# ----------------------------------------------------------
# 1. Load dataset
# ----------------------------------------------------------

df = pd.read_csv("C:\\Users\\user\\Desktop\\Anomaly Detection\\Linklock\\bank_transactions_data_2.csv")

print("Dataset Loaded!")
print(df.head())
print(df.info())

Dataset Loaded!
  TransactionID AccountID  TransactionAmount      TransactionDate  \
0      TX000001   AC00128              14.09  2023-04-11 16:29:14   
1      TX000002   AC00455             376.24  2023-06-27 16:44:19   
2      TX000003   AC00019             126.29  2023-07-10 18:16:08   
3      TX000004   AC00070             184.50  2023-05-05 16:32:11   
4      TX000005   AC00411              13.45  2023-10-16 17:51:24   

  TransactionType   Location DeviceID      IP Address MerchantID Channel  \
0           Debit  San Diego  D000380  162.198.218.92       M015     ATM   
1           Debit    Houston  D000051     13.149.61.4       M052     ATM   
2           Debit       Mesa  D000235  215.97.143.157       M009  Online   
3           Debit    Raleigh  D000187  200.13.225.150       M002  Online   
4          Credit    Atlanta  D000308    65.164.3.100       M091  Online   

   CustomerAge CustomerOccupation  TransactionDuration  LoginAttempts  \
0           70             Doctor      

In [3]:
# ----------------------------------------------------------
# 2. Convert Date Columns
# ----------------------------------------------------------

df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], errors='coerce')
df['PreviousTransactionDate'] = pd.to_datetime(df['PreviousTransactionDate'], errors='coerce')

# Sort by date to compute transitions
df = df.sort_values(by=['AccountID', 'TransactionDate'])

In [4]:
# ----------------------------------------------------------
# 3. USER-BASED FEATURE ENGINEERING
# ----------------------------------------------------------

user_df = pd.DataFrame()

# Base User Identifier
user_df['AccountID'] = df.groupby('AccountID').size().index

# ---- LOGIN PATTERN FEATURES ----

# Avg login hour
user_df['avg_login_hour'] = df.groupby('AccountID')['TransactionDate'].apply(lambda x: x.dt.hour.mean())

# Std deviation of login hour (variability)
user_df['std_login_hour'] = df.groupby('AccountID')['TransactionDate'].apply(lambda x: x.dt.hour.std())

# Day of week login pattern
user_df['most_common_dow'] = df.groupby('AccountID')['TransactionDate'].apply(lambda x: x.dt.dayofweek.mode()[0])


# ---- LOGIN FREQUENCY FEATURES ----

# Time between two consecutive logins (seconds)
df['time_between_logins'] = df.groupby('AccountID')['TransactionDate'].diff().dt.total_seconds()

# Replace missing values with median
df['time_between_logins'] = df['time_between_logins'].fillna(df['time_between_logins'].median())

# Avg login interval
user_df['avg_login_interval'] = df.groupby('AccountID')['time_between_logins'].mean()

# Std deviation of login interval
user_df['std_login_interval'] = df.groupby('AccountID')['time_between_logins'].std()


# ---- DEVICE / IP / LOCATION CONSISTENCY ----

# How many unique devices does a user normally use?
user_df['unique_devices'] = df.groupby('AccountID')['DeviceID'].nunique()

# Unique IP addresses
user_df['unique_ips'] = df.groupby('AccountID')['IP Address'].nunique()

# Unique locations
user_df['unique_locations'] = df.groupby('AccountID')['Location'].nunique()

# Ratio of unusual device changes
user_df['device_change_rate'] = user_df['unique_devices'] / df.groupby('AccountID').size()

# Ratio of unusual IP changes
user_df['ip_change_rate'] = user_df['unique_ips'] / df.groupby('AccountID').size()


# ---- FAILED LOGIN ATTEMPTS ----

user_df['avg_login_attempts'] = df.groupby('AccountID')['LoginAttempts'].mean()
user_df['max_login_attempts'] = df.groupby('AccountID')['LoginAttempts'].max()



In [5]:

# ----------------------------------------------------------
# 4. Handle missing values
# ----------------------------------------------------------

user_df = user_df.fillna(user_df.median(numeric_only=True))


In [None]:
# ----------------------------------------------------------
# 5. Encode Categorical Columns 
# ----------------------------------------------------------

encoders = {}

enc = LabelEncoder()
user_df['AccountID'] = enc.fit_transform(user_df['AccountID'])
encoders['AccountID'] = enc

# Save encoder
joblib.dump(encoders, "user_encoders.pkl")

['user_encoders.pkl']

In [7]:
# ----------------------------------------------------------
# 6. Scale Numerical Features
# ----------------------------------------------------------

scale_cols = [
    'avg_login_hour', 'std_login_hour', 'most_common_dow',
    'avg_login_interval', 'std_login_interval',
    'unique_devices', 'unique_ips', 'unique_locations',
    'device_change_rate', 'ip_change_rate',
    'avg_login_attempts', 'max_login_attempts'
]

scaler = StandardScaler()
user_df[scale_cols] = scaler.fit_transform(user_df[scale_cols])

# Save scaler
joblib.dump(scaler, "user_scaler.pkl")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


['user_scaler.pkl']

In [8]:
# ----------------------------------------------------------
# 7. Save final user feature matrix
# ----------------------------------------------------------

user_df.to_csv("user_behavior_features.csv", index=False)

print("\nUSER-BASED PREPROCESSING COMPLETE!")
print("Saved -> user_behavior_features.csv, user_scaler.pkl, user_encoders.pkl")
print("\nFinal User Behavior Feature Matrix:")
print(user_df.head())


USER-BASED PREPROCESSING COMPLETE!
Saved -> user_behavior_features.csv, user_scaler.pkl, user_encoders.pkl

Final User Behavior Feature Matrix:
   AccountID  avg_login_hour  std_login_hour  most_common_dow  \
0          0             NaN             NaN              NaN   
1          1             NaN             NaN              NaN   
2          2             NaN             NaN              NaN   
3          3             NaN             NaN              NaN   
4          4             NaN             NaN              NaN   

   avg_login_interval  std_login_interval  unique_devices  unique_ips  \
0                 NaN                 NaN             NaN         NaN   
1                 NaN                 NaN             NaN         NaN   
2                 NaN                 NaN             NaN         NaN   
3                 NaN                 NaN             NaN         NaN   
4                 NaN                 NaN             NaN         NaN   

   unique_locations  devi