In [None]:
# ----------------------------------------------------------
# LOGIN-BASED PREPROCESSING + FEATURE ENGINEERING 
# ----------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

In [None]:
# ----------------------------------------------------------
# 1. Load dataset
# ----------------------------------------------------------
df = pd.read_csv("C:\\Users\\user\\Desktop\\PROJECT SEM\\Fraud-Anomaly-Detection-system\\backend\\app\\ml\\login_activity_data.csv"
)

print("Dataset Loaded!")
print(df.head())
print(df.info())

Dataset Loaded!
   AccountID            Timestamp   Status      IP Address DeviceType  \
0       1098  2024-01-01 02:15:00   Failed  147.227.118.65    Desktop   
1       1226  2024-01-01 04:17:00  Success  193.98.181.171    Desktop   
2       1116  2024-01-01 05:13:00  Success   40.246.100.42    Desktop   
3       1137  2024-01-01 05:16:00  Success  151.144.20.158     Tablet   
4       1101  2024-01-01 05:32:00  Success    185.13.41.14    Desktop   

   Location  LoginAttempts  
0  Pakistan              1  
1       UAE              3  
2       USA              3  
3  Pakistan              2  
4  Pakistan              3  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   AccountID      5000 non-null   int64 
 1   Timestamp      5000 non-null   object
 2   Status         5000 non-null   object
 3   IP Address     5000 non-null   object
 4   De

In [13]:
# ----------------------------------------------------------
# 2. Convert Timestamp Column (USED AS LOGIN TIME)
# ----------------------------------------------------------
df['Timestamp'] = pd.to_datetime(
    df['Timestamp'], errors='coerce'
)

# Sort by user and time
df = df.sort_values(by=['AccountID', 'Timestamp'])

In [15]:
# ----------------------------------------------------------
# 3. LOGIN-BASED FEATURE ENGINEERING 
# ----------------------------------------------------------

# ----------------------------------------------------------
# TIME FEATURES
# ----------------------------------------------------------
df['login_hour'] = df['Timestamp'].dt.hour
df['login_dayofweek'] = df['Timestamp'].dt.dayofweek
df['is_weekend'] = df['login_dayofweek'].isin([5, 6]).astype(int)

# ----------------------------------------------------------
# TIME SINCE LAST LOGIN (PER EVENT)
# ----------------------------------------------------------
df['time_since_last_login'] = (
    df.groupby('AccountID')['Timestamp']
      .diff()
      .dt.total_seconds()
)

# Replace NaN for first login of each user
df['time_since_last_login'] = df['time_since_last_login'].fillna(
    df['time_since_last_login'].median()
)

# ----------------------------------------------------------
# DEVICE / IP / LOCATION CHANGE FLAGS
# ----------------------------------------------------------
df['device_changed'] = (
    df.groupby('AccountID')['DeviceType']
      .transform(lambda x: x != x.shift())
      .fillna(0)
      .astype(int)
)

df['ip_changed'] = (
    df.groupby('AccountID')['IP Address']
      .transform(lambda x: x != x.shift())
      .fillna(0)
      .astype(int)
)

df['location_changed'] = (
    df.groupby('AccountID')['Location']
      .transform(lambda x: x != x.shift())
      .fillna(0)
      .astype(int)
)

# ----------------------------------------------------------
# LOGIN STATUS & ATTEMPT FEATURES
# ----------------------------------------------------------
df['failed_login'] = (df['Status'] == 'Failed').astype(int)
df['high_login_attempts'] = (df['LoginAttempts'] > 3).astype(int)


In [16]:
# ----------------------------------------------------------
# 4. FINAL LOGIN FEATURE MATRIX
# ----------------------------------------------------------
login_df = df[
    [
        'login_hour',
        'login_dayofweek',
        'is_weekend',
        'time_since_last_login',
        'device_changed',
        'ip_changed',
        'location_changed',
        'LoginAttempts',
        'failed_login',
        'high_login_attempts'
    ]
]


In [18]:

# ----------------------------------------------------------
# 5. Handle Missing Values
# ----------------------------------------------------------
login_df = login_df.fillna(
    login_df.median(numeric_only=True)
)

In [19]:
# ----------------------------------------------------------
# 6. Scale Numerical Features
# ----------------------------------------------------------
scale_cols = [
    'login_hour',
    'login_dayofweek',
    'time_since_last_login',
    'LoginAttempts'
]

scaler = StandardScaler()
login_df[scale_cols] = scaler.fit_transform(
    login_df[scale_cols]
)

# Save scaler
joblib.dump(scaler, "login_event_scaler.pkl")

['login_event_scaler.pkl']

In [20]:
# ----------------------------------------------------------
# 7. Save FINAL LOGIN EVENT FEATURES
# ----------------------------------------------------------
login_df.to_csv(
    "login_event_features.csv",
    index=False
)

print("\nLOGIN-BASED PREPROCESSING COMPLETE!")
print("Saved -> login_event_features.csv, login_event_scaler.pkl")
print("\nFinal Login Event Feature Matrix:")
print(login_df.head())


LOGIN-BASED PREPROCESSING COMPLETE!
Saved -> login_event_features.csv, login_event_scaler.pkl

Final Login Event Feature Matrix:
     login_hour  login_dayofweek  is_weekend  time_since_last_login  \
310   -0.345567         0.514377           0              -0.295274   
348    0.236392         1.020157           1              -0.872549   
473   -0.927526         0.008598           0              -0.524244   
600    0.236392        -1.508739           0              -0.555857   
920    1.400309         0.514377           0               0.148359   

     device_changed  ip_changed  location_changed  LoginAttempts  \
310               1           1                 1       0.140228   
348               0           1                 0       0.562603   
473               1           1                 0      -0.704521   
600               1           1                 1       1.407352   
920               1           1                 1       0.562603   

     failed_login  high_login_atte