In [1]:
# ----------------------------------------------------------
# LOGIN-BASED PREPROCESSING + FEATURE ENGINEERING 
# ----------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

In [2]:
# ----------------------------------------------------------
# 1. Load dataset
# ----------------------------------------------------------
df = pd.read_csv("C:\\Users\\user\\Desktop\\LinkLock_anomalydetection\\LinkLock\\Link-Lock-Fraud-Anomaly-Detection-system-\\backend\\app\\ml\\login_activity_data.csv"
)

print("Dataset Loaded!")
print(df.head())
print(df.info())

Dataset Loaded!
   AccountID            Timestamp   Status       IP Address DeviceType  \
0       1229  2024-01-01 01:46:00  Success   190.230.185.22    Desktop   
1       1068  2024-01-01 02:29:00  Success    32.113.181.58     Mobile   
2       1225  2024-01-01 05:01:00  Success   21.213.246.147    Desktop   
3       1125  2024-01-01 06:50:00  Success     72.49.198.33    Desktop   
4       1056  2024-01-01 07:23:00  Success  236.197.118.251     Tablet   

   Location  LoginAttempts  
0  Pakistan              1  
1  Pakistan              1  
2       UAE              3  
3  Pakistan              1  
4  Pakistan              3  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   AccountID      5000 non-null   int64 
 1   Timestamp      5000 non-null   object
 2   Status         5000 non-null   object
 3   IP Address     5000 non-null   object
 

In [3]:
# ----------------------------------------------------------
# 2. Convert Timestamp Column (USED AS LOGIN TIME)
# ----------------------------------------------------------
df['Timestamp'] = pd.to_datetime(
    df['Timestamp'], errors='coerce'
)

# Sort by user and time
df = df.sort_values(by=['AccountID', 'Timestamp'])

In [1]:
# ----------------------------------------------------------
# 3. LOGIN-BASED FEATURE ENGINEERING 
# ----------------------------------------------------------

# ----------------------------------------------------------
# TIME FEATURES
# ----------------------------------------------------------
df['login_hour'] = df['Timestamp'].dt.hour
df['login_dayofweek'] = df['Timestamp'].dt.dayofweek
df['is_weekend'] = df['login_dayofweek'].isin([5, 6]).astype(int)

# ----------------------------------------------------------
# TIME SINCE LAST LOGIN (PER EVENT)
# ----------------------------------------------------------
df['time_since_last_login'] = (
    df.groupby('AccountID')['Timestamp']
      .diff()
      .dt.total_seconds()
)

# Replace NaN for first login of each user
df['time_since_last_login'] = df['time_since_last_login'].fillna(
    df['time_since_last_login'].median()
)

# ----------------------------------------------------------
# DEVICE / IP / LOCATION CHANGE FLAGS
# ----------------------------------------------------------
df['device_changed'] = (
    df.groupby('AccountID')['DeviceType']
      .transform(lambda x: x != x.shift())
      .fillna(0)
      .astype(int)
)

df['ip_changed'] = (
    df.groupby('AccountID')['IP Address']
      .transform(lambda x: x != x.shift())
      .fillna(0)
      .astype(int)
)

df['location_changed'] = (
    df.groupby('AccountID')['Location']
      .transform(lambda x: x != x.shift())
      .fillna(0)
      .astype(int)
)

# ----------------------------------------------------------
# LOGIN STATUS & ATTEMPT FEATURES
# ----------------------------------------------------------
df['failed_login'] = (df['Status'] == 'Failed').astype(int)
df['high_login_attempts'] = (df['LoginAttempts'] > 3).astype(int)


NameError: name 'df' is not defined

In [5]:
# ----------------------------------------------------------
# 4. FINAL LOGIN FEATURE MATRIX
# ----------------------------------------------------------
login_df = df[
    [
        'login_hour',
        'login_dayofweek',
        'is_weekend',
        'time_since_last_login',
        'device_changed',
        'ip_changed',
        'location_changed',
        'LoginAttempts',
        'failed_login',
        'high_login_attempts'
    ]
]


In [6]:

# ----------------------------------------------------------
# 5. Handle Missing Values
# ----------------------------------------------------------
login_df = login_df.fillna(
    login_df.median(numeric_only=True)
)

In [7]:
# ----------------------------------------------------------
# 6. Scale Numerical Features
# ----------------------------------------------------------
scale_cols = [
    'login_hour',
    'login_dayofweek',
    'time_since_last_login',
    'LoginAttempts'
]

scaler = StandardScaler()
login_df[scale_cols] = scaler.fit_transform(
    login_df[scale_cols]
)

# Save scaler
joblib.dump(scaler, "login_event_scaler.pkl")

['login_event_scaler.pkl']

In [8]:
# ----------------------------------------------------------
# 7. Save FINAL LOGIN EVENT FEATURES
# ----------------------------------------------------------
login_df.to_csv(
    "login_event_features.csv",
    index=False
)

print("\nLOGIN-BASED PREPROCESSING COMPLETE!")
print("Saved -> login_event_features.csv, login_event_scaler.pkl")
print("\nFinal Login Event Feature Matrix:")
print(login_df.head())


LOGIN-BASED PREPROCESSING COMPLETE!
Saved -> login_event_features.csv, login_event_scaler.pkl

Final Login Event Feature Matrix:
     login_hour  login_dayofweek  is_weekend  time_since_last_login  \
19    -1.522355        -1.019547           0              -0.299736   
137    0.060015         1.494874           1              -0.445798   
263   -1.666206         0.489105           0              -0.543571   
493    1.210829         0.489105           0              -0.205948   
668   -1.090799         0.489105           0              -0.356842   

     device_changed  ip_changed  location_changed  LoginAttempts  \
19                1           1                 1       0.039828   
137               1           1                 1      -0.845240   
263               1           1                 0       0.924896   
493               1           1                 1      -0.845240   
668               0           1                 0       0.039828   

     failed_login  high_login_atte