In [1]:
# ----------------------------------------------------------
# LOGIN ANOMALY DETECTION MODEL TRAINING (ISOLATION FOREST)
# ----------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import joblib


In [2]:
# ----------------------------------------------------------
# 1. Load Preprocessed Login Feature Dataset
# ----------------------------------------------------------
# The path is relative, so the CSV is resolved against the kernel's current
# working directory.
login_df = pd.read_csv("login_event_features.csv")

# A single print call with a newline separator emits the confirmation
# message, a five-row preview and the (rows, columns) shape on consecutive
# lines.
print(
    "Login Feature Dataset Loaded!",
    login_df.head(),
    login_df.shape,
    sep="\n",
)


Login Feature Dataset Loaded!
   login_hour  login_dayofweek  is_weekend  time_since_last_login  \
0   -1.522355        -1.019547           0              -0.299736   
1    0.060015         1.494874           1              -0.445798   
2   -1.666206         0.489105           0              -0.543571   
3    1.210829         0.489105           0              -0.205948   
4   -1.090799         0.489105           0              -0.356842   

   device_changed  ip_changed  location_changed  LoginAttempts  failed_login  \
0               1           1                 1       0.039828             0   
1               1           1                 1      -0.845240             0   
2               1           1                 0       0.924896             0   
3               1           1                 1      -0.845240             0   
4               0           1                 0       0.039828             0   

   high_login_attempts  
0                    0  
1                    0  

In [3]:

# ----------------------------------------------------------
# 2. Convert DataFrame to Model Input
# ----------------------------------------------------------
# Columns that the scoring cells further down append to login_df. They are
# excluded here so that re-running this cell after scoring rebuilds the same
# feature matrix instead of feeding the model its own outputs.
MODEL_OUTPUT_COLUMNS = ["anomaly_label", "anomaly_score", "is_anomaly"]

# errors="ignore" turns the drop into a no-op on a fresh run, when none of
# these columns exist yet; the result is then identical to using every column.
X = login_df.drop(columns=MODEL_OUTPUT_COLUMNS, errors="ignore").to_numpy()


In [4]:
# ----------------------------------------------------------
# 3. Train Isolation Forest Model
# ----------------------------------------------------------
# Unsupervised fit on the feature matrix X. fit() hands back the estimator
# itself, so construction and training are chained into one expression.
iso_forest = IsolationForest(
    n_estimators=200,       # size of the tree ensemble
    max_samples='auto',     # let the library pick the per-tree subsample size
    contamination=0.02,     # assumed share of anomalous logins (2%)
    n_jobs=-1,              # parallelise across all available cores
    random_state=42,        # fixed seed so repeated runs build the same forest
).fit(X)

print("\nIsolation Forest Model Trained Successfully!")


Isolation Forest Model Trained Successfully!


In [5]:
# ----------------------------------------------------------
# 4. Generate Anomaly Predictions
# ----------------------------------------------------------
# Score every event exactly once. predict() is the zero-threshold of this
# same decision function, so deriving the label from the stored scores avoids
# a second full pass over the forest and keeps label and score consistent by
# construction.
anomaly_scores = iso_forest.decision_function(X)
# Lower score = more anomalous

# Column order (label first, then score) is kept as before so the saved CSV
# layout does not change.
login_df['anomaly_label'] = np.where(anomaly_scores < 0, -1, 1)
# -1 = anomaly (negative score), 1 = normal

login_df['anomaly_score'] = anomaly_scores

In [6]:
# ----------------------------------------------------------
# 5. Map Labels to Human-Readable Format
# ----------------------------------------------------------
# Binary flag: 1 where the model label is -1 (anomaly), 0 for every other row.
login_df['is_anomaly'] = login_df['anomaly_label'].eq(-1).astype(int)

In [7]:

# ----------------------------------------------------------
# 6. Save Model and Scored Dataset
# ----------------------------------------------------------
# Persist the fitted estimator so it can be reloaded without retraining.
joblib.dump(value=iso_forest, filename="login_isolation_forest.pkl")

# Write the scored frame (features plus the label/score/flag columns) without
# the row index. This call is kept as the final statement of the cell, as in
# the original ordering.
login_df.to_csv(path_or_buf="login_events_scored.csv", index=False)


In [8]:

# ----------------------------------------------------------
# 7. Basic Anomaly Statistics
# ----------------------------------------------------------
# Row count of the scored frame and the number of rows flagged is_anomaly == 1.
total_events = login_df.shape[0]
anomalies = login_df['is_anomaly'].sum()

# Summary block: one print call, one line per argument.
print(
    "\nAnomaly Detection Summary",
    f"Total Login Events : {total_events}",
    f"Detected Anomalies : {anomalies}",
    f"Anomaly Percentage : {(anomalies / total_events) * 100:.2f}%",
    sep="\n",
)

# Names of the artifacts written by the save step.
print(
    "\nSaved Files:",
    "- login_isolation_forest.pkl",
    "- login_events_scored.csv",
    sep="\n",
)


Anomaly Detection Summary
Total Login Events : 5000
Detected Anomalies : 100
Anomaly Percentage : 2.00%

Saved Files:
- login_isolation_forest.pkl
- login_events_scored.csv
