In [2]:
# ---------------------------------------------------------
# USER BEHAVIOR ANOMALY DETECTION MODEL TRAINING
# ---------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import joblib


In [4]:
# ---------------------------------------------------------
# 1. Load the preprocessed USER dataset
# ---------------------------------------------------------

user_df = pd.read_csv("user_behavior_features.csv")

print("User dataset loaded!")
print(user_df.head())

# Drop AccountID for model training (it is an identifier)
X_user = user_df.drop(columns=['AccountID'])

User dataset loaded!
   AccountID  avg_login_hour  std_login_hour  most_common_dow  \
0          0             NaN             NaN              NaN   
1          1             NaN             NaN              NaN   
2          2             NaN             NaN              NaN   
3          3             NaN             NaN              NaN   
4          4             NaN             NaN              NaN   

   avg_login_interval  std_login_interval  unique_devices  unique_ips  \
0                 NaN                 NaN             NaN         NaN   
1                 NaN                 NaN             NaN         NaN   
2                 NaN                 NaN             NaN         NaN   
3                 NaN                 NaN             NaN         NaN   
4                 NaN                 NaN             NaN         NaN   

   unique_locations  device_change_rate  ip_change_rate  avg_login_attempts  \
0               NaN                 NaN             NaN               

In [3]:
# ---------------------------------------------------------
# 2. Train Isolation Forest Model
# ---------------------------------------------------------

iso_user = IsolationForest(
    n_estimators=250,
    contamination=0.05,     # You can tune this (represents anomaly percentage)
    max_samples='auto',
    random_state=42,
    bootstrap=False
)

iso_user.fit(X_user)

print("\nUser Isolation Forest model trained successfully.")



User Isolation Forest model trained successfully.


In [7]:
# ---------------------------------------------------------
# 3. Predict anomalies
# ---------------------------------------------------------

pred = iso_user.predict(X_user)

# Convert:
#   1  -> normal
#  -1 -> anomaly
user_df['anomaly'] = np.where(pred == -1, 1, 0)

print("\nAnomaly prediction completed.")
print(user_df[['AccountID', 'anomaly']].head())


Anomaly prediction completed.
   AccountID  anomaly
0          0        0
1          1        0
2          2        0
3          3        0
4          4        0


In [14]:
# ---------------------------------------------------------
# 4. Evaluate Model Quality (Distribution Inspection)
# ---------------------------------------------------------

print("\nAnomaly count:")
print(user_df['anomaly'].value_counts())

# There is no true label for user behavior, so we only inspect distribution.


Anomaly count:
anomaly
0    495
Name: count, dtype: int64


In [11]:
# ---------------------------------------------------------
# 5. Save the User Model
# ---------------------------------------------------------

joblib.dump(iso_user, "user_isolation_forest_model.pkl")

print("\nModel saved as: user_isolation_forest_model.pkl")


Model saved as: user_isolation_forest_model.pkl


In [12]:
# ---------------------------------------------------------
# 6. Save the Final Prediction CSV for Analysis
# ---------------------------------------------------------

user_df.to_csv("user_behavior_with_predictions.csv", index=False)

print("\nSaved: user_behavior_with_predictions.csv")
print("\nUSER MODEL TRAINING COMPLETED SUCCESSFULLY!")


Saved: user_behavior_with_predictions.csv

USER MODEL TRAINING COMPLETED SUCCESSFULLY!
