In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rba-dataset/LICENSE
/kaggle/input/rba-dataset/README.md
/kaggle/input/rba-dataset/RESULTS.md
/kaggle/input/rba-dataset/rba-dataset.csv
/kaggle/input/rba-dataset/images/rq1-general.png
/kaggle/input/rba-dataset/images/rq1-login-frequency.png
/kaggle/input/rba-dataset/images/rtts-global.png
/kaggle/input/rba-dataset/images/login-overview.png
/kaggle/input/rba-dataset/images/rtts-continents.png
/kaggle/input/rba-dataset/images/rq6-rtt-0_999.png
/kaggle/input/rba-dataset/images/rq2-attack-data.png


In [2]:
df = pd.read_csv("/kaggle/input/rba-dataset/rba-dataset.csv")
df.head()

Unnamed: 0,index,Login Timestamp,User ID,Round-Trip Time [ms],IP Address,Country,Region,City,ASN,User Agent String,Browser Name and Version,OS Name and Version,Device Type,Login Successful,Is Attack IP,Is Account Takeover
0,0,2020-02-03 12:43:30.772,-4324475583306591935,,10.0.65.171,NO,-,-,29695,Mozilla/5.0 (iPhone; CPU iPhone OS 13_4 like ...,Firefox 20.0.0.1618,iOS 13.4,mobile,False,False,False
1,1,2020-02-03 12:43:43.549,-4324475583306591935,,194.87.207.6,AU,-,-,60117,Mozilla/5.0 (Linux; Android 4.1; Galaxy Nexus...,Chrome Mobile 46.0.2490,Android 4.1,mobile,False,False,False
2,2,2020-02-03 12:43:55.873,-3284137479262433373,,81.167.144.58,NO,Vestland,Urangsvag,29695,Mozilla/5.0 (iPad; CPU OS 7_1 like Mac OS X) ...,Android 2.3.3.2672,iOS 7.1,mobile,True,False,False
3,3,2020-02-03 12:43:56.180,-4324475583306591935,,170.39.78.152,US,-,-,393398,Mozilla/5.0 (Linux; Android 4.1; Galaxy Nexus...,Chrome Mobile WebView 85.0.4183,Android 4.1,mobile,False,False,False
4,4,2020-02-03 12:43:59.396,-4618854071942621186,,10.0.0.47,US,Virginia,Ashburn,398986,Mozilla/5.0 (Linux; U; Android 2.2) Build/NMA...,Chrome Mobile WebView 85.0.4183,Android 2.2,mobile,False,True,False


In [3]:
df.columns


Index(['index', 'Login Timestamp', 'User ID', 'Round-Trip Time [ms]',
       'IP Address', 'Country', 'Region', 'City', 'ASN', 'User Agent String',
       'Browser Name and Version', 'OS Name and Version', 'Device Type',
       'Login Successful', 'Is Attack IP', 'Is Account Takeover'],
      dtype='object')

In [4]:
df.dtypes


index                         int64
Login Timestamp              object
User ID                       int64
Round-Trip Time [ms]        float64
IP Address                   object
Country                      object
Region                       object
City                         object
ASN                           int64
User Agent String            object
Browser Name and Version     object
OS Name and Version          object
Device Type                  object
Login Successful               bool
Is Attack IP                   bool
Is Account Takeover            bool
dtype: object

In [5]:
df.isna().mean().sort_values(ascending=False).head(10)


Round-Trip Time [ms]    0.959195
Region                  0.001516
City                    0.000275
Device Type             0.000049
User ID                 0.000000
Login Timestamp         0.000000
Country                 0.000000
index                   0.000000
IP Address              0.000000
ASN                     0.000000
dtype: float64

In [6]:
df.shape

(31269264, 16)

In [7]:
df["Login Timestamp"] = pd.to_datetime(df["Login Timestamp"], utc=True)

df["date"] = df["Login Timestamp"].dt.date
df["hour"] = df["Login Timestamp"].dt.hour
df["is_night"] = df["hour"].between(0, 5)


# aggregate per user per day
daily = df.groupby(["User ID", "date"]).agg(
    login_count=("Login Timestamp", "count"),
    success_rate=("Login Successful", "mean"),
    night_login_ratio=("is_night", "mean"),
    unique_ips=("IP Address", "nunique"),
    unique_countries=("Country", "nunique"),
    unique_asns=("ASN", "nunique"),
    unique_devices=("Device Type", "nunique"),
).reset_index()

daily.head(), daily.shape

(               User ID        date  login_count  success_rate  \
 0 -9223371191532286299  2021-01-12            1           0.0   
 1 -9223369357534132497  2020-09-13            1           1.0   
 2 -9223369089733265380  2020-04-22            1           0.0   
 3 -9223360723444354188  2020-04-11            1           1.0   
 4 -9223360723444354188  2020-07-15            1           1.0   
 
    night_login_ratio  unique_ips  unique_countries  unique_asns  \
 0                0.0           1                 1            1   
 1                0.0           1                 1            1   
 2                0.0           1                 1            1   
 3                0.0           1                 1            1   
 4                0.0           1                 1            1   
 
    unique_devices  
 0               1  
 1               1  
 2               1  
 3               1  
 4               1  ,
 (12090619, 9))

In [8]:
# per-user behavioral baselines
user_baseline = daily.groupby("User ID").agg(
    mean_logins=("login_count", "mean"),
    std_logins=("login_count", "std"),
    mean_night_ratio=("night_login_ratio", "mean"),
    mean_unique_ips=("unique_ips", "mean"),
    mean_unique_countries=("unique_countries", "mean"),
    mean_unique_devices=("unique_devices", "mean"),
).reset_index()

user_baseline.head(), user_baseline.shape

(               User ID  mean_logins  std_logins  mean_night_ratio  \
 0 -9223371191532286299         1.00         NaN               0.0   
 1 -9223369357534132497         1.00         NaN               0.0   
 2 -9223369089733265380         1.00         NaN               0.0   
 3 -9223360723444354188         1.75    0.957427               0.0   
 4 -9223358650992576877         1.00         NaN               0.0   
 
    mean_unique_ips  mean_unique_countries  mean_unique_devices  
 0              1.0                    1.0                  1.0  
 1              1.0                    1.0                  1.0  
 2              1.0                    1.0                  1.0  
 3              1.0                    1.0                  1.0  
 4              1.0                    1.0                  1.0  ,
 (4304857, 7))

In [9]:
# join baselines back to daily data
daily_with_base = daily.merge(user_baseline, on="User ID", how="left")

# handle missing std (users with only 1 day of history)
daily_with_base["std_logins"] = daily_with_base["std_logins"].fillna(1.0)

# deviation features
daily_with_base["z_logins"] = (
    daily_with_base["login_count"] - daily_with_base["mean_logins"]
) / daily_with_base["std_logins"]

daily_with_base["ip_ratio"] = (
    daily_with_base["unique_ips"] / daily_with_base["mean_unique_ips"]
)

daily_with_base["country_ratio"] = (
    daily_with_base["unique_countries"] / daily_with_base["mean_unique_countries"]
)

daily_with_base["device_ratio"] = (
    daily_with_base["unique_devices"] / daily_with_base["mean_unique_devices"]
)

daily_with_base["night_deviation"] = (
    daily_with_base["night_login_ratio"] - daily_with_base["mean_night_ratio"]
)

daily_with_base[
    ["z_logins", "ip_ratio", "country_ratio", "device_ratio", "night_deviation"]
].describe()

Unnamed: 0,z_logins,ip_ratio,country_ratio,device_ratio,night_deviation
count,9299874.0,12090620.0,12090620.0,12090580.0,12090620.0
mean,2.580874e-18,1.0,1.0,1.0,1.624658e-18
std,0.792715,0.1445405,0.02708991,0.1235195,0.1862069
min,-14.7287,0.1666667,0.4,0.0,-0.9565217
25%,-0.4830459,1.0,1.0,1.0,0.0
50%,-0.2672612,1.0,1.0,1.0,0.0
75%,0.0,1.0,1.0,1.0,0.0
max,12.6395,9.492228,4.107143,3.36,0.997006


In [10]:
#select deviation features
features = [
    "z_logins",
    "ip_ratio",
    "country_ratio",
    "device_ratio",
    "night_deviation"
]

X = daily_with_base[features].replace([np.inf, -np.inf], np.nan).fillna(0)

# scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# train isolation forest
iso = IsolationForest(
    n_estimators=200,
    contamination=0.01,
    random_state=42,
    n_jobs=-1
)

iso.fit(X_scaled)

# Anomaly scores (higher = more anomalous)
daily_with_base["anomaly_score"] = -iso.decision_function(X_scaled)

daily_with_base["anomaly_score"].describe()

count    1.209062e+07
mean    -2.847027e-01
std      8.032754e-02
min     -3.427585e-01
25%     -3.427585e-01
50%     -3.223228e-01
75%     -2.537144e-01
max      1.411484e-01
Name: anomaly_score, dtype: float64

In [11]:
# rank most anomalous user-days
alerts = daily_with_base.sort_values("anomaly_score", ascending=False)

alerts_cols = [
    "User ID", "date", "anomaly_score",
    "login_count", "z_logins",
    "unique_ips", "ip_ratio",
    "unique_countries", "country_ratio",
    "unique_devices", "device_ratio",
    "night_login_ratio", "night_deviation"
]

alerts[alerts_cols].head(20)

Unnamed: 0,User ID,date,anomaly_score,login_count,z_logins,unique_ips,ip_ratio,unique_countries,country_ratio,unique_devices,device_ratio,night_login_ratio,night_deviation
5313145,-1105875135061296654,2020-06-09,0.141148,7,3.931233,5,3.431373,1,0.921053,2,1.627907,0.0,-0.295238
3584149,-3755286854393492752,2020-12-09,0.139217,4,5.150186,3,2.861538,2,1.952756,2,1.952756,0.0,-0.262097
11359653,8105329987683138826,2020-08-04,0.139005,7,5.678123,5,4.07563,1,0.95098,2,1.94,0.142857,-0.222877
11359627,8105329987683138826,2020-06-26,0.139005,6,4.656271,4,3.260504,1,0.95098,2,1.94,0.166667,-0.199067
8632031,3948716539449848274,2020-09-26,0.138731,2,3.88057,2,1.888889,2,1.888889,2,1.888889,0.0,-0.647059
11966404,9032744688944604344,2020-05-19,0.138728,8,5.3511,6,5.121951,2,1.944444,2,1.842105,1.0,0.819048
4417744,-2475661498283514452,2020-06-18,0.13847,3,3.60804,2,1.958333,2,1.958333,2,1.972028,0.333333,-0.449173
4417803,-2475661498283514452,2020-10-13,0.13847,3,3.60804,2,1.958333,2,1.958333,2,1.972028,0.333333,-0.449173
6202935,251549835605001850,2020-05-13,0.138196,7,4.041242,3,2.571429,1,0.923077,2,1.846154,0.0,-0.083333
9309550,4980738343515589923,2020-09-01,0.137935,6,4.806346,3,2.785714,2,1.925926,2,1.925926,0.0,-0.192308


In [12]:
# attach Attack take over labels at user-day level
labels = (
    df.assign(date=df["Login Timestamp"].dt.date)
    .groupby(["User ID", "date"])
    .agg(
        ato=("Is Account Takeover", "max"),
        attack_ip=("Is Attack IP", "max")
    )
    .reset_index()
)

alerts_with_labels = alerts.merge(
    labels,
    on=["User ID", "date"],
    how="left"
)

alerts_with_labels[["anomaly_score", "ato", "attack_ip"]].head(20)

Unnamed: 0,anomaly_score,ato,attack_ip
0,0.141148,False,True
1,0.139217,False,False
2,0.139005,False,False
3,0.139005,False,False
4,0.138731,False,False
5,0.138728,False,False
6,0.13847,False,False
7,0.13847,False,False
8,0.138196,False,False
9,0.137935,False,False


In [13]:
top_1pct = alerts_with_labels.head(int(0.01 * len(alerts_with_labels)))

top_1pct["ato"].mean(), top_1pct["attack_ip"].mean()


(np.float64(0.00011579243379154054), np.float64(0.0610639670487817))

In [23]:
# Baseline ATO rate in entire dataset
baseline_ato_rate = labels["ato"].mean()

baseline_ato_rate


np.float64(1.149651643145814e-05)

In [22]:
top_01pct = alerts_with_labels.head(int(0.001 * len(alerts_with_labels)))
top_01pct["ato"].mean()


np.float64(0.00024813895781637717)

In [24]:
lift = top_01pct["ato"].mean() / baseline_ato_rate
lift


np.float64(21.583838834639486)

In [16]:
def explain_alert(row):
    reasons = []
    if row["z_logins"] > 3:
        reasons.append("login volume far above baseline")
    if row["ip_ratio"] > 2:
        reasons.append("multiple new IP addresses")
    if row["country_ratio"] > 1.5:
        reasons.append("logins from multiple countries")
    if row["device_ratio"] > 1.5:
        reasons.append("new devices observed")
    if row["night_deviation"] > 0.5:
        reasons.append("unusual nighttime activity")
    return "; ".join(reasons)

alerts_with_labels["alert_reason"] = alerts_with_labels.apply(explain_alert, axis=1)
alerts_with_labels[["anomaly_score", "alert_reason"]].head(10)


Unnamed: 0,anomaly_score,alert_reason
0,0.141148,login volume far above baseline; multiple new ...
1,0.139217,login volume far above baseline; multiple new ...
2,0.139005,login volume far above baseline; multiple new ...
3,0.139005,login volume far above baseline; multiple new ...
4,0.138731,login volume far above baseline; logins from m...
5,0.138728,login volume far above baseline; multiple new ...
6,0.13847,login volume far above baseline; logins from m...
7,0.13847,login volume far above baseline; logins from m...
8,0.138196,login volume far above baseline; multiple new ...
9,0.137935,login volume far above baseline; multiple new ...


In [17]:
alerts_with_labels[["anomaly_score","alert_reason","ato","attack_ip"]].head(20)


Unnamed: 0,anomaly_score,alert_reason,ato,attack_ip
0,0.141148,login volume far above baseline; multiple new ...,False,True
1,0.139217,login volume far above baseline; multiple new ...,False,False
2,0.139005,login volume far above baseline; multiple new ...,False,False
3,0.139005,login volume far above baseline; multiple new ...,False,False
4,0.138731,login volume far above baseline; logins from m...,False,False
5,0.138728,login volume far above baseline; multiple new ...,False,False
6,0.13847,login volume far above baseline; logins from m...,False,False
7,0.13847,login volume far above baseline; logins from m...,False,False
8,0.138196,login volume far above baseline; multiple new ...,False,False
9,0.137935,login volume far above baseline; multiple new ...,False,False


In [25]:
top_alerts = alerts_with_labels.sort_values("anomaly_score", ascending=False).head(500)
top_alerts.to_csv("top_alerts.csv", index=False)
