In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np



In [17]:
# Read the event-level training and evaluation CSVs (paths are relative to the
# notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("big_skibidi.csv", "test_skibidi.csv")
)

In [18]:
def aggregate_per_user(df):
    """Collapse an event-level frame into one row per ``user_id``.

    For each user this computes:

    * ``num_videos_engaged`` - number of distinct ``video_id`` values,
    * ``total_events`` - count of non-null ``event_id`` entries,
    * ``avg_engagement_duration`` - mean of ``engagement_duration``.

    The remaining profile columns are reduced with pandas' ``"first"``
    aggregation (first non-null value within each user's rows).

    The label columns ``is_bot`` and ``bot_type`` are aggregated only when
    they exist in ``df``, so the function also works on unlabelled data.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level data containing ``user_id`` and the feature columns
        named in the aggregation spec below.

    Returns
    -------
    pandas.DataFrame
        One row per user with ``user_id`` as a regular column, followed by
        the aggregated columns in the order of the aggregation spec.
    """
    agg_funcs = {
        # String alias dispatches to pandas' built-in groupby nunique rather
        # than passing the unbound Series method as a callable.
        "video_id": "nunique",          # -> num_videos_engaged
        "event_id": "count",            # -> total_events
        "engagement_duration": "mean",  # -> avg_engagement_duration
        "account_age_days": "first",
        "followers_count": "first",
        "following_count": "first",
        "profile_pic": "first",
        "bio_length": "first",
        "verified": "first",
        "location_consistent": "first",
        "timezone_offset": "first",
    }

    # Labels are optional: include them (last, as before) only when present so
    # unlabelled frames do not fail with a KeyError.
    for label_col in ("is_bot", "bot_type"):
        if label_col in df.columns:
            agg_funcs[label_col] = "first"

    user_df = df.groupby("user_id").agg(agg_funcs).reset_index()
    user_df = user_df.rename(columns={
        "video_id": "num_videos_engaged",
        "event_id": "total_events",
        "engagement_duration": "avg_engagement_duration",
    })
    return user_df


In [19]:
# Reduce both event logs to one row per user with the shared aggregation.
train_users, test_users = (
    aggregate_per_user(events) for events in (train_df, test_df)
)

In [20]:
# Identifier and label columns must not be fed to the model as features.
drop_cols = ["user_id", "is_bot", "bot_type"]

# Targets: the per-user bot flag.
y_train, y_test = train_users["is_bot"], test_users["is_bot"]

# Features: everything else (missing drop columns are tolerated).
X_train, X_test = (
    users.drop(columns=drop_cols, errors="ignore")
    for users in (train_users, test_users)
)

In [21]:
def _with_followers_following_ratio(features):
    """Return a copy of ``features`` with ``followers_following_ratio`` appended.

    The ratio is ``followers_count / (following_count + 1)``; the ``+ 1`` keeps
    the denominator non-zero for accounts that follow nobody.
    """
    return features.assign(
        followers_following_ratio=(
            features["followers_count"] / (features["following_count"] + 1)
        )
    )


# Rebind instead of mutating in place: the cell stays idempotent, no generic
# `df` loop variable is left in the kernel namespace, and the exact same
# feature definition can be reused when scoring new data with the saved model.
X_train = _with_followers_following_ratio(X_train)
X_test = _with_followers_following_ratio(X_test)

In [22]:
# Fit the scaler on the training users only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Use the observed share of bots in the training labels as the expected
# outlier fraction.
contamination = y_train.mean()

iso = IsolationForest(
    contamination=contamination,
    max_samples="auto",
    n_estimators=200,
    random_state=42,  # fixed seed for reproducible trees
)
# Bare last expression so the notebook displays the fitted estimator.
iso.fit(X_train_scaled)

0,1,2
,n_estimators,200
,max_samples,'auto'
,contamination,np.float64(0.1)
,max_features,1.0
,bootstrap,False
,n_jobs,
,random_state,42
,verbose,0
,warm_start,False


In [None]:
# predict() labels outliers as -1; map those to the positive class (1 = bot)
# and everything else to 0.
y_pred = np.where(iso.predict(X_test_scaled) == -1, 1, 0)

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Confusion Matrix:
[[879  21]
 [ 21  79]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       900
           1       0.79      0.79      0.79       100

    accuracy                           0.96      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.96      0.96      0.96      1000



In [25]:
import joblib

# Persist the fitted detector together with the scaler it was trained behind;
# both are needed to score new users.
for artifact, filename in (
    (iso, "isolation_forest_model.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(artifact, filename)

print("✅ Model and scaler saved successfully!")


✅ Model and scaler saved successfully!


In [26]:
# Preview the first five rows of the final training feature matrix.
X_train.head(5)

Unnamed: 0,num_videos_engaged,total_events,avg_engagement_duration,account_age_days,followers_count,following_count,profile_pic,bio_length,verified,location_consistent,timezone_offset,followers_following_ratio
0,2,4,79.1225,205,6574,763,1,9,0,1,-5,8.604712
1,3,6,1.31,3,12,1278,0,0,0,0,-8,0.009382
2,5,12,5.416667,237,876,578,0,0,0,0,0,1.512953
3,1,2,53.89,67,14586,261,1,14,0,0,8,55.671756
4,5,12,3.5675,231,2624,677,1,45,0,1,-8,3.870206
