In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m25.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [3]:
import os
import shutil

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
)
from xgboost import XGBClassifier

from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch

from scipy.sparse import hstack, csr_matrix

import emoji
import regex as re
import joblib

In [9]:
df = pd.read_csv("/content/synthetic_realistic_bot_dataset.csv")
df.head()


Unnamed: 0,user_id,username,tweet,description,hashtags,retweet_count,mention_count,follower_count,following_count,verified,account_age_days,label
0,110650,trade1606,Earn money with betting. offer ends soon! #dis...,Official online course promotion account. give...,#giveaway #ad #sponsored #deal #trading,5,3,2,593,False,824,1
1,102041,sam493,"Another day, more food. Stay tired everyone.",Just a human who loves holiday.,#happy,4,1,358,287,False,964,0
2,108668,maria691,Life update: coding and confused vibes.,Just a human who loves birthday.,#weekend #music #friends,16,1,620,460,False,2027,0
3,101114,john,Life update: festival and stressed vibes.,Student | friends | trying my best.,#friends,0,0,1013,396,False,1772,0
4,113902,bot4165,"Join our NFT program today, sign up today! #sp...",Automated casino alerts and updates.,#giveaway #discount #deal,53,3,350,387,False,533,1


In [10]:
def clean_text(x):
    if pd.isna(x):
        return ""
    return str(x).lower()

df["tweet"] = df["tweet"].apply(clean_text)
df["description"] = df["description"].apply(clean_text)
df["hashtags"] = df["hashtags"].apply(clean_text)


In [12]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
minilm = AutoModel.from_pretrained(model_name)
minilm.to("cpu")
minilm.eval()

def embed_minilm(texts, batch=64, max_len=128):
    all_emb = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch)):
            batch_text = texts[i:i+batch].tolist()

            enc = tokenizer(batch_text, padding=True, truncation=True,
                            max_length=max_len, return_tensors="pt")

            out = minilm(**enc)
            emb = out.last_hidden_state.mean(dim=1)

            all_emb.append(emb.numpy())
    return np.vstack(all_emb)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [13]:
df["mini_text"] = df["tweet"] + " " + df["description"]
X_mini = embed_minilm(df["mini_text"])
X_mini.shape


100%|██████████| 313/313 [04:10<00:00,  1.25it/s]


(20000, 384)

In [14]:
def extract_emoji_behavior_features(text):
    text = str(text)
    emojis_list = [c for c in text if c in emoji.EMOJI_DATA]

    emoji_count = len(emojis_list)
    unique_emoji_count = len(set(emojis_list))
    emoji_density = emoji_count / max(1, len(text))

    longest_run = 1
    run = 1
    for i in range(1, len(emojis_list)):
        if emojis_list[i] == emojis_list[i - 1]:
            run += 1
        else:
            longest_run = max(longest_run, run)
            run = 1
    longest_run = max(longest_run, run)

    is_emoji_only = int(bool(re.fullmatch(r'[\p{Emoji_Presentation}\p{Emoji}\s]+', text)))

    return [
        emoji_count, unique_emoji_count,
        emoji_density, longest_run,
        is_emoji_only
    ]


In [15]:
meta_cols = [
    "retweet_count","mention_count","follower_count",
    "following_count","verified","account_age_days"
]

X_meta_num = df[meta_cols].copy()
X_meta_num["verified"] = X_meta_num["verified"].astype(int)

combined_text = df["tweet"] + " " + df["description"] + " " + df["hashtags"]
emoji_features = np.array([extract_emoji_behavior_features(t) for t in combined_text])

X_meta_full = np.hstack([X_meta_num.values, emoji_features])

scaler = StandardScaler()
X_meta_scaled = scaler.fit_transform(X_meta_full)

X_meta_scaled.shape


(20000, 11)

In [16]:
dense_features = np.hstack([X_mini, X_meta_scaled])
dense_sparse = csr_matrix(dense_features)

X = hstack([X_tfidf, dense_sparse])
y = df["label"].values

X.shape, y.shape


((20000, 12395), (20000,))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
    random_state=42, stratify=y
)


In [18]:
model = XGBClassifier(
    n_estimators=400,
    max_depth=8,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    tree_method="hist"
)

model.fit(X_train, y_train)


In [19]:
pred = model.predict(X_test)
prob = model.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred))
print("Recall:", recall_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))
print("AUC:", roc_auc_score(y_test, prob))

cm = confusion_matrix(y_test, pred)
print("Confusion Matrix:\n", cm)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC: 1.0
Confusion Matrix:
 [[2000    0]
 [   0 2000]]


In [21]:
import pandas as pd

# Create 8 unknown users (4 human-like, 4 bot-like)
test_samples = [
    {
        "user_id": 90001,
        "username": "johndoe",
        "tweet": "Had a great day today! Loved spending time with my friends ❤️😊",
        "description": "Student | music lover | learning AI",
        "hashtags": "weekend happy",
        "retweet_count": 2,
        "mention_count": 0,
        "follower_count": 340,
        "following_count": 280,
        "verified": 0,
        "account_age_days": 2200,
        "label": -1   # unknown
    },
    {
        "user_id": 90002,
        "username": "foodqueen",
        "tweet": "Trying out a new pasta recipe today 🍝✨ so excited!",
        "description": "Food blogger.",
        "hashtags": "food cooking",
        "retweet_count": 5,
        "mention_count": 1,
        "follower_count": 1200,
        "following_count": 900,
        "verified": 0,
        "account_age_days": 1650,
        "label": -1
    },
    {
        "user_id": 90003,
        "username": "wallstreet_guy",
        "tweet": "Markets are moving wild today! Stay cautious, folks.",
        "description": "Finance | stocks | trading",
        "hashtags": "stocks finance",
        "retweet_count": 14,
        "mention_count": 3,
        "follower_count": 5600,
        "following_count": 4100,
        "verified": 1,
        "account_age_days": 2900,
        "label": -1
    },
    {
        "user_id": 90004,
        "username": "fitnesslifestyle",
        "tweet": "Morning workout done 💪🔥 Feeling pumped!",
        "description": "Gym lover. Healthy lifestyle.",
        "hashtags": "fitness health",
        "retweet_count": 3,
        "mention_count": 0,
        "follower_count": 780,
        "following_count": 650,
        "verified": 0,
        "account_age_days": 1400,
        "label": -1
    },
    # BOT-LIKE
    {
        "user_id": 90005,
        "username": "crypto_alerts_bot",
        "tweet": "🔥 Buy $BTC signals NOW! 99% profit GUARANTEED! Join the channel → http://fake-crypto.com",
        "description": "Crypto Signals Bot",
        "hashtags": "crypto bitcoin signals profit",
        "retweet_count": 57,
        "mention_count": 22,
        "follower_count": 12,
        "following_count": 2000,
        "verified": 0,
        "account_age_days": 30,
        "label": -1
    },
    {
        "user_id": 90006,
        "username": "promo_free_gifts",
        "tweet": "Win FREE iPhone! Just click here → http://fake-offers.com",
        "description": "We offer FREE gifts daily!!",
        "hashtags": "giveaway offer free iphone",
        "retweet_count": 120,
        "mention_count": 4,
        "follower_count": 33,
        "following_count": 1600,
        "verified": 0,
        "account_age_days": 11,
        "label": -1
    },
    {
        "user_id": 90007,
        "username": "news_updates_auto",
        "tweet": "Breaking: Click link for instant live updates → http://spam-news.com",
        "description": "Auto news aggregator",
        "hashtags": "breaking news update auto",
        "retweet_count": 31,
        "mention_count": 12,
        "follower_count": 58,
        "following_count": 5000,
        "verified": 0,
        "account_age_days": 90,
        "label": -1
    },
    {
        "user_id": 90008,
        "username": "best_deals_bot",
        "tweet": "✨ CRAZY DEAL! 80% OFF electronics today only → http://bestdeals.com",
        "description": "Deal notifier bot",
        "hashtags": "deal discount sale limited",
        "retweet_count": 78,
        "mention_count": 9,
        "follower_count": 40,
        "following_count": 4200,
        "verified": 0,
        "account_age_days": 54,
        "label": -1
    },
]

test_df = pd.DataFrame(test_samples)
test_df.to_csv("test_unknown_users.csv", index=False)
test_df


Unnamed: 0,user_id,username,tweet,description,hashtags,retweet_count,mention_count,follower_count,following_count,verified,account_age_days,label
0,90001,johndoe,Had a great day today! Loved spending time wit...,Student | music lover | learning AI,weekend happy,2,0,340,280,0,2200,-1
1,90002,foodqueen,Trying out a new pasta recipe today 🍝✨ so exci...,Food blogger.,food cooking,5,1,1200,900,0,1650,-1
2,90003,wallstreet_guy,"Markets are moving wild today! Stay cautious, ...",Finance | stocks | trading,stocks finance,14,3,5600,4100,1,2900,-1
3,90004,fitnesslifestyle,Morning workout done 💪🔥 Feeling pumped!,Gym lover. Healthy lifestyle.,fitness health,3,0,780,650,0,1400,-1
4,90005,crypto_alerts_bot,🔥 Buy $BTC signals NOW! 99% profit GUARANTEED!...,Crypto Signals Bot,crypto bitcoin signals profit,57,22,12,2000,0,30,-1
5,90006,promo_free_gifts,Win FREE iPhone! Just click here → http://fake...,We offer FREE gifts daily!!,giveaway offer free iphone,120,4,33,1600,0,11,-1
6,90007,news_updates_auto,Breaking: Click link for instant live updates ...,Auto news aggregator,breaking news update auto,31,12,58,5000,0,90,-1
7,90008,best_deals_bot,✨ CRAZY DEAL! 80% OFF electronics today only →...,Deal notifier bot,deal discount sale limited,78,9,40,4200,0,54,-1


In [22]:
test_df = pd.read_csv("test_unknown_users.csv")

test_df["tweet"] = test_df["tweet"].apply(clean_text)
test_df["description"] = test_df["description"].apply(clean_text)
test_df["hashtags"] = test_df["hashtags"].apply(clean_text)

# TF-IDF
test_df["tfidf_text"] = test_df["tweet"] + " " + test_df["description"] + " " + test_df["hashtags"]
X_tfidf_test = tfidf.transform(test_df["tfidf_text"])

# MiniLM
test_df["mini_text"] = test_df["tweet"] + " " + test_df["description"]
X_mini_test = embed_minilm(test_df["mini_text"])

# Metadata numeric
X_meta_test_num = test_df[meta_cols].copy()
X_meta_test_num["verified"] = X_meta_test_num["verified"].astype(int)

# Emoji
emoji_test_list = [extract_emoji_behavior_features(t)
                   for t in (test_df["tweet"] + " " + test_df["description"] + " " + test_df["hashtags"])]

emoji_test = np.array(emoji_test_list)

X_meta_test_full = np.hstack([X_meta_test_num.values, emoji_test])
X_meta_test_scaled = scaler.transform(X_meta_test_full)

dense_block_test = np.hstack([X_mini_test, X_meta_test_scaled])
dense_sparse_test = csr_matrix(dense_block_test)

X_test_full = hstack([X_tfidf_test, dense_sparse_test])

probs = model.predict_proba(X_test_full)[:,1]
preds = model.predict(X_test_full)

test_df["probability_bot"] = probs
test_df["prediction"] = preds

test_df


100%|██████████| 1/1 [00:00<00:00,  5.67it/s]


Unnamed: 0,user_id,username,tweet,description,hashtags,retweet_count,mention_count,follower_count,following_count,verified,account_age_days,label,tfidf_text,mini_text,probability_bot,prediction
0,90001,johndoe,had a great day today! loved spending time wit...,student | music lover | learning ai,weekend happy,2,0,340,280,0,2200,-1,had a great day today! loved spending time wit...,had a great day today! loved spending time wit...,0.003584,0
1,90002,foodqueen,trying out a new pasta recipe today 🍝✨ so exci...,food blogger.,food cooking,5,1,1200,900,0,1650,-1,trying out a new pasta recipe today 🍝✨ so exci...,trying out a new pasta recipe today 🍝✨ so exci...,0.15835,0
2,90003,wallstreet_guy,"markets are moving wild today! stay cautious, ...",finance | stocks | trading,stocks finance,14,3,5600,4100,1,2900,-1,"markets are moving wild today! stay cautious, ...","markets are moving wild today! stay cautious, ...",0.171942,0
3,90004,fitnesslifestyle,morning workout done 💪🔥 feeling pumped!,gym lover. healthy lifestyle.,fitness health,3,0,780,650,0,1400,-1,morning workout done 💪🔥 feeling pumped! gym lo...,morning workout done 💪🔥 feeling pumped! gym lo...,0.004146,0
4,90005,crypto_alerts_bot,🔥 buy $btc signals now! 99% profit guaranteed!...,crypto signals bot,crypto bitcoin signals profit,57,22,12,2000,0,30,-1,🔥 buy $btc signals now! 99% profit guaranteed!...,🔥 buy $btc signals now! 99% profit guaranteed!...,0.999942,1
5,90006,promo_free_gifts,win free iphone! just click here → http://fake...,we offer free gifts daily!!,giveaway offer free iphone,120,4,33,1600,0,11,-1,win free iphone! just click here → http://fake...,win free iphone! just click here → http://fake...,0.981244,1
6,90007,news_updates_auto,breaking: click link for instant live updates ...,auto news aggregator,breaking news update auto,31,12,58,5000,0,90,-1,breaking: click link for instant live updates ...,breaking: click link for instant live updates ...,0.999824,1
7,90008,best_deals_bot,✨ crazy deal! 80% off electronics today only →...,deal notifier bot,deal discount sale limited,78,9,40,4200,0,54,-1,✨ crazy deal! 80% off electronics today only →...,✨ crazy deal! 80% off electronics today only →...,0.978279,1


In [23]:
os.makedirs("saved_models", exist_ok=True)

joblib.dump(tfidf, "saved_models/tfidf_vectorizer.pkl")
joblib.dump(scaler, "saved_models/metadata_scaler.pkl")
model.save_model("saved_models/xgboost_bot_model.json")

tokenizer.save_pretrained("saved_models/minilm_tokenizer")
minilm.save_pretrained("saved_models/minilm_model")

shutil.make_archive("hybrid_bot_detector", "zip", "saved_models")

print("All models saved! Ready for Flask deployment.")


All models saved! Ready for Flask deployment.
