In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

class CleanBotDatasetGenerator:
    """Generate a synthetic, labelled table of video-engagement events.

    Users are either humans or bots drawn from one of the archetypes in
    ``self.bot_types``.  Every user gets a profile, a device fingerprint and a
    handful of timestamped engagement events on randomly chosen videos.

    Randomness comes from the *global* ``random`` and ``np.random`` states,
    which the constructor seeds; creating a second generator therefore
    re-seeds the stream used by the first one.

    Parameters
    ----------
    seed : int
        Seed applied to both ``random`` and ``np.random``.
    reference_time : datetime, optional
        The "now" that event timestamps are placed relative to.  When omitted,
        ``datetime.now()`` is read on every call (the historical behaviour),
        which makes timestamps differ between runs even with a fixed seed.
        Pass a fixed value to get a fully reproducible dataset.
    """

    # Column order of the event table returned by ``generate_dataset``; also
    # used so that an empty result still has a well-defined schema.
    _EVENT_COLUMNS = [
        "event_id", "timestamp", "user_id", "is_bot", "bot_type",
        "account_age_days", "followers_count", "following_count",
        "profile_pic", "bio_length", "verified", "location_consistent",
        "ip_address", "timezone_offset", "video_id", "creator_id",
        "event_type", "engagement_duration",
    ]

    # Relative likelihood of a human event for each hour of day (index 0..23):
    # six low-activity hours after midnight, then a daytime/evening profile.
    _HUMAN_HOUR_WEIGHTS = [0.1] * 6 + [
        0.4, 0.6, 0.8, 0.9, 1.0, 1.0, 0.9, 0.8, 0.9,
        1.0, 1.0, 0.9, 0.8, 0.7, 0.6, 0.4, 0.3, 0.2,
    ]

    def __init__(self, seed=42, reference_time=None):
        random.seed(seed)
        np.random.seed(seed)
        self.reference_time = reference_time

        # Bot archetypes.  "weight" is the relative sampling probability of the
        # archetype; the *_range tuples are inclusive bounds for randint; the
        # remaining floats are probabilities used when generating profiles and
        # timestamps; "engagement_velocity" is a key into velocity_patterns.
        self.bot_types = {
            "crude_bot": {
                "weight": 0.35,
                "account_age_range": (0, 14),
                "followers_range": (0, 100),
                "following_range": (500, 2000),
                "profile_completeness": 0.2,
                "burst_probability": 0.8,
                "off_hours_activity": 0.9,
                "engagement_velocity": "very_fast",
            },
            "sophisticated_bot": {
                "weight": 0.25,
                "account_age_range": (30, 365),
                "followers_range": (200, 3000),
                "following_range": (100, 800),
                "profile_completeness": 0.7,
                "burst_probability": 0.4,
                "off_hours_activity": 0.3,
                "engagement_velocity": "fast",
            },
            "compromised_account": {
                "weight": 0.15,
                "account_age_range": (180, 1000),
                "followers_range": (500, 8000),
                "following_range": (200, 1500),
                "profile_completeness": 0.9,
                "burst_probability": 0.6,
                "off_hours_activity": 0.7,
                "engagement_velocity": "medium",
            },
            "purchased_followers": {
                "weight": 0.15,
                "account_age_range": (14, 180),
                "followers_range": (1000, 20000),
                "following_range": (50, 500),
                "profile_completeness": 0.8,
                "burst_probability": 0.5,
                "off_hours_activity": 0.5,
                "engagement_velocity": "medium",
            },
            "coordinated_inauthentic": {
                "weight": 0.10,
                "account_age_range": (7, 90),
                "followers_range": (100, 1000),
                "following_range": (200, 1000),
                "profile_completeness": 0.6,
                "burst_probability": 0.9,
                "off_hours_activity": 0.8,
                "engagement_velocity": "very_fast",
            }
        }

        # Engagement velocity patterns: (low, high) bounds passed to
        # random.uniform for "engagement_duration" (presumably seconds --
        # the unit is not stated anywhere in this file).
        self.velocity_patterns = {
            "very_fast": (0.1, 2),
            "fast": (1, 10),
            "medium": (10, 120),
            "slow": (120, 3600),
            "human_like": (30, 1800)
        }

    def _pick_bot_type(self):
        """Sample one archetype name from ``self.bot_types`` by its weight."""
        names = list(self.bot_types.keys())
        weights = [self.bot_types[name]["weight"] for name in names]
        return random.choices(names, weights=weights)[0]

    def _current_time(self):
        """Return the fixed reference time if one was given, else the wall clock."""
        if self.reference_time is not None:
            return self.reference_time
        return datetime.now()

    def generate_user_profile(self, user_id, is_bot, bot_type=None):
        """Build the static profile of one user.

        Parameters
        ----------
        user_id : str
            Identifier copied into the profile.
        is_bot : bool
            Whether the user is a bot.
        bot_type : str, optional
            Key of ``self.bot_types``.  Ignored for humans (their ``bot_type``
            is always ``"human"``).  If a bot is requested without a type, one
            is sampled by archetype weight so the profile stays usable by
            ``generate_temporal_pattern``.

        Returns
        -------
        dict
            Profile fields plus ``bot_type`` and ``is_bot`` (as 0/1).

        Raises
        ------
        KeyError
            If ``bot_type`` is given for a bot but is not a known archetype.
        """
        if is_bot and not bot_type:
            # Previously this fell through to the human branch and produced an
            # inconsistent is_bot=1 / bot_type="human" record.
            bot_type = self._pick_bot_type()

        if is_bot:
            bot_config = self.bot_types[bot_type]
            account_age = random.randint(*bot_config["account_age_range"])
            followers = random.randint(*bot_config["followers_range"])
            following = random.randint(*bot_config["following_range"])
            profile_pic = int(random.random() < bot_config["profile_completeness"])
            # A bio is only generated for accounts that have a profile picture.
            bio_length = int(np.random.exponential(20)) if profile_pic else 0
            verified = 0
            location_consistent = int(random.random() < 0.3)
        else:
            account_age = int(np.random.exponential(200)) + 30
            followers = int(np.random.lognormal(4, 1.5))
            following = int(np.random.lognormal(4, 1))
            profile_pic = int(random.random() < 0.85)
            bio_length = int(np.random.exponential(50)) if profile_pic else 0
            verified = int(random.random() < 0.02)
            location_consistent = int(random.random() < 0.9)
            bot_type = "human"

        return {
            "user_id": user_id,
            "account_age_days": int(account_age),
            "followers_count": int(followers),
            "following_count": int(following),
            "profile_pic": profile_pic,
            "bio_length": bio_length,
            "verified": verified,
            "location_consistent": location_consistent,
            "bot_type": bot_type,
            "is_bot": int(is_bot)
        }

    def generate_device_info(self, is_bot):
        """Return a random ``ip_address`` and ``timezone_offset`` for a user.

        Bots draw their first two octets from a small fixed pool 60% of the
        time and pick their timezone from five fixed offsets; humans get a
        random prefix and any integer offset in [-12, 12].
        """
        if is_bot:
            if random.random() < 0.6:
                ip_prefix = random.choice(["185.220", "192.168", "10.0", "172.16"])
            else:
                ip_prefix = f"{random.randint(1,223)}.{random.randint(0,255)}"
            timezone_offset = random.choice([-8, -5, 0, 1, 8])
        else:
            ip_prefix = f"{random.randint(1,223)}.{random.randint(0,255)}"
            timezone_offset = random.randint(-12, 12)
        return {
            "ip_address": f"{ip_prefix}.{random.randint(1,254)}.{random.randint(1,254)}",
            "timezone_offset": timezone_offset
        }

    def generate_temporal_pattern(self, user_profile, num_events):
        """Generate ``num_events`` sorted ``datetime`` stamps for one user.

        The pattern starts at a base time 1-30 days before the reference time.
        Bots either fire a burst inside a 10-300 minute window (optionally
        forced into hours 02-06) or spread events uniformly over about a week.
        Humans get a day offset from an exponential distribution and an hour
        of day drawn from ``_HUMAN_HOUR_WEIGHTS``.

        Note that forward offsets are not capped, so a stamp can land after
        the reference time when the base time is recent.

        Parameters
        ----------
        user_profile : dict
            A profile from ``generate_user_profile`` (``is_bot``, ``bot_type``).
        num_events : int
            Number of timestamps to produce.

        Returns
        -------
        list of datetime
            Timestamps in ascending order.
        """
        base_time = self._current_time() - timedelta(days=random.randint(1, 30))
        timestamps = []
        if user_profile["is_bot"]:
            bot_config = self.bot_types[user_profile["bot_type"]]
            if random.random() < bot_config["burst_probability"]:
                burst_duration = random.randint(10, 300)
                for _ in range(num_events):
                    offset_minutes = random.randint(0, burst_duration)
                    timestamp = base_time + timedelta(minutes=offset_minutes)
                    if random.random() < bot_config["off_hours_activity"]:
                        timestamp = timestamp.replace(hour=random.randint(2, 6))
                    timestamps.append(timestamp)
            else:
                for _ in range(num_events):
                    days_offset = random.randint(0, 7)
                    minutes_offset = random.randint(0, 1440)
                    timestamps.append(base_time + timedelta(days=days_offset, minutes=minutes_offset))
        else:
            # The sampled hour is an hour *of day*, so it must be applied from
            # midnight.  Adding it to base_time directly (as before) shifted
            # the whole activity curve by the wall-clock time of the run.
            day_start = base_time.replace(hour=0, minute=0, second=0, microsecond=0)
            for _ in range(num_events):
                days_offset = int(np.random.exponential(2))
                hour = random.choices(range(24), weights=self._HUMAN_HOUR_WEIGHTS)[0]
                minute = random.randint(0, 59)
                timestamps.append(day_start + timedelta(days=days_offset, hours=hour, minutes=minute))
        return sorted(timestamps)

    def _generate_user_events(self, user, videos, next_event_number):
        """Create all event rows of one user; ids start at ``next_event_number``."""
        device_info = self.generate_device_info(user["is_bot"])
        engaged_videos = random.sample(videos, k=random.randint(1, min(5, len(videos))))

        # The duration bounds depend only on the user, so look them up once.
        if user["is_bot"]:
            velocity_key = self.bot_types[user["bot_type"]]["engagement_velocity"]
        else:
            velocity_key = "human_like"
        velocity_range = self.velocity_patterns[velocity_key]

        user_events = []
        for video in engaged_videos:
            num_events = random.choice([1, 2, 3])
            for t in self.generate_temporal_pattern(user, num_events):
                event_type = random.choices(["view", "like", "share", "comment"], weights=[0.7, 0.2, 0.07, 0.03])[0]
                engagement_duration = random.uniform(*velocity_range)
                user_events.append({
                    "event_id": f"e{next_event_number + len(user_events)}",
                    "timestamp": t.isoformat(),
                    "user_id": user["user_id"],
                    "is_bot": user["is_bot"],
                    "bot_type": user["bot_type"],
                    "account_age_days": user["account_age_days"],
                    "followers_count": user["followers_count"],
                    "following_count": user["following_count"],
                    "profile_pic": user["profile_pic"],
                    "bio_length": user["bio_length"],
                    "verified": user["verified"],
                    "location_consistent": user["location_consistent"],
                    "ip_address": device_info["ip_address"],
                    "timezone_offset": device_info["timezone_offset"],
                    "video_id": video["video_id"],
                    "creator_id": video["creator_id"],
                    "event_type": event_type,
                    "engagement_duration": round(engagement_duration, 2)
                })
        return user_events

    def generate_dataset(self, num_videos=20, num_users=200, bot_fraction=0.25, save_to_csv=True, filename="clean_bot_dataset.csv"):
        """Generate the full event table and optionally write it to CSV.

        Parameters
        ----------
        num_videos : int
            Number of videos users can engage with.  With zero videos an
            empty table (with the normal columns) is returned.
        num_users : int
            Total number of users; ``int(num_users * bot_fraction)`` are bots.
        bot_fraction : float
            Share of bots, must lie in [0, 1].
        save_to_csv : bool
            When true, write the table to ``filename`` and print its shape.
        filename : str
            Destination path of the CSV file.

        Returns
        -------
        pandas.DataFrame
            One row per engagement event, columns as in ``_EVENT_COLUMNS``.

        Raises
        ------
        ValueError
            If ``bot_fraction`` is outside [0, 1].
        """
        if not 0 <= bot_fraction <= 1:
            raise ValueError(f"bot_fraction must be between 0 and 1, got {bot_fraction!r}")

        videos = [{"video_id": f"v{i+1}", "creator_id": f"creator_{i+1}"} for i in range(num_videos)]

        num_bots = int(num_users * bot_fraction)
        users = []
        for i in range(num_bots):
            bot_type = self._pick_bot_type()
            users.append(self.generate_user_profile(f"bot_user_{i+1}", True, bot_type))
        for i in range(num_users - num_bots):
            users.append(self.generate_user_profile(f"human_user_{i+1}", False))
        random.shuffle(users)

        events = []
        # Without videos there is nothing to engage with (and sampling at
        # least one video per user would be impossible).
        if videos:
            for user in users:
                events.extend(self._generate_user_events(user, videos, len(events) + 1))

        # Passing the column list keeps the schema stable even when empty.
        df = pd.DataFrame(events, columns=self._EVENT_COLUMNS)
        if save_to_csv:
            df.to_csv(filename, index=False)
            print(f"Dataset saved to {filename}, shape={df.shape}")
        return df


In [2]:
# Build the generator with its default seed.
gen = CleanBotDatasetGenerator()

# 200 users (a quarter of them bots) engaging with 20 videos; the resulting
# event table is also written to disk as CSV.
df = gen.generate_dataset(num_videos=20, num_users=200, bot_fraction=0.25,
                          save_to_csv=True, filename="clean_skibidi_dataset.csv")


Dataset saved to clean_skibidi_dataset.csv, shape=(1271, 18)
