In [19]:
import pandas as pd
import random

def csv_to_sql_random_creator(csv_file, table_name, sql_file):
    # Load CSV into DataFrame
    df = pd.read_csv(csv_file)

    with open(sql_file, "w", encoding="utf-8") as f:
        f.write(f"-- SQL generated from {csv_file}\n\n")

        for _, row in df.iterrows():
            video_id = row["video_id"]
            interaction_type = str(row["event_type"]).upper()  # Force uppercase
            engagement_duration = row["engagement_duration"]
            timestamp = row["timestamp"]  # Use timestamp from CSV

            # Assign random creator_id between 1 and 1000
            creator_id = random.randint(1, 1000)

            sql = (
                f"INSERT INTO {table_name} "
                f"(video_id, creator_id, interaction_type, timestamp, engagement_duration)\n"
                f"VALUES ('{video_id}', {creator_id}, '{interaction_type}', '{timestamp}', {engagement_duration});\n"
            )
            f.write(sql)

    print(f"✅ SQL file generated: {sql_file}")


# Example usage
csv_to_sql_random_creator(
    r"C:\Users\jeany\Desktop\NUS\Projects\TikTokJam-2025\ai\bot_detection\clean_bot_dataset.csv",
    "interaction_event_entity",
    "output_new.sql"
)


✅ SQL file generated: output_new.sql


In [4]:
import requests

# The Flask server URL
# BASE_URL = "http://127.0.0.1:5000/admin/evaluate-video"
BASE_URL = "https://tiertok-ai-server.onrender.com/admin/evaluate-video"

# Example video URL
video_url = "b58c3991_tiktok_7480905344793791762_video.mp4"
try:
    # Pass the video URL as a query parameter
    response = requests.get(BASE_URL, params={"video_id": video_url})
    
    # Check for successful response
    if response.status_code == 200:
        data = response.json()
        print(f"Quality score: {data.get('quality_score')}")
        if 'error' in data:
            print(f"Error: {data['error']}")
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)
except Exception as e:
    print(f"An error occurred:{e}")
    raise


Quality score: 0.6


In [7]:
# Flask server URL
BASE_URL = "http://127.0.0.1:5000/admin/categorize-video"
video_id = video_url
try:
    # Send GET request with video_id as a query parameter
    response = requests.get(BASE_URL, params={"video_id": video_id})
    
    # Check response
    if response.status_code == 200:
        data = response.json()
        print("Categorization results:")
        print(data)
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)
except Exception as e:
    print(f"An error occurred: {e}")

Request failed with status code: 500
{"error":"axis 1 is out of bounds for array of dimension 1"}



In [13]:
import re
from collections import defaultdict

def extract_creators_and_videos(sql_file_path):
    creators_dict = defaultdict(list)

    # This regex captures everything inside VALUES(...)
    values_pattern = re.compile(r"VALUES\s*\((.*)\);", re.DOTALL)

    with open(sql_file_path, "r", encoding="utf-8") as f:
        for line in f:
            match = values_pattern.search(line)
            if not match:
                continue

            values_str = match.group(1)
            
            # Split by commas that are not inside quotes
            parts = re.split(r",(?=(?:[^']*'[^']*')*[^']*$)", values_str.strip())

            # Clean whitespace and surrounding quotes
            parts = [p.strip().strip("'") for p in parts]

            if len(parts) < 3:
                continue

            video_id = parts[0]
            creator_id = parts[2]

            creators_dict[creator_id].append(video_id)

    return creators_dict


sql_file = "video.sql"  # your SQL file
creators_videos = extract_creators_and_videos(sql_file)

# Print results
print(creators_videos)


defaultdict(<class 'list'>, {'727': ['b58c3991_tiktok_7480905344793791762_video.mp4'], '656': ['312644b9_tiktok_7532793031649512712_video.mp4'], '925': ['24ddcf76_tiktok_7518294413584518418_video.mp4'], '120': ['4b3edb21_tiktok_7527430885956783382_video.mp4'], '303': ['c07dddab_tiktok_7449999917877120299_video.mp4'], '518': ['b9ab2b72_tiktok_7434424247029763374_video.mp4'], '861': ['faac9fd8_tiktok_7525274214937529622_video.mp4'], '139': ['f090a615_tiktok_7247868927445847301_video.mp4', '8c43606f_tiktok_7226890678834384171_video.mp4'], '897': ['89940286_tiktok_7502018523397459207_video.mp4'], '609': ['944f4610_tiktok_7505814798114884865_video.mp4'], '97': ['3757599c_tiktok_7541110263248997654_video.mp4'], '660': ['bfaec030_tiktok_7521694239118331149_video.mp4', 'bd2d2f3f_tiktok_7540356614256200982_video.mp4'], '494': ['49ce783d_tiktok_7522264619419766046_video.mp4'], '207': ['dbfb8e7d_tiktok_7510221822164602130_video.mp4', 'e48c46e6_tiktok_7532902237073460486_video.mp4'], '358': ['5614

In [16]:
def generate_dataset(
    creator_videos_map,  # dict: {creator_id: [video_id1, video_id2, ...]}
    num_users=200, 
    bot_fraction=0.25, 
    save_to_csv=True, 
    filename="clean_bot_dataset.csv"
):
    # Flatten into a list of {video_id, creator_id}
    videos = []
    for creator_id, vids in creator_videos_map.items():
        for v in vids:
            videos.append({"video_id": v, "creator_id": creator_id})

    num_videos = len(videos)

    # --- user generation ---
    num_bots = int(num_users * bot_fraction)
    users = []
    for i in range(num_bots):
        bot_type = random.choices(
            list(self.bot_types.keys()), 
            weights=[self.bot_types[bt]["weight"] for bt in self.bot_types.keys()]
        )[0]
        users.append(self.generate_user_profile(f"bot_user_{i+1}", True, bot_type))
    for i in range(num_users - num_bots):
        users.append(self.generate_user_profile(f"human_user_{i+1}", False))
    random.shuffle(users)

    events = []
    for user in users:
        device_info = self.generate_device_info(user["is_bot"])
        engaged_videos = random.sample(videos, k=random.randint(1, min(5, num_videos)))
        for video in engaged_videos:
            num_events = random.choice([1,2,3])
            for t in self.generate_temporal_pattern(user, num_events):
                event_type = random.choices(
                    ["view","like","share","comment"], 
                    weights=[0.7,0.2,0.07,0.03]
                )[0]
                velocity_range = (
                    self.velocity_patterns["human_like"] 
                    if not user["is_bot"] 
                    else self.velocity_patterns[self.bot_types[user["bot_type"]]["engagement_velocity"]]
                )
                engagement_duration = random.uniform(*velocity_range)
                events.append({
                    "event_id": f"e{len(events)+1}",
                    "timestamp": t.isoformat(),
                    "user_id": user["user_id"],
                    "is_bot": user["is_bot"],
                    "bot_type": user["bot_type"],
                    "account_age_days": user["account_age_days"],
                    "followers_count": user["followers_count"],
                    "following_count": user["following_count"],
                    "profile_pic": user["profile_pic"],
                    "bio_length": user["bio_length"],
                    "verified": user["verified"],
                    "location_consistent": user["location_consistent"],
                    "ip_address": device_info["ip_address"],
                    "timezone_offset": device_info["timezone_offset"],
                    "video_id": video["video_id"],
                    "creator_id": video["creator_id"],
                    "event_type": event_type,
                    "engagement_duration": round(engagement_duration, 2)
                })

    df = pd.DataFrame(events)
    if save_to_csv:
        df.to_csv(filename, index=False)
        print(f"Dataset saved to {filename}, shape={df.shape}")
    return df


In [17]:
df = generate_dataset(creators_videos, num_users=200, bot_fraction=0.2)

NameError: name 'random' is not defined