In [1]:
import random
import re
import spacy
import pandas as pd

# Load the spaCy "en_core_web_sm" pipeline once, at module level; the resulting
# `nlp` callable is reused by extract_features() to run named-entity recognition
# (NER) on each tweet.
# NOTE(review): spacy.load() presumably requires the model package to be installed
# beforehand (e.g. `python -m spacy download en_core_web_sm`) — confirm for the
# environment this notebook runs in.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Vocabulary pools that the tweet generator samples from.

# Names substituted for the [POLITICIAN] placeholder and reused for the @mention.
politicians_global = [
    "Biden", "Modi", "Merkel",
    "Macron", "Trudeau", "Zelensky",
    "Xi Jinping", "Putin", "Sunak",
    "Sanchez", "Ardern", "Bolsonaro",
    "López Obrador", "Erdogan", "Al-Sisi",
]

# Cities substituted for the [PLACE] placeholder.
places_global = [
    "New York", "Delhi", "Berlin", "Paris",
    "Toronto", "Kyiv", "Beijing", "Moscow",
    "London", "Madrid", "Cairo", "Tokyo",
    "Sydney", "Mexico City", "Brasilia", "Istanbul",
    "Johannesburg", "Dubai", "Rome", "Seoul",
]

# Policy topics substituted for the [ACTION] placeholder.
actions_global = [
    "crisis", "development", "security",
    "political discourse", "election", "healthcare reform",
    "climate action", "peace talks", "trade deal",
    "education reform", "tax policy", "immigration",
    "economic relief", "sanctions", "diplomacy",
]

# Hashtags appended (two per tweet) after the filled-in template.
hashtags_list_global = [
    "#WorldNews", "#Politics", "#GlobalUpdate",
    "#Economy", "#Security", "#Environment",
    "#HumanRights", "#HealthCare", "#Education",
    "#Peace", "#Election2024", "#ClimateAction",
    "#TradeTalks", "#ReliefFund", "#Diplomacy",
]


In [3]:
# Tweet templates sampled by generate_global_tweet().
# Each string may contain the placeholders [POLITICIAN], [PLACE] and [ACTION], which the
# generator fills via str.replace(); not every template uses all three placeholders.
# Every template already ends with at least one hard-coded hashtag, so those hashtags are
# counted by extract_features() in addition to the two the generator appends.
# List order matters for reproducibility: random.choice() picks by position.
templates_global = [
    "[POLITICIAN] has announced a new [ACTION] policy. Share your thoughts! #WorldNews #Politics",
    "Have you seen the developments over the past year? [POLITICIAN]'s efforts are trending! 😎 #GlobalUpdate",
    "A major [ACTION] project just launched in [PLACE]! Stay informed. #Economy #Development",
    "Breaking news: a special announcement by [POLITICIAN] is happening today. Stay tuned. 🗣️ #BreakingNews",
    "[POLITICIAN] has called for action on [ACTION] during the recent summit in [PLACE]. What are your views? #Diplomacy #GlobalNews",
    "In a surprising move, [POLITICIAN] supports [ACTION] in [PLACE]. Is this a new era of cooperation? #Peace #Policy",
    "[PLACE] is now the center of a major [ACTION] initiative led by [POLITICIAN]. #Development #Economy",
    "The new [ACTION] plan proposed by [POLITICIAN] is already stirring debate worldwide. Thoughts? #WorldNews",
    "Could [POLITICIAN]'s stance on [ACTION] in [PLACE] shift the global balance? Tune in. #Geopolitics #Update",
    "[POLITICIAN]'s recent [ACTION] policy is shaking things up. Are we seeing a turning point in [PLACE]? #BreakingNews",
    "Public opinion on [POLITICIAN]'s stance on [ACTION] is split. What's your take? #GlobalDebate #Politics",
    "[PLACE] braces for a new era as [POLITICIAN] pushes forward on [ACTION]. Thoughts? #Economy #Reform",
    "[POLITICIAN] and [PLACE] are joining forces on a groundbreaking [ACTION] agreement! #GlobalNews",
    "Today's big question: Will [POLITICIAN]'s new [ACTION] plan work? Sound off below! #PolicyDebate",
    "With [POLITICIAN]'s recent announcement on [ACTION], [PLACE] is taking center stage. #Update #WorldNews",
    "Is the new [ACTION] strategy by [POLITICIAN] the solution we've been waiting for? #Change #Politics",
    "[PLACE] is leading a major [ACTION] project with [POLITICIAN]. Can this impact global policy? #Insight",
    "The latest on [POLITICIAN]'s push for [ACTION] in [PLACE] is trending worldwide! #Policy #News",
    "Is [POLITICIAN]'s stance on [ACTION] changing the political landscape in [PLACE]? #Politics #Update",
    "Just in: [POLITICIAN] addresses [ACTION] concerns in [PLACE]. What's next? #BreakingNews",
    "Could [POLITICIAN]'s vision for [ACTION] transform [PLACE] as we know it? #Change #Politics",
    "A recent study on [POLITICIAN]'s [ACTION] policies in [PLACE] shows mixed results. #Insight #GlobalNews",
    "Developing story: [PLACE] responds to [POLITICIAN]'s call for [ACTION] reform. #Policy #Debate",
    "Massive [ACTION] projects are underway in [PLACE] with [POLITICIAN]'s backing. #Economy #WorldNews",
    "As [POLITICIAN] pushes for [ACTION], [PLACE] might see a historic shift. #Insight #Geopolitics",
    "[POLITICIAN]'s call for [ACTION] has [PLACE] on alert. What do you think? #Change #Policy",
    "Will [POLITICIAN]'s latest [ACTION] proposal impact global relations? #International #Diplomacy",
    "[POLITICIAN]'s [ACTION] strategy for [PLACE] is sparking discussions worldwide. #WorldNews #Update",
    "Leaders worldwide are responding to [POLITICIAN]'s views on [ACTION] in [PLACE]. #Diplomacy #Politics",
    "A closer look at [POLITICIAN]'s plans for [ACTION] and their implications for [PLACE]. #Policy #GlobalImpact",
]



In [4]:
# Function to generate a random tweet
def generate_global_tweet(templates=None, politicians=None, places=None,
                          actions=None, hashtags=None, rng=None):
    """Build one synthetic tweet: a filled-in template plus one @mention and two hashtags.

    Called with no arguments it samples from the notebook-level pools
    (templates_global, politicians_global, places_global, actions_global,
    hashtags_list_global) using the global `random` module.

    Args:
        templates: Optional sequence of template strings containing the
            placeholders [POLITICIAN], [PLACE] and [ACTION].
        politicians: Optional sequence of names used for [POLITICIAN] and for
            the trailing @mention.
        places: Optional sequence of place names used for [PLACE].
        actions: Optional sequence of topics used for [ACTION].
        hashtags: Optional sequence of hashtag strings (including the leading '#').
        rng: Optional object exposing ``choice(seq)`` (for example a
            ``random.Random`` instance) so generation can be made reproducible.

    Returns:
        str: The generated tweet text.
    """
    # Fall back to the module-level pools only for arguments the caller left out.
    templates = templates_global if templates is None else templates
    politicians = politicians_global if politicians is None else politicians
    places = places_global if places is None else places
    actions = actions_global if actions is None else actions
    hashtags = hashtags_list_global if hashtags is None else hashtags
    rng = random if rng is None else rng

    # Draw order (template, politician, place, action, mention, hashtag, hashtag)
    # is kept stable so a seeded generator yields a stable sequence of tweets.
    tweet = rng.choice(templates)
    tweet = tweet.replace("[POLITICIAN]", rng.choice(politicians))
    tweet = tweet.replace("[PLACE]", rng.choice(places))
    tweet = tweet.replace("[ACTION]", rng.choice(actions))

    # A handle cannot contain spaces or hyphens: "@Xi Jinping" would be read as the
    # mention "@Xi" followed by a stray word. Strip every non-word character so the
    # whole name forms a single @handle that the `@\w+` pattern matches in full.
    handle = re.sub(r"\W+", "", rng.choice(politicians))

    # Add random mentions and hashtags (the two hashtags are drawn independently,
    # so the same one may appear twice).
    return f"{tweet} @{handle} {rng.choice(hashtags)} {rng.choice(hashtags)}"


In [5]:
# Feature extraction function
def extract_features(tweet, bot_threshold=3, nlp_model=None):
    """Compute mention/hashtag counts, entity-type shares and a rule-based bot flag.

    Args:
        tweet: The tweet text to analyse.
        bot_threshold: A tweet is flagged as a bot when
            ``mentions + hashtags`` is strictly greater than this value.
            Defaults to 3, the rule originally hard-coded here.
        nlp_model: Optional callable that takes the text and returns an object
            with an ``ents`` iterable whose items expose ``label_``. Defaults to
            the module-level spaCy pipeline ``nlp``.

    Returns:
        dict with keys "tweet", "mentions", "hashtags", "ORG_percentage",
        "PERSON_percentage" (both as a percentage of all detected entities,
        0.0 when none are detected) and "BotScoreBinary" (1 or 0).

    NOTE(review): tweets from generate_global_tweet() in this notebook always carry
    one appended @mention and two appended hashtags on top of a template that
    already has at least one hashtag, so with the default threshold every
    generated row gets BotScoreBinary == 1 — confirm this constant label is intended.
    """
    # Mentions and hashtags are counted as '@' / '#' followed by word characters.
    mentions = len(re.findall(r"@\w+", tweet))
    hashtags = len(re.findall(r"#\w+", tweet))

    # Named Entity Recognition (NER): collect entity labels once, then count.
    pipeline = nlp if nlp_model is None else nlp_model
    doc = pipeline(tweet)
    labels = [ent.label_ for ent in doc.ents]

    # ORG and PERSON percentage relative to the number of entities; always float
    # so the resulting columns have a consistent type even when no entity is found.
    total_entities = len(labels)
    if total_entities > 0:
        org_percentage = (labels.count("ORG") / total_entities) * 100
        person_percentage = (labels.count("PERSON") / total_entities) * 100
    else:
        org_percentage = 0.0
        person_percentage = 0.0

    # BotScoreBinary (example rule: bot if mentions + hashtags > bot_threshold)
    bot_score_binary = 1 if (mentions + hashtags) > bot_threshold else 0

    return {
        "tweet": tweet,
        "mentions": mentions,
        "hashtags": hashtags,
        "ORG_percentage": org_percentage,
        "PERSON_percentage": person_percentage,
        "BotScoreBinary": bot_score_binary
    }

# Generate N_TWEETS synthetic global tweets and extract features.
# (An earlier comment said 100, but the loop builds 10,000 rows.)
N_TWEETS = 10_000
RANDOM_SEED = 42

# Seed the global RNG used by generate_global_tweet() so that re-running this
# cell produces the same sequence of tweets.
random.seed(RANDOM_SEED)
global_tweets_data = [extract_features(generate_global_tweet()) for _ in range(N_TWEETS)]


In [8]:
# Destination of the exported dataset, relative to the working directory.
# NOTE(review): the file name says "100k", but the generation step in this
# notebook builds 10,000 rows — confirm which size is intended.
OUTPUT_CSV_PATH = 'global_bot_data_100k.csv'

# Tabulate the per-tweet feature dicts (one row per tweet, one column per key).
global_tweets_df = pd.DataFrame(data=global_tweets_data)

# Persist the table without the positional index column.
global_tweets_df.to_csv(OUTPUT_CSV_PATH, index=False)
