In [2]:
import pandas as pd
import uuid
import random
import json

# Source dataset; the script below requires the columns
# `tweet_text` and `cyberbullying_type`.
INPUT_CSV = "cyberbullying_tweets.csv"

# Output files written by the script below; each one carries a `uuid` column.
OUT_TWEETS = "tweets.csv"
OUT_EVENTS = "events.csv"
OUT_LABELS = "labels.csv"

# Total number of rows to sample, split evenly between the two binary classes.
N_TOTAL = 14_000
N_PER_CLASS = N_TOTAL // 2

# Vocabulary from which make_events() picks each row's event names.
EVENT_POOL = ["foul", "substitution", "caution", "sending-off", "score"]

# For reproducibility (the value can be changed). RANDOM_SEED is also passed as
# `random_state` to the DataFrame.sample() calls further down, so the name must
# stay defined even if the random.seed() call is removed.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

def normalize_label(x: str) -> str:
    """Collapse a raw ``cyberbullying_type`` value into a binary label.

    The value is stringified, stripped of surrounding whitespace and
    lower-cased. Only an exact ``"not_cyberbullying"`` match yields
    ``"not_cyberbullying"``; every other value yields ``"cyberbullying"``.
    Non-string input (e.g. ``None`` or NaN) is stringified first and
    therefore also ends up as ``"cyberbullying"``.
    """
    cleaned = str(x).strip().lower()
    if cleaned == "not_cyberbullying":
        return "not_cyberbullying"
    return "cyberbullying"

def make_events():
    """Return a random list of 3 to 6 event names taken from ``EVENT_POOL``.

    Each name is picked independently, so the same event may appear more
    than once. Draws from the module-level ``random`` generator.
    """
    count = random.randint(3, 6)  # both bounds are inclusive
    events = []
    for _ in range(count):
        events.append(random.choice(EVENT_POOL))
    return events

# Load the raw dataset and verify that the two columns used below are present.
df = pd.read_csv(INPUT_CSV)

if "tweet_text" not in df.columns or "cyberbullying_type" not in df.columns:
    raise ValueError("Il CSV deve contenere le colonne: tweet_text, cyberbullying_type")

# Derive the binary label. The frame was just loaded, so adding a column to it
# directly is safe (no defensive copy needed).
df["binary_label"] = df["cyberbullying_type"].apply(normalize_label)

# Split by class.
df_cyb = df[df["binary_label"] == "cyberbullying"]
df_not = df[df["binary_label"] == "not_cyberbullying"]

# Fail early if either class cannot supply its half of the sample.
if len(df_cyb) < N_PER_CLASS or len(df_not) < N_PER_CLASS:
    raise ValueError(
        f"Righe insufficienti per bilanciare: "
        f"cyberbullying={len(df_cyb)}, not_cyberbullying={len(df_not)}. "
        f"Servono almeno {N_PER_CLASS} per classe."
    )

# Balanced sampling: N_PER_CLASS rows from each class, seeded for repeatability.
sample_cyb = df_cyb.sample(n=N_PER_CLASS, random_state=RANDOM_SEED).reset_index(drop=True)
sample_not = df_not.sample(n=N_PER_CLASS, random_state=RANDOM_SEED).reset_index(drop=True)

# Concatenate, then shuffle so the two classes are interleaved in the output.
sampled = pd.concat([sample_cyb, sample_not], ignore_index=True)
sampled = sampled.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

# One UUID per row. uuid.uuid4() draws from OS entropy and is unaffected by
# random.seed(), so the join key used to change on every run even though the
# sampled rows and events were seeded. Build version-4 UUIDs from a dedicated
# generator seeded with RANDOM_SEED instead, so all three files are
# reproducible. A separate Random instance is used so the global random stream
# consumed by make_events() is left untouched.
uuid_rng = random.Random(RANDOM_SEED)
sampled["uuid"] = [
    str(uuid.UUID(int=uuid_rng.getrandbits(128), version=4))
    for _ in range(len(sampled))
]

# tweets.csv: uuid + tweet text
tweets_df = sampled[["uuid", "tweet_text"]].copy()
tweets_df.to_csv(OUT_TWEETS, index=False)

# labels.csv: uuid + binary label
labels_df = sampled[["uuid", "binary_label"]].copy()
labels_df.to_csv(OUT_LABELS, index=False)

# events.csv: uuid + a JSON-encoded list of events. make_events() is called
# once per row, in row order.
events_df = sampled[["uuid"]].copy()
events_df["events"] = [json.dumps(make_events()) for _ in range(len(events_df))]
events_df.to_csv(OUT_EVENTS, index=False)

print("Creati:")
print(f"- {OUT_TWEETS} ({len(tweets_df)} righe)")
print(f"- {OUT_LABELS} ({len(labels_df)} righe)")
print(f"- {OUT_EVENTS} ({len(events_df)} righe)")



Creati:
- tweets.csv (14000 righe)
- labels.csv (14000 righe)
- events.csv (14000 righe)
