In [3]:
# === BASELINE RÁPIDO: TF-IDF + LinearSVC (+ features) ===
import pandas as pd, numpy as np, re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

# 1) Cargar 9k etiquetados (usa tu archivo consolidado)
df9 = pd.read_excel("../data/processed/comentarios_9000_con_insultos.xlsx")
# si ya tienes un consolidado con label_final humano, usa ese mejor
if "label_final" not in df9.columns:
    df9["label_final"] = df9["label_comentario"]  # ajusta si corresponde

# (opcional) añadir auto-etiquetados de alta confianza de tu corrida TF
try:
    auto = pd.read_csv("../data/processed/7_predicciones_full_new.csv", usecols=[
        "comment","condiciones_cuenta","insulto","n_insultos","comment_text_length",
        "comment_likes","label_final","ml_proba_max","ml_margen"
    ])
    auto_hc = auto[(auto["label_final"].isin(["ruso","ucraniano","neutro"])) &
                   (auto["ml_proba_max"]>=0.80) & (auto["ml_margen"]>=0.30)]
    base = pd.concat([
        df9[["comment","condiciones_cuenta","insulto","n_insultos","comment_text_length","comment_likes","label_final"]],
        auto_hc
    ], ignore_index=True)
except Exception:
    base = df9[["comment","condiciones_cuenta","insulto","n_insultos","comment_text_length","comment_likes","label_final"]]

# Limpieza rápida
base = base.dropna(subset=["comment","condiciones_cuenta","label_final"]).copy()
base["comment"] = base["comment"].astype(str)
base["condiciones_cuenta"] = base["condiciones_cuenta"].astype(str).str.strip().str.lower()

def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"http\S+"," ",s)
    s = re.sub(r"@[A-Za-z0-9_]+"," ",s)
    s = re.sub(r"\s+"," ",s).strip()
    return s

base["comment_clean"] = base["comment"].apply(clean_text)
base["text_with_ctx"] = "[canal:" + base["condiciones_cuenta"] + "] " + base["comment_clean"]

# Asegurar columnas numéricas
for col in ["insulto","n_insultos","comment_text_length","comment_likes"]:
    if col not in base.columns: base[col] = 0
    base[col] = pd.to_numeric(base[col], errors="coerce").fillna(0)

X_text = base["text_with_ctx"]
y      = base["label_final"].astype(str).str.strip().str.lower()

# 2) Partición
X_train, X_val, y_train, y_val = train_test_split(
    base, y, test_size=0.2, random_state=42, stratify=y
)

# 3) ColumnTransformer: texto + tabulares
text_vect = TfidfVectorizer(
    analyzer="word", ngram_range=(1,2), min_df=2, max_features=300000,
    # añadimos también char-ngrams vía un segundo vectorizador
)
char_vect = TfidfVectorizer(
    analyzer="char", ngram_range=(3,5), min_df=2, max_features=200000
)

# ColumnTransformer con 2 ramas de texto + ramas tabulares
pre = ColumnTransformer(
    transformers=[
        ("w_tfidf", text_vect, "text_with_ctx"),
        ("c_tfidf", char_vect, "text_with_ctx"),
        ("ohe_canal", OneHotEncoder(handle_unknown="ignore"), ["condiciones_cuenta"]),
        ("num", StandardScaler(with_mean=False), ["insulto","n_insultos","comment_text_length","comment_likes"])
    ],
    remainder="drop",
    sparse_threshold=0.3
)

# 4) Clasificador: LinearSVC calibrado (para probabilidades y umbral)
svc = LinearSVC(C=1.0, class_weight="balanced")
clf = CalibratedClassifierCV(estimator=svc, method="isotonic", cv=3)  # <-- usa 'estimator'

pipe = Pipeline([
    ("pre", pre),
    ("clf", clf)
])

# 5) Entrenar
pipe.fit(X_train, y_train)

# 6) Validación + reporte
y_pred = pipe.predict(X_val)
print(classification_report(y_val, y_pred, digits=3))

# 7) Guardar modelo
import joblib, os
os.makedirs("../models/baselines", exist_ok=True)
joblib.dump(pipe, "../models/baselines/stance_tfidf_svc_calibrated.joblib")
print("✅ Guardado baseline en ../models/baselines/stance_tfidf_svc_calibrated.joblib")




              precision    recall  f1-score   support

      neutro      0.928     0.873     0.899      5321
        ruso      0.968     0.983     0.976     36222
   ucraniano      0.911     0.856     0.883      4081

    accuracy                          0.959     45624
   macro avg      0.936     0.904     0.919     45624
weighted avg      0.958     0.959     0.959     45624

✅ Guardado baseline en ../models/baselines/stance_tfidf_svc_calibrated.joblib


In [15]:
# === BASELINE RÁPIDO (limpio): TF-IDF + LinearSVC calibrado (+ features) ===
import pandas as pd, numpy as np, re, os, joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

# ---------------------------
# CONFIG
# ---------------------------
PATH_9K = "../data/processed/comentarios_9000_con_insultos.xlsx"  # tu set humano
# Usa SOLO SI es el archivo "bueno" (el de 110k con métricas altas). Si no, déjalo en None.
GOOD_AUTO_PATH = "/predicciones_full.csv"
AUTO_TH_P = 0.80       # umbral proba
AUTO_TH_M = 0.30       # umbral margen
OUT_MODEL = "../models/baselines/stance_tfidf_svc_calibrated.joblib"

# ---------------------------
# CARGA Y LIMPIEZA
# ---------------------------
df9 = pd.read_excel(PATH_9K)

# Normaliza columna de etiqueta humana
if "label_final" in df9.columns:
    df9["y"] = df9["label_final"]
else:
    df9["y"] = df9["label_comentario"]

# Normaliza nombres
MAP_NORM = {
    "pro-ucraniano": "ucraniano",
    "pro-ruso": "ruso",
    "ucraniano": "ucraniano",
    "ruso": "ruso",
    "neutro": "neutro",
    "neutral": "neutro",
    "": np.nan,
    None: np.nan
}
df9["y"] = df9["y"].astype(str).str.strip().str.lower().map(MAP_NORM)

# columnas básicas
need_cols = ["comment","condiciones_cuenta","insulto","n_insultos","comment_text_length","comment_likes","y","comment_id"]
for c in need_cols:
    if c not in df9.columns:
        df9[c] = np.nan

# Limpieza rápida de texto
def clean_text(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"http\S+"," ",s)
    s = re.sub(r"@[A-Za-z0-9_]+"," ",s)
    s = re.sub(r"\s+"," ",s).strip()
    return s

df9 = df9.dropna(subset=["comment","condiciones_cuenta","y"]).copy()
df9["comment"] = df9["comment"].astype(str)
df9["condiciones_cuenta"] = df9["condiciones_cuenta"].astype(str).str.strip().str.lower()
df9["comment_clean"] = df9["comment"].apply(clean_text)
df9["text_with_ctx"] = "[canal:" + df9["condiciones_cuenta"] + "] " + df9["comment_clean"]

# Asegura numéricos
for col in ["insulto","n_insultos","comment_text_length","comment_likes"]:
    if col not in df9.columns: df9[col] = 0
    df9[col] = pd.to_numeric(df9[col], errors="coerce").fillna(0)

# Deduplicación para evitar fuga (muy importante)
# 1) por comment_id si existe
if "comment_id" in df9.columns:
    df9 = df9.drop_duplicates(subset=["comment_id"])
# 2) por texto limpio
df9 = df9.drop_duplicates(subset=["comment_clean"])

# ---------------------------
# (OPCIONAL) PSEUDO-LABELS DE ALTA CONFIANZA DEL ARCHIVO BUENO
# ---------------------------
dfs = [df9[["comment","condiciones_cuenta","insulto","n_insultos","comment_text_length","comment_likes","text_with_ctx","y","comment_id"]]]

if GOOD_AUTO_PATH is not None and os.path.exists(GOOD_AUTO_PATH):
    auto = pd.read_csv(GOOD_AUTO_PATH, low_memory=False)
    # normaliza
    for c in ["comment","condiciones_cuenta","label_final","ml_proba_max","ml_margen"]:
        if c not in auto.columns:
            auto[c] = np.nan
    auto = auto.dropna(subset=["comment","condiciones_cuenta","label_final"]).copy()
    auto["comment"] = auto["comment"].astype(str)
    auto["condiciones_cuenta"] = auto["condiciones_cuenta"].astype(str).str.strip().str.lower()
    auto["label_final"] = auto["label_final"].astype(str).str.strip().str.lower().map(MAP_NORM)

    # filtra alta confianza
    auto_hc = auto[
        auto["label_final"].isin(["ruso","ucraniano","neutro"]) &
        (pd.to_numeric(auto["ml_proba_max"], errors="coerce") >= AUTO_TH_P) &
        (pd.to_numeric(auto["ml_margen"], errors="coerce") >= AUTO_TH_M)
    ].copy()

    # mergea columnas numéricas si están, si no rellena
    for col in ["insulto","n_insultos","comment_text_length","comment_likes"]:
        if col not in auto_hc.columns: auto_hc[col] = 0
        auto_hc[col] = pd.to_numeric(auto_hc[col], errors="coerce").fillna(0)

    auto_hc["comment_clean"] = auto_hc["comment"].apply(clean_text)
    auto_hc["text_with_ctx"] = "[canal:" + auto_hc["condiciones_cuenta"] + "] " + auto_hc["comment_clean"]
    auto_hc = auto_hc.rename(columns={"label_final":"y"})

    # deduplicar y evitar colisión con humanos
    if "comment_id" in auto_hc.columns and "comment_id" in df9.columns:
        auto_hc = auto_hc[~auto_hc["comment_id"].isin(df9["comment_id"])]
    auto_hc = auto_hc.drop_duplicates(subset=["comment_clean"])

    dfs.append(auto_hc[["comment","condiciones_cuenta","insulto","n_insultos","comment_text_length","comment_likes","text_with_ctx","y","comment_id"]])

base = pd.concat(dfs, ignore_index=True)
base = base.dropna(subset=["text_with_ctx","y"])

print("Tamaño base de entrenamiento:", len(base))
print("Distribución de y:", base["y"].value_counts())

# ---------------------------
# SPLIT
# ---------------------------
X_train, X_val, y_train, y_val = train_test_split(
    base, base["y"], test_size=0.2, random_state=42, stratify=base["y"]
)

# ---------------------------
# PREPROCESADO
# ---------------------------
text_vect = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=2, max_features=300_000, sublinear_tf=True)
char_vect = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2, max_features=200_000)

pre = ColumnTransformer(
    transformers=[
        ("w_tfidf", text_vect, "text_with_ctx"),
        ("c_tfidf", char_vect, "text_with_ctx"),
        ("ohe_canal", OneHotEncoder(handle_unknown="ignore"), ["condiciones_cuenta"]),
        ("num", StandardScaler(with_mean=False), ["insulto","n_insultos","comment_text_length","comment_likes"])
    ],
    remainder="drop",
    sparse_threshold=0.3
)

# ---------------------------
# CLASIFICADOR
# ---------------------------
svc = LinearSVC(C=0.5, class_weight="balanced", max_iter=5000)  # C=0.5 + max_iter↑ para evitar warnings
clf = CalibratedClassifierCV(estimator=svc, method="isotonic", cv=3)  # probabilidades calibradas

pipe = Pipeline([("pre", pre), ("clf", clf)])

# ---------------------------
# ENTRENAR
# ---------------------------
pipe.fit(X_train, y_train)

# ---------------------------
# VALIDACIÓN
# ---------------------------
y_pred = pipe.predict(X_val)
print("\n=== EVAL HOLD-OUT (solo humano + pseudo-HC si aplicó) ===")
print(classification_report(y_val, y_pred, digits=3))

# (Opcional) Eval SOLO en humano puro (para la memoria)
mask_hum = X_val["comment_id"].notna()  # si tus humanos tienen comment_id y los pseudo no
if mask_hum.any():
    print("\n=== EVAL (subset humano puro) ===")
    print(classification_report(y_val[mask_hum], y_pred[mask_hum], digits=3))

# ---------------------------
# GUARDAR
# ---------------------------
os.makedirs(os.path.dirname(OUT_MODEL), exist_ok=True)
joblib.dump(pipe, OUT_MODEL)
print("✅ Guardado baseline en", OUT_MODEL)
print("Orden de clases del modelo:", list(pipe.classes_))


Tamaño base de entrenamiento: 1439
Distribución de y: y
ruso         899
ucraniano    308
neutro       232
Name: count, dtype: int64

=== EVAL HOLD-OUT (solo humano + pseudo-HC si aplicó) ===
              precision    recall  f1-score   support

      neutro      0.000     0.000     0.000        46
        ruso      0.624     0.994     0.767       180
   ucraniano      0.000     0.000     0.000        62

    accuracy                          0.622       288
   macro avg      0.208     0.331     0.256       288
weighted avg      0.390     0.622     0.479       288


=== EVAL (subset humano puro) ===
              precision    recall  f1-score   support

      neutro      0.000     0.000     0.000        46
        ruso      0.624     0.994     0.767       180
   ucraniano      0.000     0.000     0.000        62

    accuracy                          0.622       288
   macro avg      0.208     0.331     0.256       288
weighted avg      0.390     0.622     0.479       288



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ Guardado baseline en ../models/baselines/stance_tfidf_svc_calibrated.joblib
Orden de clases del modelo: ['neutro', 'ruso', 'ucraniano']


In [11]:
print("Clases del modelo:", pipe.classes_)


Clases del modelo: ['neutro' 'ruso' 'ucraniano']


Inferencia para el total de los casos

In [12]:
import pandas as pd, numpy as np, joblib, os, re

MODEL_PATH = "../models/baselines/stance_tfidf_svc_calibrated.joblib"
OUT_CSV    = "../data/processed/7_predicciones_full_sklearn.csv"
CHUNK      = 50_000
UMBRAL_PROBA = 0.55
UMBRAL_MARGEN = 0.15  # margen entre top1 y top2 usando probas calibradas
LABELS = ["ruso","ucraniano","neutro"]

pipe = joblib.load(MODEL_PATH)

def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"http\S+"," ",s); s = re.sub(r"@[A-Za-z0-9_]+"," ",s)
    s = re.sub(r"\s+"," ",s).strip()
    return s

def infer_chunk(df):
    df = df.copy()
    df["comment"] = df["comment"].astype(str)
    df["condiciones_cuenta"] = df["condiciones_cuenta"].astype(str).str.strip().str.lower()
    for col in ["insulto","n_insultos","comment_text_length","comment_likes"]:
        if col not in df.columns: df[col]=0
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    df["comment_clean"] = df["comment"].apply(clean_text)
    df["text_with_ctx"] = "[canal:" + df["condiciones_cuenta"] + "] " + df["comment_clean"]

    # Probabilidades calibradas
    probas = pipe.predict_proba(df)
    top1   = probas.argmax(axis=1)
    top1_p = probas[np.arange(len(df)), top1]
    
    model_classes = pipe.classes_           # ✅ orden correcto
    df["label_ml"] = model_classes[top1]    # ✅ usa las clases reales
    
    # margen top1-top2
    sorted_p = -np.sort(-probas, axis=1)
    margen = sorted_p[:,0] - sorted_p[:,1]

    df["label_ml"]     = np.array(LABELS)[top1]
    df["ml_proba_max"] = top1_p
    df["ml_margen"]    = margen
    # abstención
    keep = (df["ml_proba_max"]>=UMBRAL_PROBA) & (df["ml_margen"]>=UMBRAL_MARGEN)
    df["label_final"] = np.where(keep, df["label_ml"], "")
    df["clasificacion_origen"] = np.where(keep, "automatica-ml-sklearn", "sin-clasificar")
    return df

# Escritura incremental
if os.path.exists(OUT_CSV): os.remove(OUT_CSV)
processed, milestone = 0, 50_000
for part in pd.read_csv("../data/processed/3_comments_youtube_with_insults.csv", chunksize=CHUNK):
    out = infer_chunk(part)
    out.to_csv(OUT_CSV, mode="a", header=not os.path.exists(OUT_CSV), index=False, encoding="utf-8-sig")
    processed += len(part)
    if processed >= milestone:
        print(f"➡️ Progreso: {processed:,} filas") 
        milestone += 50_000
print("✅ CSV listo:", OUT_CSV)


➡️ Progreso: 50,000 filas
➡️ Progreso: 100,000 filas
➡️ Progreso: 150,000 filas
➡️ Progreso: 200,000 filas
➡️ Progreso: 250,000 filas
➡️ Progreso: 300,000 filas
➡️ Progreso: 350,000 filas
✅ CSV listo: ../data/processed/7_predicciones_full_sklearn.csv


### Bots / Trolls: features + score + modelo simple

In [17]:
import pandas as pd, numpy as np, re
from datetime import datetime
from sklearn.ensemble import IsolationForest

# === Carga ===
df = pd.read_excel("predicciones_full.xlsx")
for col in ["comment_time"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# Normalizaciones rápidas
df["comment"] = df["comment"].astype(str)
if "insulto" not in df.columns: df["insulto"] = False
if "n_insultos" not in df.columns: df["n_insultos"] = 0
if "user_id" not in df.columns and "user_name" in df.columns:
    df["user_id"] = df["user_name"].fillna("")

# === utilidades para features ===
URL_RX = re.compile(r"http[s]?://|www\.", re.I)
MENT_RX = re.compile(r"@[\w_]+")
EMOJI_RX = re.compile(r"[^\w\s,.\-¡!¿?\(\)\"'@#:/]")  # aproximación

def frac_pattern(s, rx):
    m = rx.findall(s)
    return len(m)

def frac_emojis(s):
    # proporción aproximada de caracteres "no alfanuméricos estándar"
    if not s: return 0.0
    return len(EMOJI_RX.findall(s)) / max(1,len(s))

def lang_switch_ratio(texts):
    # proxy simple: cantidad de alfabetos/acentos distintos
    # (si ya tenés idioma por comentario, mejor usa % de cambio de idioma por usuario)
    return np.mean([len(set(re.findall(r"[A-Za-zÀ-ÿ]+", t)))==0 for t in texts])

# === features por usuario ===
def build_user_features(df):
    g = df.groupby("user_id", dropna=False)

    feats = pd.DataFrame({
        "user_id": g.size().index,
        "n_comments": g.size().values,
        "n_videos": g["video_title"].nunique().values if "video_title" in df.columns else g.size().values,
        "n_channels": g["channel_title"].nunique().values if "channel_title" in df.columns else g.size().values,
        "mean_len": g["comment"].apply(lambda s: np.mean([len(x) for x in s])).values,
        "dup_ratio": g["comment"].apply(lambda s: 1 - (len(set(s)) / max(1,len(s)))).values,  # 0=todo único, 1=todo duplicado
        "url_rate": g["comment"].apply(lambda s: np.mean([bool(URL_RX.search(x)) for x in s])).values,
        "mention_rate": g["comment"].apply(lambda s: np.mean([bool(MENT_RX.search(x)) for x in s])).values,
        "emoji_frac": g["comment"].apply(lambda s: np.mean([frac_emojis(x) for x in s])).values,
        "insulto_rate": g["insulto"].mean().values,
        "n_insultos_mean": g["n_insultos"].mean().values,
    })

    # cadencia temporal
    if "comment_time" in df.columns:
        ts = g["comment_time"].apply(lambda s: np.array(sorted(pd.to_datetime(s, errors="coerce").dropna().astype("int64"))))
        # intervalo medio (en horas) entre comentarios
        def mean_gap_hours(arr):
            if len(arr) < 2: return np.nan
            gaps = np.diff(arr) / (1e9*3600)  # ns -> horas
            return float(np.mean(gaps))
        feats["mean_gap_h"] = ts.apply(mean_gap_hours).values
        feats["burstiness"] = ts.apply(lambda a: 0 if len(a)<3 else float(np.std(np.diff(a))/(np.mean(np.diff(a))+1e-9))).values
    else:
        feats["mean_gap_h"] = np.nan
        feats["burstiness"] = np.nan

    # “juventud” de la cuenta si está disponible
    if "days_since_account_creation" in df.columns:
        feats["acct_age_days_median"] = g["days_since_account_creation"].median().values
    else:
        feats["acct_age_days_median"] = np.nan

    # diversidad temática (si hay tags)
    if "video_tags" in df.columns:
        feats["topic_diversity"] = g["video_tags"].apply(lambda s: len(set(";".join([str(x) for x in s]).lower().split(";")))).values
    else:
        feats["topic_diversity"] = np.nan

    # “agresividad” lexical simple
    feats["toxicity_proxy"] = feats["insulto_rate"]*0.6 + (feats["n_insultos_mean"]>0).astype(float)*0.4

    # intensidad de spam (URLs + duplicación + menciones)
    feats["spam_proxy"] = 0.5*feats["url_rate"] + 0.3*feats["dup_ratio"] + 0.2*feats["mention_rate"]

    # normalizaciones seguras
    for c in ["mean_len","mean_gap_h","emoji_frac","n_comments","n_videos","n_channels"]:
        feats[c] = feats[c].fillna(0)

    return feats

user_feats = build_user_features(df)

# === Scoring heurístico interpretable ===
# pesos simples; ajusta si querés
score = (
    0.30*user_feats["dup_ratio"] +
    0.20*user_feats["spam_proxy"] +
    0.15*(user_feats["n_comments"]>=50).astype(float) +
    0.10*(user_feats["mean_gap_h"]<=0.25).fillna(0).astype(float) +  # comenta cada ~15 min o menos
    0.10*(user_feats["emoji_frac"]>0.08).astype(float) +
    0.10*(user_feats["insulto_rate"]>0.2).astype(float) +
    0.05*(user_feats["acct_age_days_median"].fillna(365)<30).astype(float)
)
user_feats["bot_score_heur"] = score.clip(0,1)

# UMBRAL heurístico (conservador)
TH_HEUR = 0.55
user_feats["likely_bot_heur"] = user_feats["bot_score_heur"] >= TH_HEUR

# === Anomaly detection (complementario) ===
use_cols = ["dup_ratio","spam_proxy","n_comments","n_videos","mean_gap_h","emoji_frac","insulto_rate"]
X = user_feats[use_cols].replace([np.inf,-np.inf], np.nan).fillna(0).values
iso = IsolationForest(n_estimators=200, contamination=0.03, random_state=42)
pred_iso = iso.fit_predict(X)   # -1 anomalía
user_feats["likely_bot_iso"] = (pred_iso==-1)

# Fusión conservadora (OR)
user_feats["likely_bot"] = user_feats["likely_bot_heur"] | user_feats["likely_bot_iso"]

# Guardar
user_feats.to_csv("../data/processed/bot_scores_users.csv", index=False, encoding="utf-8-sig")
print(user_feats[["user_id","bot_score_heur","likely_bot_heur","likely_bot_iso","likely_bot"]].head())
print("✅ bot_scores_users.csv listo")


                    user_id  bot_score_heur  likely_bot_heur  likely_bot_iso  \
0  UC--80__NOagdmk5_yWVvMSg             0.1            False           False   
1  UC--IqT9PO6sVdn5gOJi6Vow             0.2            False           False   
2  UC--QB2gsVISlAwrJBW3Cj4Q             0.1            False           False   
3  UC--SiTzxl0sLBua11tGUf0g             0.1            False           False   
4  UC--TT_WNCBUDQSrODZYoSnw             0.0            False           False   

   likely_bot  
0       False  
1       False  
2       False  
3       False  
4       False  
✅ bot_scores_users.csv listo


In [18]:
import pandas as pd, numpy as np, re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

df = pd.read_excel("predicciones_full.xlsx", usecols=["comment","condiciones_cuenta","channel_title"])
df["comment"] = df["comment"].astype(str)
df["condiciones_cuenta"] = df["condiciones_cuenta"].astype(str).str.strip().str.lower()

# (opcional) si ya tenés label_final del stance, úsalo en lugar de 'condiciones_cuenta'
USAR_LABEL_FINAL = False
if USAR_LABEL_FINAL and "label_final" in df.columns:
    df["bando"] = df["label_final"].replace({"pro-ruso":"ruso","pro-ucraniano":"ucraniano"})
else:
    # proxy por bando del canal
    df["bando"] = df["condiciones_cuenta"].map({"pro-ruso":"ruso","pro-ucraniano":"ucraniano"}).fillna("neutro")

# limpieza ligera (no muy agresiva para no perder señales)
def clean_min(s):
    s = s.lower()
    s = re.sub(r"http\S+"," ",s)
    s = re.sub(r"@[A-Za-z0-9_]+"," ",s)
    s = re.sub(r"\s+"," ",s).strip()
    return s

df["text"] = df["comment"].apply(clean_min)

# === función NMF por bando ===
def nmf_topics_for_side(df_side, n_topics=10, n_terms=12, min_df=10, max_df=0.5, max_features=200000):
    vect = TfidfVectorizer(
        analyzer="word", ngram_range=(1,2),
        min_df=min_df, max_df=max_df, max_features=max_features,
        sublinear_tf=True
    )
    X = vect.fit_transform(df_side["text"])
    nmf = NMF(n_components=n_topics, random_state=42, init="nndsvd", max_iter=400)
    W = nmf.fit_transform(X)     # documentos x topics
    H = nmf.components_          # topics x vocab

    vocab = np.array(vect.get_feature_names_out())
    topics = []
    for k in range(n_topics):
        top_idx = np.argsort(H[k])[::-1][:n_terms]
        terms = vocab[top_idx].tolist()
        topics.append({"topic_id": k, "top_terms": terms})

    # comentarios representativos por tema
    rep = []
    for k in range(n_topics):
        # top-5 docs por peso del tema k
        doc_idx = np.argsort(W[:,k])[::-1][:5]
        reps = df_side.iloc[doc_idx][["comment","channel_title"]].assign(topic_id=k, topic_score=W[doc_idx,k])
        rep.append(reps)
    rep_df = pd.concat(rep, ignore_index=True)

    topics_df = pd.DataFrame(topics)
    return topics_df, rep_df

# === Ejecutar por bando ===
out_topics = []
out_reps   = []

for side in ["ruso","ucraniano","neutro"]:
    sub = df[df["bando"]==side].copy()
    if len(sub) < 1000:
        print(f"⚠️ Muy pocos comentarios en {side} ({len(sub)}); ajustá min_df o une con otro período.")
        continue
    topics_df, rep_df = nmf_topics_for_side(sub, n_topics=12, n_terms=12, min_df=20, max_df=0.6)
    topics_df["bando"] = side
    rep_df["bando"] = side
    out_topics.append(topics_df)
    out_reps.append(rep_df)

topics_all = pd.concat(out_topics, ignore_index=True)
reps_all   = pd.concat(out_reps, ignore_index=True)

# Guardar para PowerBI
topics_all.to_csv("../data/processed/ejes_argumentativos_topics.csv", index=False, encoding="utf-8-sig")
reps_all.to_csv("../data/processed/ejes_argumentativos_representantes.csv", index=False, encoding="utf-8-sig")

print("✅ ejes_argumentativos_topics.csv (palabras por tema y bando)")
print("✅ ejes_argumentativos_representantes.csv (comentarios ejemplares por tema y bando)")


✅ ejes_argumentativos_topics.csv (palabras por tema y bando)
✅ ejes_argumentativos_representantes.csv (comentarios ejemplares por tema y bando)




1) Explorar resultados (rápido)
1.1. Resumen del CSV de predicciones (sklearn)

In [20]:
import pandas as pd

PRED_XLSX = "predicciones_full.xlsx"
pred = pd.read_excel(PRED_XLSX)

print("Filas:", len(pred))
print(pred["clasificacion_origen"].value_counts(dropna=False))
print(pred["label_final"].value_counts(dropna=False))

# Cobertura (no abstención)
cobertura = (pred["label_final"].astype(str) != "").mean()
print(f"Cobertura: {cobertura:.3f}")

# Distribución por bando de canal
print("\nDistribución por 'condiciones_cuenta' y label_final:")
print(pred.pivot_table(index="condiciones_cuenta", columns="label_final", aggfunc="size", fill_value=0))

# % insultos por tipo de comentario (si están las columnas)
if "insulto" in pred.columns:
    print("\nTasa de insulto por label_final:")
    print(pred.groupby("label_final")["insulto"].mean().sort_values(ascending=False))


Filas: 113585
clasificacion_origen
automatica-nn-tf    91552
regla               13475
sin-clasificar       8558
Name: count, dtype: int64
label_final
ruso         76564
neutro       14621
ucraniano    13842
NaN           8558
Name: count, dtype: int64
Cobertura: 1.000

Distribución por 'condiciones_cuenta' y label_final:
label_final         neutro   ruso  ucraniano
condiciones_cuenta                          
noticiero             2893  17249       4493
pro-ruso              8433  36422       6211
pro-ucraniano         3295  22893       3138

Tasa de insulto por label_final:
label_final
ucraniano    0.281534
ruso         0.035382
neutro       0.008207
Name: insulto, dtype: float64


In [21]:
bots = pd.read_csv("../data/processed/bot_scores_users.csv")
pred_b = pred.merge(bots[["user_id","likely_bot","bot_score_heur","likely_bot_heur","likely_bot_iso"]],
                    on="user_id", how="left")

print("\n% likely_bot por 'condiciones_cuenta':")
print(pred_b.groupby("condiciones_cuenta")["likely_bot"].mean().sort_values(ascending=False))

print("\nTop 15 usuarios sospechosos (por score heurístico):")
cols = ["user_id","bot_score_heur","likely_bot","likely_bot_heur","likely_bot_iso","dup_ratio","spam_proxy","n_comments","mean_gap_h"]
print(bots.sort_values("bot_score_heur", ascending=False)[cols].head(15))



% likely_bot por 'condiciones_cuenta':
condiciones_cuenta
pro-ucraniano    0.189246
pro-ruso         0.174937
noticiero        0.148150
Name: likely_bot, dtype: float64

Top 15 usuarios sospechosos (por score heurístico):
                        user_id  bot_score_heur  likely_bot  likely_bot_heur  \
36393  UCd3myKELT4Sbn0eiA1Li29Q        0.470000        True            False   
26713  UCT-vA-wEnH23u5xoAPZ3AMA        0.470000        True            False   
17833  UCIs5UKpQpQohvcej158G0tg        0.442000        True            False   
32970  UC_7_py5TIuUk-rGZD_eQM0A        0.440000        True            False   
25771  UCRvhxzKkHPR25yloobXiRrA        0.431096        True            False   
3117   UC2V85Go8-XoRuiufNDY6zcQ        0.424000        True            False   
48902  UCrIuQN7Re-9JzE5G2Sm0n5A        0.420000        True            False   
42233  UCjcgiY3CxQN6LsydZGzYg1A        0.417671        True            False   
17190  UCI8zD9BZVJ87yb7y-qUYmGg        0.408571        Tr

In [23]:
import re
import pandas as pd
import numpy as np

# 1) Tu lista base de stopwords (la que pasaste)
spanish_stopwords = [
    "de","la","que","el","en","y","a","los","del","se","las","por","un","para",
    "con","no","una","su","al","lo","como","más","pero","sus","le","ya","o","este",
    "sí","porque","esta","entre","cuando","muy","sin","sobre","también","me","hasta",
    "hay","donde","quien","desde","todo","nos","durante","todos","uno","les","ni",
    "contra","otros","ese","eso","ante","ellos","e","esto","mí","antes","algunos",
    "qué","unos","yo","otro","otras","otra","él","tanto","esa","estos","mucho",
    "quienes","nada","muchos","cual","poco","ella","estar","estas","algunas","algo",
    "nosotros","mi","mis","tú","te","ti","tu","tus","ellas","nosotras","vosotros",
    "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
    "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
    "vuestra","vuestros","vuestras","esos","esas", "es", "ser", "fue", "son", "sido", "tiene", "tenido", "tienen", "tenía",
    "tenían", "tendría", "tendrían", "había", "habían", "habrá", "habrán", "habría", "son", "serán", "sería", "serían",
    "está", "están", "estuve", "estuvieron", "estaría", "estarían", "esté", "estén", "estuviera", "estuvieran", 
    "mas", "q", "solo", "tiene", "tienen", "si", "no", "estan", "va", "ha", "vivo", "viva", "ahora", "asi", "aqui", "ahi", 
    "siempre", "nunca", "ya", "sabe", "misma", "tambien", "era", "ve", "dio", "fueron", "toda", "misma", "hacer", "entonces", 
    "iran", "cosa", "dice", "querer", "poder",  "haber",  "ver", "tener", "dar", "deber", "creer", "saber", "seguir", "mismo", "bien", 
    "ir", "ganar", "perder", "dejar",  "decir"
]

# 2) Stopwords “meta YouTube” y ruidos comunes en comentarios
yt_stop = [
    "video","canal","like","suscribete","suscribirse","suscríbete","link",
    "gente","personas","hola","gracias","gracia","amigo","amigos","comenta",
    "comentario","comentarios","ver","viendo","visto","nuevo","noticia","noticias"
]

# 3) Lista de términos que NO queremos eliminar (claves del conflicto)
PROTECT = {"ucrania","ucraniano","ucranianos","rusia","ruso","rusos",
           "putin","zelensky","nato","otan","eeuu","estados","unidos","kiev","crimea","donbas","donbass","kherson","kharkiv"}

def build_stopwords_auto(corpus_series: pd.Series, k_top: int = 1000):
    """
    Extrae las k palabras más frecuentes del corpus y las suma a las stopwords,
    excluyendo términos protegidos (PROTECT) y palabras con dígitos.
    """
    # tokenización simple
    toks = corpus_series.str.lower().str.findall(r"[a-záéíóúñü]+")
    freq = {}
    for ts in toks.dropna():
        for t in ts:
            if any(ch.isdigit() for ch in t): 
                continue
            freq[t] = freq.get(t, 0) + 1
    # top-k menos informativos
    common = [w for w, _ in sorted(freq.items(), key=lambda x: x[1], reverse=True)[:k_top]]
    # ensamblar stopwords ampliadas
    base = set(spanish_stopwords) | set(yt_stop)
    extra = [w for w in common if (w not in PROTECT) and (len(w) > 2)]
    return sorted(base | set(extra))

# 4) Construye la lista final de stopwords a partir de tu DataFrame de textos LIMPIOS
#    Usa el mismo df/texto que alimenta al NMF (p.ej. df["text"])
#    Ejemplo: stopwords_final = build_stopwords_auto(df["text"], k_top=800)


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def nmf_topics_for_side(df_side, n_topics=12, n_terms=12, min_df=20, max_df=0.6, max_features=200000):
    # stopwords automáticas basadas en el subcorpus de ese bando
    stopwords_final = build_stopwords_auto(df_side["text"], k_top=800)

    vect = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1,2),
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        sublinear_tf=True,
        stop_words=stopwords_final,
        token_pattern=r"(?u)\b\w\w+\b"   # evita tokens de 1 carácter
    )
    X = vect.fit_transform(df_side["text"])
    nmf = NMF(n_components=n_topics, random_state=42, init="nndsvd", max_iter=500, alpha_W=0.0, alpha_H=0.0, l1_ratio=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_

    vocab = np.array(vect.get_feature_names_out())
    topics = []
    for k in range(n_topics):
        top_idx = np.argsort(H[k])[::-1][:n_terms]
        topics.append({"topic_id": k, "top_terms": vocab[top_idx].tolist()})

    # Representativos por tema
    rep_rows = []
    for k in range(n_topics):
        doc_idx = np.argsort(W[:, k])[::-1][:5]
        reps = df_side.iloc[doc_idx][["comment","channel_title"]].assign(topic_id=k, topic_score=W[doc_idx, k])
        rep_rows.append(reps)
    rep_df = pd.concat(rep_rows, ignore_index=True)

    topics_df = pd.DataFrame(topics)
    return topics_df, rep_df


In [25]:
import pandas as pd
import ast

TOPICS_PATH = "../data/processed/ejes_argumentativos_topics.csv"
REPS_PATH   = "../data/processed/ejes_argumentativos_representantes.csv"

topics = pd.read_csv(TOPICS_PATH)
reps   = pd.read_csv(REPS_PATH)

def parse_terms(x):
    if isinstance(x, list): return x
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            if isinstance(v, list): return [str(t) for t in v]
        except Exception:
            return [t.strip() for t in x.split(",") if t.strip()]
    return []

topics["bando"] = topics["bando"].astype(str).str.strip().str.lower()
reps["bando"]   = reps["bando"].astype(str).str.strip().str.lower()
topics["top_terms_list"] = topics["top_terms"].apply(parse_terms)
topics["topic_title"]    = topics["top_terms_list"].apply(lambda xs: ", ".join(xs[:3]))


In [26]:
# Resumen: nº de temas por bando
print("Temas por bando:")
print(topics.groupby("bando")["topic_id"].nunique(), "\n")

# Tabla “bonita” de términos por bando
def ver_topicos(bando:str, n_terms:int=12):
    b = bando.strip().lower()
    t = topics[topics["bando"]==b].sort_values("topic_id")
    if t.empty:
        print(f"(No hay temas para '{bando}')"); return
    print(f"=== {b.upper()} — {t['topic_id'].nunique()} temas ===")
    for _, row in t.iterrows():
        tid = int(row["topic_id"])
        terms = row["top_terms_list"][:n_terms]
        print(f"Topic {tid:02d}: " + ", ".join(terms))

ver_topicos("ruso")
ver_topicos("ucraniano")
ver_topicos("neutro")


Temas por bando:
bando
neutro       12
ruso         12
ucraniano    12
Name: topic_id, dtype: int64 

=== RUSO — 12 temas ===
Topic 00: que, no, se, lo, si, lo que, ya, le, no se, que no, hay, que se
Topic 01: viva, viva rusia, rusia viva, viva la, viva putin, rusia, yemen, viva yemen, viva el, palestina, que viva, russia
Topic 02: gracias, gracias por, por, gracias miguel, información, por la, la información, muchas gracias, muchas, por tu, miguel por, miguel gracias
Topic 03: la, otan, la otan, de la, de, guerra, la guerra, ucrania, en la, en, por la, con la
Topic 04: saludos, desde, saludos desde, saludos miguel, méxico, venezuela, miguel desde, miguel saludos, desde venezuela, desde méxico, argentina, desde argentina
Topic 05: los, son, de, de los, rusos, todos, los rusos, terroristas, todos los, son los, que los, con
Topic 06: el, en, de, del, mundo, en el, el mundo, su, al, con, pueblo, todo
Topic 07: excelente, análisis, excelente análisis, excelente información, siempre, como s