In [1]:
import pandas as pd
import re
import unicodedata
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import logging
logging.basicConfig(level=logging.INFO)

In [2]:
df = pd.read_csv("../data/processed/2_comments_youtube_refined.csv")

### 1) Base para la clasificación manual

In [None]:
# Muestreo balanceado
N = 9000

df_sampled = (
    df.groupby("condiciones_cuenta", group_keys=False)
    .apply(lambda x: x.sample(min(len(x), N), random_state=42))
    .reset_index(drop=True)
)

# Ordenar por bando y canal
df_sampled = df_sampled.sort_values(by=["condiciones_cuenta", "channel_title"]).reset_index(drop=True)

# Agregar columna para etiquetado manual
df_sampled["label_comentario"] = "" # ruso / neutro / ucraniano
df_sampled["es_sarcastico"] = "" # TRUE / FALSE
df_sampled["fuera_de_contexto"] = ""  # TRUE / FALSE


# Agregar ID único por fila
df_sampled["id_muestra"] = df_sampled.index + 1

# Guardar dataset limpio en la carpeta del proyecto
excel_path = "../data/processed/4_9000_comments_to_label.xlsx"


# Guardar como Excel
df_sampled.to_excel(excel_path, index=False)
print(f"Total de comentarios extraídos: {len(df_sampled)}")

In [None]:
df_ya_clasificado = pd.read_excel(excel_path)

In [5]:
df_ya_clasificado["label_comentario"].value_counts()

label_comentario
ruso         1208
ucraniano     368
neutro        301
Name: count, dtype: int64

In [6]:
# Filtrar solo los ya etiquetados
df_etiquetados = df_ya_clasificado[df_ya_clasificado["label_comentario"].notna() & (df_ya_clasificado["label_comentario"] != "")].copy()

# Limpiar comment_id de ambos datasets
df_etiquetados.loc[:, "comment_id"] = df_etiquetados["comment_id"].astype(str).str.strip()
df.loc[:, "comment_id"] = df["comment_id"].astype(str).str.strip()

# Obtener los restantes
comentarios_existentes = set(df_etiquetados["comment_id"])
df_restante = df[~df["comment_id"].isin(comentarios_existentes)]

print(f"Comentarios ya etiquetados: {len(df_etiquetados)}")
print(f"Comentarios restantes para muestreo: {len(df_restante)}")


Comentarios ya etiquetados: 1877
Comentarios restantes para muestreo: 111706


### 2) Detección de insultos / toxicidad


In [13]:
# 1) Función para normalizar (quitar tildes y pasar a minúsculas)
def normalize_text(text):
    text = text.lower()
    # descomponer caracteres Unicode y quitar marcas
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(c for c in text if not unicodedata.combining(c))
    return text

In [14]:
# 1. Un set amplio de insultos comunes en español
insultos = {
    "idiota","imbécil","estúpido","estupida","tonto","tonta","bobo","boba",
    "gilipollas","gilipollas","gilipollas","gilipollas","burro","burra",
    "subnormal","retrasado","retrasada","tarado","tarada","mongólico",
    "mongolica","cretino","cretina","cabron","cabrón","cabrona","cabronazo",
    "pendejo","pendeja","maricón","marica","maricona","coño","joder","mierda",
    "puta","puto","maldito","maldita","malparido","malparida","zorra","perra",
    "prostituta","mamón","mamona","zopenco","zopenca","zángano","zorra","zoquete",
    "gilipuertas","capullo","capulla","troll","idiota","idiotas","inutil", "mentiroso",
    "gil",
    # contexto de guerra
    "traidor","genocida","asesino","criminal","invasor","nazis", "nazi", "ucranazi", "fascista",
    "marioneta","titere","miserable","cobarde","zombie", "rata", "ratas", "ladron",
    "ladrona", "complice", "fascista", "facho", "payaso", "otanazi", "ukronazi","ukrop"

}


In [15]:
# 3) Función para detectar insultos tras normalizar
def has_insulto(text):
    text_norm = normalize_text(text)
    tokens = re.findall(r"\b\w+\b", text_norm)
    return any(tok in insultos for tok in tokens)


In [16]:
# 4) Función para contar insultos
def count_insultos(text):
    text_norm = normalize_text(text)
    tokens = re.findall(r"\b\w+\b", text_norm)
    return sum(tok in insultos for tok in tokens)

In [17]:
# 5) Aplicarlo al DataFrame
df['insulto']   = df['comment'].astype(str).apply(has_insulto)
df['n_insultos'] = df['comment'].astype(str).apply(count_insultos)

In [18]:
# Conteos absolutos y relativos
counts = df['insulto'].value_counts().sort_index()
props  = df['insulto'].value_counts(normalize=True).sort_index() * 100

print("Counts:\n", counts)
print("\nPercentages:\n", props.round(2))

Counts:
 insulto
False    106766
True       6817
Name: count, dtype: int64

Percentages:
 insulto
False    94.0
True      6.0
Name: proportion, dtype: float64


In [19]:
# Exportar el DataFrame con los insultos detectados y las nuevas columnas para la clasificación de pro-ruso/pro-ucraniano

output_path = "../data/processed/3_comments_youtube_with_insults.csv"
df.to_csv(output_path, index=False)

print(f"Dataset limpio guardado en: {output_path}")
print(f"Comentarios únicos: {len(df)}")

Dataset limpio guardado en: ../data/processed/3_comments_youtube_with_insults.csv
Comentarios únicos: 113583


In [None]:
tabla_insulto = (
    df
    .groupby(['condiciones_cuenta', 'insulto'])
    .size()
    .unstack(fill_value=0)
)
print(tabla_insulto)

In [None]:
tabla_pct_insulto = tabla_insulto.div(tabla_insulto.sum(axis=1), axis=0).round(3)*100
print(tabla_pct_insulto)

### 3) Clasificación semi-automática de la muestra de 9000 registros

In [None]:
# Rutas
xlsx_sample_path = "../data/processed/4_9000_comments_to_label.xlsx"  # ~9.000
csv_insults_path = "../data/processed/3_comments_youtube_with_insults.csv"  # +115 k
xlsx_merged_out = "../data/processed/5_9000_comments_with_insults.xlsx"

In [21]:
# 1) Cargar
df9k = pd.read_excel(xlsx_sample_path)
df_ins = pd.read_csv(csv_insults_path)

# 2) Normalizar tipos/IDs
df9k["comment_id"] = df9k["comment_id"].astype(str).str.strip()
df_ins["comment_id"] = df_ins["comment_id"].astype(str).str.strip()

In [None]:
# 4) Merge LEFT para conservar exactamente los 9.000 registros
cols_keep_from_ins = ["comment_id", "insulto", "n_insultos"]
df9k_merged = df9k.merge(df_ins[cols_keep_from_ins], on="comment_id", how="left")

In [None]:
# 5) Validaciones de los registros
print("Filas muestra original:", len(df9k))
print("Filas tras merge:", len(df9k_merged))
print("Insultos NaN tras merge:", df9k_merged["insulto"].isna().sum())

Filas muestra original: 9000
Filas tras merge: 9000
Insultos NaN tras merge: 1


In [None]:
# 6) Guardamos el merge
df9k_merged.to_excel(xlsx_merged_out, index=False)
print("Merge guardado en:", xlsx_merged_out)

Merge guardado en: ../data/processed/comentarios_9000_con_insultos.xlsx


##### 3.b) Clasificación híbrida (Reglas + ML) usando insulto real

In [None]:
input_path = "../data/processed/5_9000_comments_with_insults.xlsx"
out_path   = "../data/processed/6_9000_comments_hibrid_class.xlsx"

In [None]:
df_hibrido = pd.read_excel(input_path)

In [None]:
# Normalizaciones mínimas
df_hibrido["comment"] = df_hibrido["comment"].astype(str)
df_hibrido["condiciones_cuenta"] = df_hibrido["condiciones_cuenta"].astype(str).str.strip().str.lower()

In [None]:
# Validación de etiquetas humanas
valid_labels = {"ruso","ucraniano","neutro"}
if "label_comentario" in df_hibrido.columns:
    df_hibrido["label_comentario"] = df_hibrido["label_comentario"].astype(str).str.strip().str.lower()



In [None]:
# Conversión de insulto a bool (por si es string por error)
df_hibrido["insulto"] = df_hibrido["insulto"].map({True: True, False: False, "True": True, "False": False}).fillna(False)


  df["insulto"] = df["insulto"].map({True: True, False: False, "True": True, "False": False}).fillna(False)


In [None]:
# Limpieza del texto y creación de contexto de canal
def clean_text(s: str) -> str:
    s = re.sub(r"http\S+", " ", s)
    s = re.sub(r"@[A-Za-z0-9_]+", " ", s)
    s = re.sub(r"[^A-Za-zÀ-ÿ0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

df_hibrido["comment_clean"] = df_hibrido["comment"].apply(clean_text)
df_hibrido["text_with_ctx"] = (("[canal:"+df_hibrido["condiciones_cuenta"]+"] ").fillna("") + df_hibrido["comment_clean"]).str.strip()

In [None]:
# Reglas de alta confianza

praise_channel = [
    "buen canal","buen análisis","buen analisis","me gusta el canal","me encant",
    "excelente video","gran resumen","buen resumen","muy claro","te felicito","gracias por el análisis",
    "buen contenido","siempre claros","gran trabajo"
]
russian_power = [
    "sarmat","misil","misiles","hipersónico","hipersonico","alcance nuclear",
    "no se metan con rusia","no se metan con los rusos","poder ruso","armamento ruso","potencia rusa"
]
peace_humanitarian = [
    "paz","alto el fuego","alto al fuego","basta de guerra","tragedia para ambos",
    "muertes","sufrimiento","fin de la guerra","paren la guerra","que pare la guerra"
]
attack_occident = [
    "otan","cnn","occidente","eeuu","estados unidos","propaganda occidental","mentira occidental",
    "ue propaganda","nato"
]
attack_russia = [
    "putin asesino","dictador","rusia invasora","invasión rusa","invasion rusa",
    "rusos mentirosos","propaganda rusa","kremlin miente","criminal de guerra"
]

In [33]:
def contains_any(s: str, bag) -> bool:
    s = s.lower()
    return any(k in s for k in bag)

def rule_classifier(row):
    text  = row["comment"].lower()
    canal = row["condiciones_cuenta"]  # 'pro-ucraniano' / 'pro-ruso' / 'neutral'
    insult = bool(row["insulto"])

    # Fuera de contexto: respetar marca si existe
    if "fuera_de_contexto" in row and str(row["fuera_de_contexto"]).lower() in {"sí","si","true"}:
        return None, "regla-fuera-contexto"

    # Insulto al presentador -> invierte bando del canal
    if insult and canal in {"pro-ucraniano","pro-ruso"}:
        inv = "ruso" if canal == "pro-ucraniano" else "ucraniano"
        return inv, "regla-insulto-inversion"

    # Elogio del canal -> sigue bando del canal
    if contains_any(text, praise_channel) and canal in {"pro-ucraniano","pro-ruso"}:
        return ("ucraniano" if canal=="pro-ucraniano" else "ruso"), "regla-elogio-canal"

    # Glorificación/validación poder militar ruso -> pro-ruso
    if contains_any(text, russian_power):
        if any(w in text for w in ["potente","poderoso","impresionante","temible","arras","reventar","alcance","amenaza","hongo","nuclear","golpear"]):
            return "ruso", "regla-poder-ruso"

    # Paz / tragedia sin culpas -> neutral
    if contains_any(text, peace_humanitarian) and not insult:
        if not contains_any(text, attack_occident) and not contains_any(text, attack_russia):
            return "neutro", "regla-paz-humanitaria"

    # Ataque a occidente en canal pro-ucraniano -> pro-ruso
    if canal == "pro-ucraniano" and contains_any(text, attack_occident):
        return "ruso", "regla-anti-occidente-en-canal-pro-ucr"

    # Ataque a Rusia/Putin en canal pro-ruso -> pro-ucraniano
    if canal == "pro-ruso" and contains_any(text, attack_russia):
        return "ucraniano", "regla-anti-rusia-en-canal-pro-ruso"

    # Negativo explícito a líderes
    if "putin" in text and any(w in text for w in ["asesino","dictador","criminal","títere","titere"]):
        return "ucraniano", "regla-putin-neg"
    if "zelensky" in text and any(w in text for w in ["títere","titere","payaso","actor","corrupto"]):
        return "ruso", "regla-zelensky-neg"

    # Canal neutral + elogio técnico
    if canal == "neutral" and contains_any(text, praise_channel):
        return "neutro", "regla-elogio-canal-neutral"

    return None, None

In [None]:
# Etiquetas humanas
is_humano = df_hibrido["label_comentario"].isin(valid_labels)

In [None]:
# Aplicar reglas sobre NO-humanos
df_hibrido["label_rule"], df_hibrido["regla_aplicada"] = None, None
mask_unlabeled = ~is_humano
df_hibrido.loc[mask_unlabeled, ["label_rule","regla_aplicada"]] = df_hibrido.loc[mask_unlabeled].apply(
    rule_classifier, axis=1, result_type="expand"
)

In [None]:
# Entrenamiento ML con solo humanos

df_h = df_hibrido[is_humano].copy()
X = df_h["text_with_ctx"]
y = df_h["label_comentario"]

test_size = 0.2 if y.value_counts().min() >= 2 else 0.1
Xtr, Xva, ytr, yva = train_test_split(
    X, y, test_size=test_size, stratify=y if y.value_counts().min() >= 2 else None, random_state=42
)

clf = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=35000,
        ngram_range=(1,2),
        min_df=2,
        sublinear_tf=True
    )),
    ("logreg", LogisticRegression(max_iter=300, C=4.0, class_weight="balanced", n_jobs=-1))
])

clf.fit(Xtr, ytr)

In [None]:
# Evaluación
if len(Xva) > 0:
    yhat = clf.predict(Xva)
    print("=== Evaluación (solo etiquetas humanas) ===")
    print(classification_report(yva, yhat, digits=3))


=== Evaluación (solo etiquetas humanas) ===
              precision    recall  f1-score   support

      neutro      0.446     0.417     0.431        60
        ruso      0.749     0.764     0.757       242
   ucraniano      0.425     0.419     0.422        74

    accuracy                          0.641       376
   macro avg      0.540     0.533     0.536       376
weighted avg      0.637     0.641     0.639       376



In [None]:
# 2) UMBRAL Y MARGEN (ABSTENCIÓN ML)
UMBRAL_PROBA = 0.55      # confianza mínima para aceptar la predicción ML
UMBRAL_MARGEN = 0.15     # diferencia mínima entre top1 y top2

# Función para aplicar umbral y margen
classes = clf.named_steps["logreg"].classes_

In [None]:
def ml_scores(X_series):
    """Devuelve pred_label, proba_max y margen(top1-top2) para cada texto de X_series."""
    P = clf.predict_proba(X_series)
    top1_idx = P.argmax(axis=1)
    top1 = classes[top1_idx]
    top1_p = P[np.arange(P.shape[0]), top1_idx]
    P_sorted = -np.sort(-P, axis=1)     # ordena cada fila desc
    margen = P_sorted[:, 0] - P_sorted[:, 1]
    return top1, top1_p, margen

to_pred_mask = mask_unlabeled & df_hibrido["label_rule"].isna()
if to_pred_mask.sum() > 0:
    y_ml, p_ml, m_ml = ml_scores(df_hibrido.loc[to_pred_mask, "text_with_ctx"])
    df_hibrido.loc[to_pred_mask, "label_ml"] = y_ml
    df_hibrido.loc[to_pred_mask, "ml_proba_max"] = p_ml
    df_hibrido.loc[to_pred_mask, "ml_margen"] = m_ml
else:
    df_hibrido["label_ml"] = None
    df_hibrido["ml_proba_max"] = np.nan
    df_hibrido["ml_margen"] = np.nan

In [None]:
# 3) DECISIÓN FINAL (con abstención)
def decide(row):
    # 1) Humano manda
    lab_h = row["label_comentario"] if row["label_comentario"] in valid_labels else None
    if lab_h:
        return lab_h, "humano"

    # 2) Reglas (alta precisión)
    lab_r = row["label_rule"]
    if isinstance(lab_r, str) and (lab_r in valid_labels):
        return lab_r, "regla"

    # 3) ML con umbral y margen
    lab_m = row.get("label_ml", None)
    pmax  = row.get("ml_proba_max", np.nan)
    margen = row.get("ml_margen", np.nan)
    if (lab_m in valid_labels) and (pmax >= UMBRAL_PROBA) and (margen >= UMBRAL_MARGEN):
        return lab_m, "automatica-ml"

    # 4) Sin evidencia suficiente -> sin clasificar
    return "", "sin-clasificar"

df_hibrido["label_final"], df_hibrido["clasificacion_origen"] = zip(*df_hibrido.apply(decide, axis=1))

In [None]:
# 4) RESUMEN Y GUARDADO

print("\n== Origen de clasificación ==")
print(df_hibrido["clasificacion_origen"].value_counts(dropna=False))
print("\n== Distribución label_final (incluye vacíos) ==")
print(df_hibrido["label_final"].value_counts(dropna=False))


== Origen de clasificación ==
clasificacion_origen
automatica-ml     4277
sin-clasificar    2846
humano            1877
Name: count, dtype: int64

== Distribución label_final (incluye vacíos) ==
label_final
ruso         4314
             2846
ucraniano    1031
neutro        809
Name: count, dtype: int64


------------ Clasificación base terminada --------------

In [None]:
df_hibrido.to_excel(out_path, index=False)
print("Guardado:", out_path)


Guardado: ../data/processed/comentarios_clasificados_9000_hibrido.xlsx
