In [10]:
import os
import random

# Seeds and thread limits for reproducibility and stability (especially on Windows).
SEED = 42

# The BLAS / OpenMP thread caps are read when the numeric backend is first
# loaded, so they have to be in the environment BEFORE numpy is imported.
# If numpy was already imported earlier in this kernel session, restart the
# kernel for the caps to take effect.
# NOTE: PYTHONHASHSEED is only honoured at interpreter start-up; setting it
# here affects child processes spawned from this kernel, not the running one.
os.environ.update({
    "PYTHONHASHSEED": str(SEED),
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
})

import numpy as np  # noqa: E402  (deliberately imported after the env vars are set)

# Seed both the stdlib and the legacy global numpy generators.
random.seed(SEED)
np.random.seed(SEED)

print(f"Seeds fijadas (SEED={SEED}).")

Seeds fijadas (SEED=42).


In [None]:
# train_static_svm.py
import os
import pandas as pd
import numpy as np
import unicodedata
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer, f1_score, accuracy_score
import joblib

SEED = 42

# ---------- 1) Load data ----------
# train.csv is expected to provide the columns: text, decade.
# Rows missing either column are dropped, the index is rebuilt, and the
# label column is cast to int.
df = (
    pd.read_csv("train.csv")
    .dropna(subset=["text", "decade"])
    .reset_index(drop=True)
    .astype({"decade": int})
)

# ---------- 2) Consistent preprocessing ----------
def clean_text(t: str) -> str:
    """Return ``t`` normalised to Unicode NFKC and lower-cased.

    The argument is passed through ``str()`` first, so non-string values are
    accepted and converted to their string representation.
    """
    return unicodedata.normalize("NFKC", str(t)).lower()

# Store the cleaned text next to the raw text, then expose the model inputs
# (cleaned text) and targets (decade) as plain arrays.
df["text_clean"] = df["text"].map(clean_text)
X, y = df["text_clean"].values, df["decade"].values

# ---------- 3) Static features (WINNER) ----------
# Settings that both TF-IDF views have in common.
_tfidf_shared = dict(max_df=0.9, sublinear_tf=True, dtype=np.float32)

# Word-level view: unigrams and bigrams, accents and stop words left untouched.
word_tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=1,              # <- updated
    strip_accents=None,
    stop_words=None,
    **_tfidf_shared,
)

# Character-level view: 3- to 5-grams.
char_tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=3,              # <- updated
    **_tfidf_shared,
)

# Concatenate both views; each one is scaled by its transformer weight.
features = FeatureUnion(
    transformer_list=[
        ("word", word_tfidf),
        ("char", char_tfidf),
    ],
    transformer_weights={"word": 0.7, "char": 1.3},   # <- updated
    n_jobs=1,
)

# ---------- 4) Static classifier (WINNER) ----------
_svm_params = {
    "C": 0.12,                    # <- updated
    "class_weight": "balanced",
    "max_iter": 5000,
    "random_state": SEED,
}
clf = LinearSVC(**_svm_params)

# Full model: feature extraction followed by the linear SVM.
pipe = Pipeline(steps=[("features", features), ("clf", clf)])

# ---------- 5) Cross-validation (deterministic, lightweight) ----------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Metrics reported for every fold.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    f1_macro=make_scorer(f1_score, average="macro", zero_division=0),
)

# n_jobs=1: reproducible/stable on Windows.
scores = cross_validate(
    pipe,
    X,
    y,
    cv=cv,
    scoring=scoring,
    n_jobs=1,
    return_train_score=False,
)

# Report mean ± standard deviation across the folds.
acc_folds = scores["test_accuracy"]
f1_folds = scores["test_f1_macro"]

print("\nDesempeño estimado (5-fold CV) del setup estático:")
print(f"Accuracy : {np.mean(acc_folds):.6f} ± {np.std(acc_folds):.6f}")
print(f"F1_macro: {np.mean(f1_folds):.6f} ± {np.std(f1_folds):.6f}")

# ---------- 6) Retrain on ALL the data and save ----------
# The file name is kept as-is because predict_eval.py expects it.
OUTFILE = "baseline_tfidf_logreg_full.joblib"

# Pipeline.fit returns the fitted pipeline itself, so it can be dumped directly.
joblib.dump(pipe.fit(X, y), OUTFILE)
print(f"\nModelo final guardado en: {OUTFILE}")

# ---------- 7) Save metadata (with environment versions) ----------
import json
import pathlib


def _installed_versions(packages):
    """Map each distribution name in ``packages`` to its installed version.

    A package that cannot be found is recorded as ``None`` rather than
    discarding the versions of every other package. An empty dict is returned
    when ``importlib.metadata`` itself is unavailable.
    """
    try:
        import importlib.metadata as md
    except ImportError:
        return {}
    found = {}
    for name in packages:
        try:
            found[name] = md.version(name)
        except md.PackageNotFoundError:
            found[name] = None
    return found


# Hyperparameters are read back from the estimator objects instead of being
# typed in a second time, so the metadata cannot drift from the saved model.
meta = {
    "model_file": OUTFILE,
    "seed": SEED,
    "transformer_weights": dict(features.transformer_weights),
    "word_tfidf": {
        "ngram_range": word_tfidf.ngram_range,
        "min_df": word_tfidf.min_df,
        "max_df": word_tfidf.max_df,
        "sublinear_tf": word_tfidf.sublinear_tf,
        "strip_accents": word_tfidf.strip_accents,
    },
    "char_tfidf": {
        "ngram_range": char_tfidf.ngram_range,
        "min_df": char_tfidf.min_df,
        "max_df": char_tfidf.max_df,
        "sublinear_tf": char_tfidf.sublinear_tf,
    },
    "clf": {
        "type": type(clf).__name__,
        "C": clf.C,
        "class_weight": clf.class_weight,
        "max_iter": clf.max_iter,
        "random_state": clf.random_state,
    },
    "cv": {
        "folds": cv.n_splits,
        "shuffle": cv.shuffle,
        "random_state": cv.random_state,
    },
    "cv_results": {
        "accuracy_mean": float(np.mean(scores['test_accuracy'])),
        "accuracy_std": float(np.std(scores['test_accuracy'])),
        "f1_macro_mean": float(np.mean(scores['test_f1_macro'])),
        "f1_macro_std": float(np.std(scores['test_f1_macro'])),
    },
}

# Capture the actual versions of the environment.
meta["versions"] = _installed_versions(
    ["scikit-learn", "numpy", "scipy", "pandas", "joblib"]
)

# ensure_ascii=False may emit non-ASCII characters, so the file encoding is
# fixed explicitly instead of relying on the platform default.
pathlib.Path("model_meta.json").write_text(
    json.dumps(meta, indent=2, ensure_ascii=False),
    encoding="utf-8",
)
print("Meta guardado en model_meta.json")


Desempeño estimado (5-fold CV) del setup estático:
Accuracy : 0.278827 ± 0.003821
F1_macro: 0.264477 ± 0.003627


In [12]:
import pandas as pd
import joblib
import unicodedata

# 1) Load the trained model
# NOTE(review): joblib files are pickle-based; only load artifacts you trust.
MODEL_PATH = "baseline_tfidf_logreg_full.joblib"
pipeline = joblib.load(MODEL_PATH)

# 2) Load the evaluation data (columns: id, text)
EVAL_PATH = "eval.csv"
df_eval = pd.read_csv(EVAL_PATH)

# 3) Consistent preprocessing
def clean_text(text: str) -> str:
    """Return ``text`` normalised to Unicode NFKC and lower-cased.

    The argument is passed through ``str()`` first, so non-string values are
    accepted and converted to their string representation.
    """
    return unicodedata.normalize("NFKC", str(text)).lower()

df_eval["text_clean"] = df_eval["text"].map(clean_text)

# 4) Predict
X_eval = df_eval["text_clean"].values
preds = pipeline.predict(X_eval)

# 5) Save the submission (the answer column is cast to int)
df_out = pd.DataFrame(
    {"id": df_eval["id"], "answer": preds}
).astype({"answer": int})
df_out.to_csv("submission.csv", index=False)
print("Archivo de predicciones guardado en submission.csv")


Archivo de predicciones guardado en submission.csv
