In [None]:
# setup_repro_and_versions.py
# Reproducibility setup: fixes the random seeds, pins native thread pools to a
# single thread and prints the versions of the libraries in use.
import os, random, platform

SEED = 42

# NOTE: PYTHONHASHSEED is read by the interpreter at start-up only, so this
# assignment cannot change string hashing in the already-running kernel. It is
# still exported so that child processes started from here inherit it. To pin
# hashing for this process, set the variable before launching Python/Jupyter.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Limit BLAS/OpenMP threads (stability and reproducibility).
# These variables are read when the native libraries are loaded, so they must
# be set BEFORE numpy / scipy / sklearn are imported; setting them afterwards
# (as the previous ordering did) has no effect. If the kernel has already
# imported numpy, restart it before running this cell.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

# Numeric stack is imported only after the thread limits are in place.
import numpy as np
import sklearn, pandas as pd, scipy, joblib

random.seed(SEED); np.random.seed(SEED)

print(f"Seeds fijadas (SEED={SEED}).")
print("POLICY: train solo con train.csv; eval solo para inferencia. Sin IDF transductivo ni pseudo-labels.")
print("Python   :", platform.python_version())
print("sklearn  :", sklearn.__version__)
print("numpy    :", np.__version__)
print("scipy    :", scipy.__version__)
print("pandas   :", pd.__version__)
print("joblib   :", joblib.__version__)

Seeds fijadas (SEED=42). Python 3.12.3


In [None]:
# train_final_S1.py
import pandas as pd, numpy as np, unicodedata, re, json, pathlib, joblib
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer, f1_score

SEED = 42

def clean_text(t: str) -> str:
    """Normalize a raw text for vectorization.

    The input is coerced with ``str()``, NFKC-normalized and lower-cased. Then,
    in this order: URLs (``http(s)://...`` or ``www. ...``) are replaced by the
    token `` URL ``, every digit is replaced by ``#`` (digits are still hashed),
    and runs of whitespace are collapsed to a single space. The result is
    stripped of leading/trailing whitespace.
    """
    result = unicodedata.normalize("NFKC", str(t)).lower()
    # Order matters: URLs are masked before their digits could be hashed.
    substitutions = (
        (r"https?://\S+|www\.\S+", " URL "),
        (r"\d", "#"),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# 1) Data: read train.csv, discard rows lacking text or label, then derive the
#    integer label and the cleaned text column.
df = (
    pd.read_csv("train.csv")
    .dropna(subset=["text", "decade"])
    .reset_index(drop=True)
)
df = df.assign(
    decade=lambda frame: frame["decade"].astype(int),
    text_clean=lambda frame: frame["text"].apply(clean_text),
)
X = df["text_clean"].values
y = df["decade"].values

# 2) Features (best S1 setting: char n-grams (2,5) with min_df=4; word settings unchanged)
word_tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    strip_accents="unicode",
    stop_words=None,
    dtype=np.float32,
)
char_tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 5),
    min_df=4,
    max_df=0.9,
    sublinear_tf=True,
    dtype=np.float32,
)

# Word and char TF-IDF blocks are concatenated; each block is scaled by its weight.
features = FeatureUnion(
    transformer_list=[("word", word_tfidf), ("char", char_tfidf)],
    transformer_weights={"word": 0.6, "char": 1.4},
    n_jobs=1,
)

# Linear SVM, one-vs-rest, with class weights balanced by label frequency.
clf = LinearSVC(
    loss="squared_hinge",
    multi_class="ovr",
    C=0.19,
    tol=1e-4,
    max_iter=6000,
    class_weight="balanced",
    random_state=SEED,
)

pipe = Pipeline(steps=[("features", features), ("clf", clf)])

# 3) Short, deterministic cross-validation (5-fold, stratified, seeded shuffle)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
macro_f1_scorer = make_scorer(f1_score, average="macro", zero_division=0)
scoring = {"accuracy": "accuracy", "f1_macro": macro_f1_scorer}
scores = cross_validate(
    pipe,
    X,
    y,
    cv=cv,
    scoring=scoring,
    n_jobs=1,
    return_train_score=False,
)

# Per-fold test scores, reported as mean ± standard deviation.
fold_accuracy = scores["test_accuracy"]
fold_f1_macro = scores["test_f1_macro"]
print("\nCV (5-fold) config S1:")
print(f"Accuracy : {np.mean(fold_accuracy):.6f} ± {np.std(fold_accuracy):.6f}")
print(f"F1_macro: {np.mean(fold_f1_macro):.6f} ± {np.std(fold_f1_macro):.6f}")

# 4) Fit on the full training set and persist the pipeline (xz-compressed)
OUT = "tfidf_svc_word13md2_char25md4_C019_hash_w06_c14.joblib"
pipe.fit(X, y)
joblib.dump(pipe, OUT, compress=("xz", 3))
print(f"\nModelo final guardado en: {OUT}")

# 5) Enriched metadata.
# Every hyperparameter below is read back from the fitted pipeline and the CV
# splitter instead of being retyped as a literal, so model_meta.json cannot
# drift away from the model that was actually trained and saved.
features_fitted = pipe.named_steps["features"]
clf_fitted = pipe.named_steps["clf"]
fitted_vectorizers = dict(features_fitted.transformer_list)
word_vec = fitted_vectorizers["word"]
char_vec = fitted_vectorizers["char"]


def _params_subset(estimator, keys):
    """Return the selected constructor parameters of ``estimator``, in ``keys`` order.

    Tuples (e.g. ``ngram_range``) are converted to lists so the structure
    matches what ends up in the JSON file.
    """
    params = estimator.get_params(deep=False)
    return {
        key: list(params[key]) if isinstance(params[key], tuple) else params[key]
        for key in keys
    }


def _tfidf_meta(vectorizer, keys):
    """Describe a TF-IDF vectorizer: selected parameters plus fitted vocabulary size."""
    entry = _params_subset(vectorizer, keys)
    # An unfitted vectorizer has no vocabulary_; report 0 in that case.
    entry["vocab_size"] = len(getattr(vectorizer, "vocabulary_", {}))
    return entry


_TFIDF_COMMON_KEYS = ("analyzer", "ngram_range", "min_df", "max_df", "sublinear_tf")
_CLF_KEYS = ("C", "class_weight", "max_iter", "random_state", "loss", "multi_class", "tol")

meta = {
  "model_file": OUT,
  "seed": SEED,
  "transformer_weights": dict(features_fitted.transformer_weights or {}),
  "word_tfidf": _tfidf_meta(word_vec, _TFIDF_COMMON_KEYS + ("strip_accents", "stop_words")),
  "char_tfidf": _tfidf_meta(char_vec, _TFIDF_COMMON_KEYS),
  # Width of the combined (word + char) feature matrix.
  "n_features_total": int(features_fitted.transform(X[:1]).shape[1]),
  "train_rows": int(len(df)),
  "class_distribution": {int(k): int(v) for k, v in df["decade"].value_counts().sort_index().items()},
  "clf": {"type": type(clf_fitted).__name__, **_params_subset(clf_fitted, _CLF_KEYS)},
  "cv": {"folds": cv.get_n_splits(), "shuffle": cv.shuffle, "random_state": cv.random_state},
  "cv_results": {
    "accuracy_mean": float(np.mean(scores['test_accuracy'])),
    "accuracy_std" : float(np.std(scores['test_accuracy'])),
    "f1_macro_mean": float(np.mean(scores['test_f1_macro'])),
    "f1_macro_std" : float(np.std(scores['test_f1_macro'])),
  }
}
pathlib.Path("model_meta.json").write_text(json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8")
print("Meta escrita en model_meta.json")



CV (5-fold) config final:
Accuracy : 0.287648 ± 0.003078
F1_macro: 0.276483 ± 0.003569

Modelo final guardado en: tfidf_svc_word13md2_char25md4_C017_hash_w06_c14.joblib
Meta escrita en model_meta.json


In [None]:
# predict_eval_final.py
import pandas as pd, joblib, unicodedata, re

# File name of the serialized pipeline to load for inference.
MODEL_PATH = "tfidf_svc_word13md2_char25md4_C019_hash_w06_c14.joblib"
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files produced by a trusted training run.
pipe = joblib.load(MODEL_PATH)

def clean_text(text: str) -> str:
    """Apply the text normalization used before vectorization.

    Steps, in order: coerce to ``str``, NFKC-normalize, lower-case, replace
    URLs (``http(s)://...`` or ``www. ...``) with the token `` URL ``, replace
    every digit with ``#``, collapse whitespace runs to one space, and strip
    leading/trailing whitespace.
    """
    normalized = unicodedata.normalize("NFKC", str(text))
    lowered = normalized.lower()
    # URLs are masked first so that their digits are not hashed individually.
    without_urls = re.sub(r"https?://\S+|www\.\S+", " URL ", lowered)
    digits_masked = re.sub(r"\d", "#", without_urls)
    single_spaced = re.sub(r"\s+", " ", digits_masked)
    return single_spaced.strip()

# Load the evaluation set. Only rows without an id are discarded: such rows
# cannot be written to the submission file anyway.
df_eval = pd.read_csv("eval.csv").dropna(subset=["id"]).reset_index(drop=True)

# Rows whose text is missing still need an answer in the submission. Dropping
# them would silently produce a file with fewer rows than eval.csv. They are
# predicted from an empty string instead (filled before clean_text, because
# str(NaN) would otherwise turn into the literal text "nan").
missing_text = df_eval["text"].isna()
if missing_text.any():
    print(f"Aviso: {int(missing_text.sum())} filas de eval.csv sin texto; se predicen con texto vacío.")
df_eval["text_clean"] = df_eval["text"].fillna("").apply(clean_text)

# Predict and write one (id, answer) row per evaluation example.
preds = pipe.predict(df_eval["text_clean"].values)
submission = pd.DataFrame({"id": df_eval["id"], "answer": preds.astype(int)})
submission.to_csv("submission.csv", index=False)
print("Archivo de predicciones guardado en submission.csv")

Archivo de predicciones guardado en submission.csv
