### NLP challenge

In [2]:
# baseline_simple.py
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score
import joblib
import re
import unicodedata

# ---------- 1) Load data ----------
# train.csv must provide the "text" and "decade" columns. Rows missing either
# value are discarded and the index is renumbered from zero.
df = (
    pd.read_csv("train.csv")
    .dropna(subset=["text", "decade"])
    .reset_index(drop=True)
)

# Cast the label to int in case it was read as a string or a float.
# (Per the task statement, 202 -> 2020; the label is kept as-is for the model.)
df['decade'] = df['decade'].astype(int)

# ---------- 2) Simple preprocessing ----------

def clean_text(text: str) -> str:
    """Return ``text`` as an NFKC-normalized, lower-cased string.

    The argument is passed through ``str()`` first, so non-string values are
    converted to their string form instead of raising.
    """
    return unicodedata.normalize("NFKC", str(text)).lower()


# Normalized copy of the raw text; this is the column the vectorizers consume.
df['text_clean'] = df['text'].apply(clean_text)

# ---------- 3) Train / test split (stratified) ----------
X, y = df['text_clean'].values, df['decade'].values

# Hold out 20% of the rows, stratifying on the label; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ---------- 4) Pipeline: word + char TF-IDF features -> LogisticRegression ----------
# Both vectorizers discard n-grams found in more than 90% of the documents
# (max_df=0.9) or in fewer than 2 documents (min_df=2).
word_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),   # word unigrams and bigrams
    min_df=2,
    max_df=0.9,
)
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),   # character 3-, 4- and 5-grams
    min_df=2,
    max_df=0.9,
)

# The two feature spaces are concatenated side by side before classification.
feature_union = FeatureUnion([('word', word_tfidf), ('char', char_tfidf)])
logreg = LogisticRegression(
    solver='saga',
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)
pipeline = Pipeline([('features', feature_union), ('clf', logreg)])

# ---------- 5) Train ----------
pipeline.fit(X_train, y_train)

# ---------- 6) Evaluate on the held-out split ----------
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 macro:", macro_f1)
print("\nReporte por clase:\n")
print(per_class_report)

# ---------- 7) Save the fitted pipeline ----------
joblib.dump(pipeline, "baseline_tfidf_logreg.joblib")
print("Modelo guardado en baseline_tfidf_logreg.joblib")


Accuracy: 0.26349307435121794
F1 macro: 0.2538069741230763

Reporte por clase:

              precision    recall  f1-score   support

         150       0.70      0.72      0.71       157
         151       0.35      0.71      0.47       162
         152       0.53      0.52      0.52       157
         153       0.38      0.58      0.46       155
         154       0.47      0.58      0.52       166
         155       0.34      0.25      0.29       167
         156       0.31      0.39      0.34       158
         157       0.25      0.26      0.26       166
         158       0.22      0.26      0.24       156
         159       0.26      0.39      0.31       160
         160       0.10      0.07      0.08       170
         161       0.16      0.13      0.14       157
         162       0.15      0.15      0.15       162
         163       0.17      0.14      0.15       166
         164       0.23      0.17      0.19       161
         165       0.13      0.12      0.12       163
 

In [1]:
# baseline_simple.py
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import unicodedata

# ---------- 1) Load data ----------
# train.csv must provide the "text" and "decade" columns. Rows missing either
# value are discarded and the index is renumbered from zero.
df = (
    pd.read_csv("train.csv")
    .dropna(subset=["text", "decade"])
    .reset_index(drop=True)
)

# Cast the label to int (in case it was read as a string).
df['decade'] = df['decade'].astype(int)

# ---------- 2) Simple preprocessing ----------
def clean_text(text: str) -> str:
    """Return ``text`` as an NFKC-normalized, lower-cased string.

    The argument is passed through ``str()`` first, so non-string values are
    converted to their string form instead of raising.
    """
    normalized = unicodedata.normalize("NFKC", str(text))
    return normalized.lower()

# Normalized copy of the raw text; this is the column the vectorizers consume.
df['text_clean'] = df['text'].apply(clean_text)

# Feature / label arrays over every row: this cell makes no hold-out split.
X, y = df['text_clean'].values, df['decade'].values

# ---------- 3) Pipeline: word + char TF-IDF features -> LogisticRegression ----------
# Both vectorizers discard n-grams found in more than 90% of the documents
# (max_df=0.9) or in fewer than 2 documents (min_df=2).
word_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),   # word unigrams and bigrams
    min_df=2,
    max_df=0.9,
)
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),   # character 3-, 4- and 5-grams
    min_df=2,
    max_df=0.9,
)

# The two feature spaces are concatenated side by side before classification.
feature_union = FeatureUnion([('word', word_tfidf), ('char', char_tfidf)])
logreg = LogisticRegression(
    solver='saga',
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)
pipeline = Pipeline([('features', feature_union), ('clf', logreg)])

# ---------- 4) Train on ALL rows ----------
pipeline.fit(X, y)

# ---------- 5) Save the fitted pipeline ----------
joblib.dump(pipeline, "baseline_tfidf_logreg_full.joblib")
print("Modelo guardado en baseline_tfidf_logreg_full.joblib")


Modelo guardado en baseline_tfidf_logreg_full.joblib


In [2]:
# predict_eval.py
import pandas as pd
import joblib
import unicodedata

# ---------- 1) Load the trained model ----------
# NOTE(review): joblib artifacts are pickle-based, so only load a file that
# comes from a trusted source (here: the one written by the training cell).
model_path = "baseline_tfidf_logreg_full.joblib"
pipeline = joblib.load(model_path)

# ---------- 2) Load the evaluation data ----------
eval_path = "eval.csv"  # columns: id,text
df_eval = pd.read_csv(eval_path)

# ---------- 3) Preprocessing (same as training for non-missing text) ----------
def clean_text(text: str) -> str:
    """Return ``text`` as an NFKC-normalized, lower-cased string.

    Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty string.
    Unlike the training data, the evaluation frame is not passed through
    ``dropna``, so a blank ``text`` cell reaches this function; stringifying
    it would feed the literal word "nan" to the vectorizers. Every other
    input is handled exactly as in the training cells: it is passed through
    ``str()``, NFKC-normalized and lower-cased.

    Args:
        text: Raw cell value from the ``text`` column.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # Missing cell: return no text at all, so the row is still scored.
    if pd.isna(text):
        return ""
    return unicodedata.normalize("NFKC", str(text)).lower()

# Normalized copy of the raw text; this is the column the pipeline consumes.
df_eval['text_clean'] = df_eval['text'].apply(clean_text)

# ---------- 4) Predict ----------
X_eval = df_eval['text_clean'].values
preds = pipeline.predict(X_eval)

# ---------- 5) Write the submission file ----------
# One row per evaluation id, with the predicted label in the "answer" column.
submission_columns = {"id": df_eval["id"], "answer": preds}
df_out = pd.DataFrame(submission_columns)

df_out.to_csv("submission.csv", index=False)
print("Archivo de predicciones guardado en submission.csv")


Archivo de predicciones guardado en submission.csv
