### NLP challenge

In [1]:
# baseline_simple.py
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score
import joblib
import re
import unicodedata
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# ---------- 1) Cargar datos ----------
df = pd.read_csv("train.csv")   # debe tener columnas "text" y "decade"
df = df.dropna(subset=["text", "decade"]).reset_index(drop=True)

# Si decade viene como string o necesita conversión, conviértelo a int:
# (según tu enunciado, 202 -> 2020; aquí mantendremos la etiqueta tal como está para el modelo)
df['decade'] = df['decade'].astype(int)

# ---------- 2) Preprocesamiento simple ----------

def clean_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", str(text))
    # 2. Pasar a minúsculas
    text = text.lower()
    return text


df['text_clean'] = df['text'].apply(clean_text)

# ---------- 3) Train / Test split (estratificado) ----------
X = df['text_clean'].values
y = df['decade'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------- 4) Pipeline (TF-IDF word + char) + LogisticRegression ----------
word_tfidf = TfidfVectorizer(ngram_range=(1,2), analyzer='word', max_df=0.9, min_df=2)
char_tfidf = TfidfVectorizer(ngram_range=(3,5), analyzer='char', max_df=0.9, min_df=2)

# Hiperparámetros a probar
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10]
}

# Valores de C a probar
C_values = [0.01, 0.1, 1, 5, 10]

best_acc = 0
best_model = None
best_C = None

for c in C_values:
    clf = Pipeline([
        ('features', FeatureUnion([
            ('word', word_tfidf),
            ('char', char_tfidf)
        ])),
        ('clf', LinearSVC(class_weight='balanced', C=c, random_state=42, max_iter=5000))
    ])

    # Entrenar en train
    clf.fit(X_train, y_train)
    # Evaluar en test
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"C={c} -> Accuracy: {acc:.4f}, F1_macro: {f1:.4f}")

    # Guardar el mejor
    if acc > best_acc:
        best_acc = acc
        best_model = clf
        best_C = c

print("\nMejor modelo encontrado:")
print(f"C={best_C} -> Accuracy={best_acc:.4f}")

# Reporte detallado
y_pred = best_model.predict(X_test)
print("\nReporte por clase:\n")
print(classification_report(y_test, y_pred))



C=0.01 -> Accuracy: 0.2382, F1_macro: 0.2099
C=0.1 -> Accuracy: 0.2708, F1_macro: 0.2553
C=1 -> Accuracy: 0.2678, F1_macro: 0.2609
C=5 -> Accuracy: 0.2656, F1_macro: 0.2609
C=10 -> Accuracy: 0.2646, F1_macro: 0.2600

Mejor modelo encontrado:
C=0.1 -> Accuracy=0.2708

Reporte por clase:

              precision    recall  f1-score   support

         150       0.53      0.78      0.63       157
         151       0.40      0.76      0.52       162
         152       0.46      0.59      0.52       157
         153       0.41      0.64      0.50       155
         154       0.45      0.61      0.52       166
         155       0.37      0.31      0.34       167
         156       0.27      0.39      0.32       158
         157       0.29      0.27      0.28       166
         158       0.24      0.33      0.28       156
         159       0.24      0.35      0.29       160
         160       0.16      0.09      0.11       170
         161       0.18      0.14      0.16       157
         

In [None]:
# baseline_simple.py
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import unicodedata

# ---------- 1) Cargar datos ----------
df = pd.read_csv("train.csv")   # debe tener columnas "text" y "decade"
df = df.dropna(subset=["text", "decade"]).reset_index(drop=True)

# Convertir decade a int (si viene como string)
df['decade'] = df['decade'].astype(int)

# ---------- 2) Preprocesamiento simple ----------
def clean_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", str(text))
    text = text.lower()
    return text

df['text_clean'] = df['text'].apply(clean_text)

X = df['text_clean'].values
y = df['decade'].values

# ---------- 3) Pipeline (TF-IDF word + char) + LogisticRegression ----------
word_tfidf = TfidfVectorizer(ngram_range=(1,2), analyzer='word', max_df=0.9, min_df=2)
char_tfidf = TfidfVectorizer(ngram_range=(3,5), analyzer='char', max_df=0.9, min_df=2)

pipeline = Pipeline([
    ('features', FeatureUnion([('word', word_tfidf), ('char', char_tfidf)])),
    ('clf', LinearSVC(class_weight='balanced', C=0.1, random_state=42, max_iter=5000))
])

# ---------- 4) Entrenar con TODO ----------
pipeline.fit(X, y)

# ---------- 5) Guardar el modelo entrenado ----------
joblib.dump(pipeline, "baseline_tfidf_logreg_full.joblib")
print("Modelo guardado en baseline_tfidf_logreg_full.joblib")


In [2]:
# predict_eval.py
import pandas as pd
import joblib
import unicodedata

# ---------- 1) Cargar modelo entrenado ----------
pipeline = joblib.load("baseline_tfidf_logreg_full.joblib")

# ---------- 2) Cargar datos de evaluación ----------
df_eval = pd.read_csv("eval.csv")  # columnas: id,text

# ---------- 3) Preprocesamiento (mismo que en entrenamiento) ----------
def clean_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", str(text))
    text = text.lower()
    return text

df_eval['text_clean'] = df_eval['text'].apply(clean_text)

# ---------- 4) Predecir ----------
X_eval = df_eval['text_clean'].values
preds = pipeline.predict(X_eval)

# ---------- 5) Guardar archivo de respuesta ----------
df_out = pd.DataFrame({
    "id": df_eval["id"],
    "answer": preds
})

df_out.to_csv("submission.csv", index=False)
print("Archivo de predicciones guardado en submission.csv")


Archivo de predicciones guardado en submission.csv
