In [1]:
# Librerías estándar
import os
import pickle
import numpy as np
import pandas as pd

# Sklearn: Preprocesamiento, modelos y métricas
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

# Gensim para embeddings no contextuales
from gensim.models import Word2Vec, FastText

# Sentence Transformers para embeddings contextuales
from sentence_transformers import SentenceTransformer

# TensorFlow / Keras para Deep Learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Torch si quieres usar modelos basados en PyTorch
import torch





In [None]:
def shallow_pipeline(df, target_col):
    """Train, evaluate and persist TF-IDF based shallow classifiers.

    Steps: build one text string per row, drop classes with a single row,
    integer-encode the labels, make a stratified 80/20 train/validation
    split, fit a TF-IDF vectorizer on the training texts only, then fit and
    score each classifier on the validation split.

    Args:
        df: pandas DataFrame. It must contain ``target_col`` and either a
            ``"text_joined"`` column or a ``"tokens"`` column
            (assumes each ``tokens`` cell is an iterable of strings -- TODO
            confirm against the tokenization step).
        target_col: name of the column holding the class labels.

    Returns:
        dict mapping model name -> ``{"Accuracy": float, "Macro-F1": float}``
        computed on the validation split.

    Raises:
        ValueError: if no class has at least two rows (nothing to split).

    Side effects:
        * Adds a ``"text_joined"`` column to ``df`` in place when it is
          missing (kept so later code that reads the column still works).
        * Writes ``data/models/<model_name>.pkl`` for every model and
          ``data/features/tfidf_vectorizer.pkl`` plus
          ``data/features/label_encoder.pkl``. The file names do not depend
          on ``target_col``, so every call overwrites the previous artifacts.
    """
    import os
    import pickle
    from collections import Counter
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, f1_score

    # One seed for the split and for every stochastic estimator, so repeated
    # runs on the same data report the same metrics.
    seed = 42

    def _dump(obj, path):
        # Context manager guarantees the handle is flushed and closed.
        with open(path, "wb") as fh:
            pickle.dump(obj, fh)

    # Prepare the text: one whitespace-joined string per row.
    if "text_joined" not in df.columns:
        df["text_joined"] = df["tokens"].apply(" ".join)
    texts = df["text_joined"].astype(str).tolist()
    labels = df[target_col].tolist()

    # Drop classes with fewer than 2 rows: a stratified split needs at least
    # two members per class.
    class_counts = Counter(labels)
    kept_classes = {cls for cls, n in class_counts.items() if n > 1}
    kept_rows = [(txt, lbl) for txt, lbl in zip(texts, labels) if lbl in kept_classes]
    if not kept_rows:
        raise ValueError(
            f"No class in column {target_col!r} has at least 2 samples; "
            "cannot build a stratified train/validation split."
        )
    texts, labels = (list(part) for part in zip(*kept_rows))

    # Encode labels as integers.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    # Stratified train/validation split.
    X_train_text, X_val_text, y_train, y_val = train_test_split(
        texts, y, test_size=0.2, random_state=seed, stratify=y
    )

    # TF-IDF vectorization, fitted on the training texts only.
    # NOTE(review): stop_words='english' while this notebook's own messages
    # are in Spanish -- confirm the corpus language.
    vectorizer = TfidfVectorizer(max_features=1500, stop_words='english', ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)

    # Model definitions. XGBoost is currently disabled; it was previously
    # configured as XGBClassifier(n_estimators=75, eval_metric="mlogloss",
    # tree_method="hist", n_jobs=-1).
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
        "LinearSVC": LinearSVC(random_state=seed),
        "Random Forest": RandomForestClassifier(
            n_estimators=150, n_jobs=-1, random_state=seed
        ),
    }

    os.makedirs("data/models", exist_ok=True)
    os.makedirs("data/features", exist_ok=True)

    # Train, evaluate and save every model.
    results = {}
    for name, model in models.items():
        print(f"Entrenando {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        results[name] = {
            "Accuracy": accuracy_score(y_val, y_pred),
            "Macro-F1": f1_score(y_val, y_pred, average="macro")
        }
        _dump(model, f"data/models/{name.replace(' ', '_').lower()}.pkl")

    # Persist the fitted preprocessing objects. The label encoder is needed
    # to map the models' integer predictions back to the original labels.
    _dump(vectorizer, "data/features/tfidf_vectorizer.pkl")
    _dump(label_encoder, "data/features/label_encoder.pkl")

    return results


In [4]:
# Load the DataFrame used by the pipelines below (presumably the tokenized
# training set, judging by the file name -- confirm against the cleaning step).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle("data/data_clean/train_tokenized.pkl")

In [None]:
# Shallow-learning baseline for the "topic" target: one row per model,
# one column per validation metric.
results_topic = shallow_pipeline(df, target_col="topic")
print(pd.DataFrame(results_topic).transpose())


Entrenando Logistic Regression...
Entrenando LinearSVC...
Entrenando Random Forest...
Entrenando XGBoost...
