In [2]:
# Cell 1: Importaciones
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib
from pathlib import Path


In [3]:
# Cell 2: Cargar dataset
df = sns.load_dataset("penguins").dropna()
df.head()


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [4]:
# Cell 3: Preprocesamiento
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])  # target: 0, 1, 2

X = df.drop(columns=["species"])
X = pd.get_dummies(X, drop_first=True)  # codificar variables categóricas
y = df['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Cell 4: Definir modelos y parámetros
models = {
    "logistic_regression": (
        LogisticRegression(max_iter=1000),
        {"C": [0.1, 1.0, 10.0]}
    ),
    "random_forest": (
        RandomForestClassifier(),
        {"n_estimators": [50, 100]}
    ),
    "svc": (
        SVC(),
        {"C": [0.1, 1.0], "kernel": ["linear", "rbf"]}
    )
}


In [6]:
# Cell 5: Entrenamiento, búsqueda de hiperparámetros y guardado
output_dir = Path("/shared/models")
output_dir.mkdir(parents=True, exist_ok=True)

for name, (model, params) in models.items():
    print(f"\n🔍 Entrenando modelo: {name}")
    
    grid = GridSearchCV(model, param_grid=params, cv=3)
    grid.fit(X_train, y_train)

    print(f"✅ Mejor modelo para {name}: {grid.best_params_}")
    y_pred = grid.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Guardar el modelo
    model_path = output_dir / f"{name}.pkl"
    joblib.dump(grid.best_estimator_, model_path)
    print(f"💾 Guardado en: {model_path}")



🔍 Entrenando modelo: logistic_regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

✅ Mejor modelo para logistic_regression: {'C': 0.1}
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        31
           1       0.93      1.00      0.96        13
           2       1.00      1.00      1.00        23

    accuracy                           0.99        67
   macro avg       0.98      0.99      0.98        67
weighted avg       0.99      0.99      0.99        67

💾 Guardado en: /shared/models/logistic_regression.pkl

🔍 Entrenando modelo: random_forest
✅ Mejor modelo para random_forest: {'n_estimators': 50}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67

💾 Guardado en: /shared/models/random_for