In [None]:
import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import psycopg2
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Database connection parameters.
# Read from the environment so credentials are not pinned in the notebook;
# the fallbacks are the values this notebook previously hardcoded, so it keeps
# working unchanged when the variables are not set.
POSTGRES_USER = os.getenv("POSTGRES_USER", "mlflow_user")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "mlflow_pass")
POSTGRES_DB = os.getenv("POSTGRES_DB", "mlflowdb")
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "postgres")  # Docker service name
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")

# SQLAlchemy connection URL
db_uri = f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"

engine = create_engine(db_uri)

# Load the full credit_data table into a DataFrame
df = pd.read_sql("SELECT * FROM credit_data", engine)

# Preview the first rows as the cell output
df.head()


In [None]:
# Expand the categorical education_level column into indicator columns
# (drop_first=True omits the first level).
df = pd.get_dummies(df, drop_first=True, columns=["education_level"])

# Target is credit_score; features are every other column except id.
y = df["credit_score"]
X = df.drop(columns=["id", "credit_score"])

# Hold out 30% of the rows, then halve that hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


In [None]:
def eval_model(model, X_val, y_val):
    """Score a fitted model against the given features and targets.

    Args:
        model: Object exposing ``predict(X)``.
        X_val: Features passed to ``model.predict``.
        y_val: True target values compared with the predictions.

    Returns:
        dict with keys ``"mae"``, ``"mse"`` and ``"r2"`` holding the mean
        absolute error, mean squared error and R² score respectively.
    """
    predictions = model.predict(X_val)
    scorers = (
        ("mae", mean_absolute_error),
        ("mse", mean_squared_error),
        ("r2", r2_score),
    )
    return {name: scorer(y_val, predictions) for name, scorer in scorers}

def train_model(model_name, model_class, params_grid):
    """Train one model per parameter set and record each as an MLflow run.

    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook's
    global namespace, so the split cell must have been executed first.

    Args:
        model_name: Label used in the run name and in the printed summary.
        model_class: Estimator class; instantiated as ``model_class(**params)``.
        params_grid: Iterable of dicts of constructor keyword arguments.

    For every parameter set a run is opened, the model is fitted on the
    training split and scored on the validation split via ``eval_model``; the
    parameters, metrics and fitted model are logged, and the model is
    registered under the name "CreditScoreModel".
    """
    for params in params_grid:
        run_label = f"{model_name}_{params}"
        with mlflow.start_run(run_name=run_label) as run:
            estimator = model_class(**params)
            estimator.fit(X_train, y_train)

            scores = eval_model(estimator, X_val, y_val)

            # Record the hyperparameters and the validation metrics
            mlflow.log_params(params)
            mlflow.log_metrics(scores)

            # Store the fitted estimator as an artifact and register it
            mlflow.sklearn.log_model(
                estimator,
                "model",
                registered_model_name="CreditScoreModel",
            )

            print(f"Run {run.info.run_id} - {model_name} - {params} - R2: {scores['r2']:.3f}")


In [None]:
# MLflow configuration.
# Prefer the tracking URI provided through the container environment; fall back
# to the address of the MLflow Docker service that was previously hardcoded.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000"))
mlflow.set_experiment("CreditScorePrediction")

# Hyperparameter grid (at least 20 combinations).
# 5 values of n_estimators x 4 values of max_depth = 20; the earlier 4 x 4 grid
# only produced 16. random_state is fixed (same seed as the data split) so that
# reruns of the same combination give the same forest and metrics.
rf_params = [
    {"n_estimators": n, "max_depth": d, "random_state": 42}
    for n in [10, 50, 100, 150, 200]
    for d in [3, 5, 10, None]
]

# Fail loudly if the grid is ever edited below the required size, instead of
# silently training fewer runs.
assert len(rf_params) >= 20, f"expected at least 20 combinations, got {len(rf_params)}"

train_model("RandomForest", RandomForestRegressor, rf_params)


In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Resolve the experiment by name rather than assuming its ID is "1": IDs are
# assigned by the tracking server and are not guaranteed to be the same across
# deployments.
experiment = client.get_experiment_by_name("CreditScorePrediction")
if experiment is None:
    raise RuntimeError(
        "MLflow experiment 'CreditScorePrediction' was not found; "
        "run the training cell first."
    )

# Runs sorted by validation R2, best first
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.r2 DESC"],
)
if not runs:
    raise RuntimeError(
        "MLflow experiment 'CreditScorePrediction' has no runs; "
        "run the training cell first."
    )

best_run = runs[0]
print(f"Best run ID: {best_run.info.run_id}")
print(best_run.data.metrics)
