# TFM — Explicabilidad Avanzada (SHAP + PDP) y Productivización
Autor: **Marcos Sánchez Pozo**

Este notebook:
- Entrena (RandomForest y opcional XGBoost).
- Genera **SHAP** (summary + bar + dependence plots).
- Genera **PDP** (partial dependence de las top features).
- Define **`predict_salary`** y guarda artefactos en `artifacts/`.
- Incluye un **protitpo Gradio** (UI) para probar el modelo.


In [52]:
# ===============================================
# Configuración básica
# ==============================================

!pip install -q pandas numpy scikit-learn shap xgboost joblib gradio matplotlib
import os, json, joblib, warnings
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except Exception:
    HAS_XGB = False
import shap
print("SHAP version:", shap.__version__)
PLOTS_DIR = "TFM_plots"; ARTIFACTS_DIR = "artifacts"
os.makedirs(PLOTS_DIR, exist_ok=True); os.makedirs(ARTIFACTS_DIR, exist_ok=True)

SHAP version: 0.48.0


In [53]:
# ===============================================
# CARGA DEL DATASET 
# ===============================================
DATA_PATH = "DATA/salaries.csv"
df = pd.read_csv(DATA_PATH)

# Detecta columna objetivo
if 'salary_eur' in df.columns: 
    target_col = 'salary_eur'
elif 'salary_in_usd' in df.columns: 
    target_col = 'salary_in_usd'
elif 'salary' in df.columns: 
    target_col = 'salary'
else:
    raise ValueError("No encuentro columna de salario.")

# Limpia columnas irrelevantes
df = df.drop(columns=[c for c in ['salary_currency','Unnamed: 0','index'] if c in df.columns], errors='ignore')

# Quita del dataset toda columna salarial que NO sea el objetivo
leak_cols = {'salary', 'salary_in_usd', 'salary_in_euro', 'salary_eur'}
leak_cols.discard(target_col)  # deja la del target
cols_a_quitar = [c for c in leak_cols if c in df.columns]
if cols_a_quitar:
    print("Quitando columnas que fugarían el target:", cols_a_quitar)
    df = df.drop(columns=cols_a_quitar)

# Construye X, y
y = df[target_col].astype(float)
X = df.drop(columns=[target_col])

# Identifica tipos
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['number','bool']).columns.tolist()
print("Target:", target_col, "| Categóricas:", len(cat_cols), "| Numéricas:", len(num_cols))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Quitando columnas que fugarían el target: ['salary']
Target: salary_in_usd | Categóricas: 6 | Numéricas: 2


In [54]:
# ===============================================
# Preprocesado
# ===============================================
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('num', StandardScaler(), num_cols)
    ], remainder='drop'
)

In [55]:
# ===============================================
# ENTRENAMIENTO Y EVALUACIÓN 
# ===============================================
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Intentar usar HistGradientBoosting (rápido). Si no existe, caer a GradientBoosting.
try:
    from sklearn.ensemble import HistGradientBoostingRegressor
    HAS_HGB = True
except ImportError:
    from sklearn.ensemble import GradientBoostingRegressor
    HAS_HGB = False

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5  # compat sin squared=

# --------- Modo rápido ---------
FAST = True          
MAX_FILAS = 15000

if FAST and len(X_train) > MAX_FILAS:
    X_train_ = X_train.sample(MAX_FILAS, random_state=42)
    y_train_ = y_train.loc[X_train_.index]
else:
    X_train_, y_train_ = X_train, y_train

models, results = {}, []

# 1) RandomForest LIGERO (sin paralelización: n_jobs=1 para evitar error de loky/wmic)
rf = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=150,   # antes 400
        max_depth=12,       # limita profundidad
        n_jobs=1,           # <-- clave: evita loky y el error de wmic
        random_state=42
    ))
])
rf.fit(X_train_, y_train_)
pred_rf = rf.predict(X_test)
results.append({
    "modelo": "RandomForest_fast",
    "MAE": mean_absolute_error(y_test, pred_rf),
    "RMSE": rmse(y_test, pred_rf),
    "R2": r2_score(y_test, pred_rf)
})
models["RandomForest_fast"] = rf

# 2) Boosting rápido (no usa n_jobs)
if HAS_HGB:
    gbr = Pipeline(steps=[
        ('prep', preprocessor),
        ('model', HistGradientBoostingRegressor(
            max_depth=8,
            max_iter=300,
            learning_rate=0.06,
            random_state=42
        ))
    ])
else:
    from sklearn.ensemble import GradientBoostingReg



In [56]:
# Mejor modelo
res_df = pd.DataFrame(results).sort_values("R2", ascending=False)
best_name = res_df.iloc[0]['modelo']; best_model = models[best_name]
print("Mejor modelo:", best_name); res_df

Mejor modelo: RandomForest_fast


Unnamed: 0,modelo,MAE,RMSE,R2
0,RandomForest_fast,47000.190348,64859.54485,0.25827


In [57]:
# ===============================================
# SHAP
# ===============================================

sample_idx = np.random.RandomState(42).choice(len(X_test), size=min(1000, len(X_test)), replace=False)
X_sample = X_test.iloc[sample_idx]
X_sample_prep = best_model.named_steps['prep'].transform(X_sample)
model_step = best_model.named_steps['model']
is_tree_model = hasattr(model_step, "predict") and any(kw in model_step.__class__.__name__.lower() for kw in ["forest","xgb","boost","tree"])
if is_tree_model:
    explainer = shap.TreeExplainer(model_step); shap_values = explainer.shap_values(X_sample_prep)
else:
    bg_idx = np.random.RandomState(42).choice(X_sample_prep.shape[0], size=min(100, X_sample_prep.shape[0]), replace=False)
    explainer = shap.KernelExplainer(model_step.predict, X_sample_prep[bg_idx])
    shap_values = explainer.shap_values(X_sample_prep[:200])
cat_names = best_model.named_steps['prep'].named_transformers_['cat'].get_feature_names_out(cat_cols).tolist() if len(cat_cols)>0 else []
feat_names = cat_names + num_cols
print("Nº features:", len(feat_names))

Nº features: 402


In [58]:
# ===============================================
# SHAP summary + bar 
# ===============================================
plt.figure(figsize=(10,6)); shap.summary_plot(shap_values, X_sample_prep, feature_names=feat_names, show=False)
plt.tight_layout(); plt.savefig(os.path.join(PLOTS_DIR, "shap_summary.png"), dpi=150, bbox_inches='tight'); plt.close()
plt.figure(figsize=(10,6)); shap.summary_plot(shap_values, X_sample_prep, feature_names=feat_names, plot_type="bar", show=False)
plt.tight_layout(); plt.savefig(os.path.join(PLOTS_DIR, "shap_bar.png"), dpi=150, bbox_inches='tight'); plt.close()
print("Guardados: shap_summary.png, shap_bar.png")

Guardados: shap_summary.png, shap_bar.png


In [59]:
# ===============================================
# SHAP dependence (top 3) 
# ===============================================
importances = np.mean(np.abs(shap_values), axis=0)
top_idx = np.argsort(importances)[::-1][:3]
top_feats = [feat_names[i] for i in top_idx]
for f in top_feats:
    plt.figure(figsize=(8,5)); shap.dependence_plot(f, shap_values, X_sample_prep, feature_names=feat_names, show=False)
    plt.tight_layout(); out = os.path.join(PLOTS_DIR, f"shap_dependence_{str(f).replace('/', '_').replace(' ', '_')}.png")
    plt.savefig(out, dpi=150, bbox_inches='tight'); plt.close(); print("Guardado:", out)

Guardado: TFM_plots\shap_dependence_employee_residence_US.png
Guardado: TFM_plots\shap_dependence_experience_level_MI.png
Guardado: TFM_plots\shap_dependence_experience_level_EN.png


<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

<Figure size 800x500 with 0 Axes>

In [60]:
# ===============================================
# PDP de las top 3 (versión rápida) 
# ===============================================


import numpy as np, os, matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

# 1) Usa solo el modelo interno y pretransforma una vez
prep = best_model.named_steps['prep']
model = best_model.named_steps['model']

X_train_prep = prep.transform(X_train)

# 2) Mapea nombres -> índices en el espacio transformado
feat_to_idx = {name: i for i, name in enumerate(feat_names)}
idxs = [feat_to_idx[f] for f in top_feats if f in feat_to_idx]

if len(idxs) == 0:
    raise ValueError("Ninguna de las top_feats aparece en feat_names. Recalcula top_feats tras SHAP.")

# 3) Submuestrea filas para acelerar
rng = np.random.RandomState(42)
sub = min(3000, X_train_prep.shape[0])  # ajusta a 1000 si aún va lento
sub_idx = rng.choice(X_train_prep.shape[0], size=sub, replace=False)
X_sub = X_train_prep[sub_idx]

# 4) PDP con menos puntos de rejilla
fig, axes = plt.subplots(nrows=len(idxs), ncols=1, figsize=(9, 4*len(idxs)))
if len(idxs) == 1:
    axes = [axes]

for ax, f_idx in zip(axes, idxs):
    PartialDependenceDisplay.from_estimator(
        model,
        X_sub,
        [f_idx],               # índice de la feature transformada
        ax=ax,
        kind='average',
        grid_resolution=20     # (por defecto suele ser 100). Baja = más rápido
        # No pasamos n_jobs para evitar problemas de loky/wmic en Windows
    )
    ax.set_title(f"PDP: {feat_names[f_idx]}")

plt.tight_layout()
out = os.path.join(PLOTS_DIR, "pdp_top3_fast.png")
plt.savefig(out, dpi=150, bbox_inches='tight')
plt.close()
print("Guardado:", out)


Guardado: TFM_plots\pdp_top3_fast.png


In [61]:
# ===============================================
# Función de predicción y guardado de artefactos 
# ===============================================

# Normalización mínima para que coincida con lo que vio el modelo
_EMPLOYMENT_MAP = {
    "FULL-TIME": "FT", "FULL TIME": "FT",
    "PART-TIME": "PT", "PART TIME": "PT",
    "CONTRACT": "CT", "FREELANCE": "FL"
}

def _norm_row(row: dict, expected_features):
    r = {k: row.get(k, np.nan) for k in expected_features}
    # mayúsculas en categóricas principales
    for c in ["experience_level", "employment_type", "employee_residence",
              "company_location", "company_size"]:
        if r.get(c) is not None:
            r[c] = str(r[c]).strip().upper()
    # mapear empleo legible -> código del dataset
    if r.get("employment_type") in _EMPLOYMENT_MAP:
        r["employment_type"] = _EMPLOYMENT_MAP[r["employment_type"]]
    # asegurar remote_ratio numérico en [0, 100]
    if "remote_ratio" in r and r["remote_ratio"] is not None:
        try:
            r["remote_ratio"] = float(r["remote_ratio"])
        except Exception:
            r["remote_ratio"] = 0.0
        r["remote_ratio"] = max(0.0, min(100.0, r["remote_ratio"]))
    return r

EXPECTED_FEATURES = X.columns.tolist()
def predict_salary(input_dict, pipeline=best_model, expected_features=EXPECTED_FEATURES):
    row = {k: input_dict.get(k, np.nan) for k in expected_features}
    row = _norm_row(row, expected_features)   # <<< AÑADIDO
    df_in = pd.DataFrame([row], columns=expected_features)
    return float(pipeline.predict(df_in)[0])

# Demo
example = {}
for c in EXPECTED_FEATURES:
    if str(X_train[c].dtype) == 'object' and X_train[c].dropna().shape[0] > 0:
        example[c] = X_train[c].dropna().astype(str).mode().iloc[0]
    else:
        example[c] = float(np.nanmedian(X_train[c])) if c in X_train.columns else 0.0
print("Predicción ejemplo:", predict_salary(example))
joblib.dump(best_model, os.path.join(ARTIFACTS_DIR, "salary_pipeline.joblib"))
with open(os.path.join(ARTIFACTS_DIR, "meta.json"), "w") as f:
    json.dump({"best_model": best_name, "features": EXPECTED_FEATURES, "target": target_col}, f, indent=2)
print("Artefactos guardados en 'artifacts/'")

Predicción ejemplo: 174530.9471381858
Artefactos guardados en 'artifacts/'


In [82]:
# ===============================================
# DEMO GRADIO 
# ===============================================

import gradio as gr, numpy as np, pandas as pd, sys, subprocess, json
from pathlib import Path

# -------------------------------------------------
# 0) pycountry lista completa de países
# -------------------------------------------------
def _get_pycountry():
    try:
        import pycountry
        return pycountry
    except Exception:
        try:
            subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pycountry"], check=False)
            import pycountry
            return pycountry
        except Exception:
            return None

pycountry = _get_pycountry()

# -------------------------------------------------
# 1) Etiquetas amigables y mapeos
# -------------------------------------------------
EMPLOYMENT_TO_LABEL = {"FT": "Full-time", "PT": "Part-time", "CT": "Contract", "FL": "Freelance"}
LABEL_TO_EMPLOYMENT = {v: k for k, v in EMPLOYMENT_TO_LABEL.items()}

SIZE_TO_LABEL  = {"S": "Small", "M": "Medium", "L": "Large"}
LABEL_TO_SIZE  = {v: k for k, v in SIZE_TO_LABEL.items()}

# Experiencia: etiquetas legibles <-> códigos del dataset
EXP_CODE_TO_LABEL = {"EN": "Entry (EN)", "MI": "Mid (MI)", "SE": "Senior (SE)", "EX": "Executive (EX)"}
EXP_LABEL_TO_CODE = {v: k for k, v in EXP_CODE_TO_LABEL.items()}

# Fallback de países si no hay pycountry 
COUNTRY_FALLBACK = {
 "US":"United States","GB":"United Kingdom","ES":"Spain","FR":"France","DE":"Germany",
 "IT":"Italy","PT":"Portugal","NL":"Netherlands","BE":"Belgium","IE":"Ireland",
 "CA":"Canada","AU":"Australia","NZ":"New Zealand","MX":"Mexico","BR":"Brazil",
 "AR":"Argentina","CL":"Chile","CO":"Colombia","PE":"Peru","PH":"Philippines",
 "PL":"Poland","CZ":"Czechia","SE":"Sweden","NO":"Norway","FI":"Finland",
 "DK":"Denmark","CH":"Switzerland","AT":"Austria","RO":"Romania","HU":"Hungary",
 "GR":"Greece","TR":"Turkey","IL":"Israel","AE":"United Arab Emirates","SA":"Saudi Arabia",
 "IN":"India","SG":"Singapore","JP":"Japan","KR":"South Korea","CN":"China","ZA":"South Africa",
 "RU":"Russia","UA":"Ukraine","XK":"Kosovo"
}

def all_countries():
    pairs = []
    if pycountry:
        for c in pycountry.countries:
            try:
                pairs.append((c.alpha_2.upper(), c.name))
            except Exception:
                pass
        if not any(code == "XK" for code, _ in pairs):
            pairs.append(("XK", "Kosovo"))
    else:
        pairs = list(COUNTRY_FALLBACK.items())
        for col in ("employee_residence", "company_location"):
            if col in X_train.columns:
                for code in map(str, X_train[col].dropna().unique()):
                    code = code.upper().strip()
                    if not any(code == c for c, _ in pairs):
                        pairs.append((code, code))
    return sorted(pairs, key=lambda t: t[1])

_country_pairs = all_countries()
COUNTRY_LABELS = [name for _, name in _country_pairs]
COUNTRY_L2C = {name: code for code, name in _country_pairs}

# Quitar 'salary' de la interfaz
if "salary" in EXPECTED_FEATURES:
    EXPECTED_FEATURES = [c for c in EXPECTED_FEATURES if c != "salary"]

# -------------------------------------------------
# 2) Helper formato €
# -------------------------------------------------
EUR_SYMBOL = "€"
def format_eur(x):
    try:
        x = float(x)
        s = f"{x:,.2f}".replace(",", "X").replace(".", ",").replace("X", ".")
        return f"{s} {EUR_SYMBOL}"
    except Exception:
        return "—"

# -------------------------------------------------
# 3) ¿El modelo espera remote_cat o remote_ratio?
# -------------------------------------------------
def _modelo_espera_remote_cat():
    try:
        meta_path = Path("artifacts") / "preproc_meta.json"
        if meta_path.exists():
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
            cats = set(meta.get("categorical_columns", []))
            return "remote_cat" in cats
    except Exception:
        pass
    # Si no hay meta: miramos las features
    return "remote_cat" in EXPECTED_FEATURES

USES_REMOTE_CAT = _modelo_espera_remote_cat()

# -------------------------------------------------
# 4) Función de inferencia (kwargs) + mapeos robustos
# -------------------------------------------------
def _predict_interface(**kwargs):
    # 1) Recoge solo lo que el modelo espera
    data = {k: kwargs.get(k) for k in EXPECTED_FEATURES}

    # 2) Mapeos legibles -> códigos del modelo
    if "employment_type" in data and data["employment_type"]:
        data["employment_type"] = LABEL_TO_EMPLOYMENT.get(str(data["employment_type"]).strip(), data["employment_type"])

    if "company_size" in data and data["company_size"]:
        data["company_size"] = LABEL_TO_SIZE.get(str(data["company_size"]).strip(), data["company_size"])

    if "employee_residence" in data and data["employee_residence"]:
        data["employee_residence"] = COUNTRY_L2C.get(str(data["employee_residence"]).strip(), data["employee_residence"])

    if "company_location" in data and data["company_location"]:
        data["company_location"] = COUNTRY_L2C.get(str(data["company_location"]).strip(), data["company_location"])

    # --- NUEVO: asegurar que job_title coincide con el training (evita que no cambie la predicción) ---
    if "job_title" in data and data["job_title"]:
        jt = str(data["job_title"]).strip()
        if "job_title" in X_train.columns:
            choices = X_train["job_title"].dropna().astype(str).unique().tolist()
            if jt not in choices:
                by_cf = {c.casefold(): c for c in choices}
                data["job_title"] = by_cf.get(jt.casefold(), choices[0])

    if "experience_level" in data and data["experience_level"]:
        val = str(data["experience_level"]).strip()
        # etiqueta legible -> código (EN/MI/SE/EX)
        data["experience_level"] = EXP_LABEL_TO_CODE.get(val, val.upper())

    # 3) Remote: crear/validar según lo que use el modelo
    if USES_REMOTE_CAT:
        # crear remote_cat a partir de remote_ratio si
        if "remote_cat" not in data:
            rr = kwargs.get("remote_ratio", None)
            try:
                r = float(rr)
            except Exception:
                r = None
            if r is None:
                rc = "Híbrido (1–99%)"
            elif r == 0:
                rc = "Presencial (0%)"
            elif r == 100:
                rc = "Remoto (100%)"
            else:
                rc = "Híbrido (1–99%)"
            data["remote_cat"] = rc
        # si el modelo no espera remote_ratio, lo quitamos del payload
        if "remote_ratio" in data and "remote_ratio" not in EXPECTED_FEATURES:
            data.pop("remote_ratio", None)
    else:
        # el modelo espera remote_ratio numérico en [0,100]
        if "remote_ratio" in data:
            try:
                r = float(data["remote_ratio"])
                data["remote_ratio"] = min(100, max(0, r))
            except Exception:
                data["remote_ratio"] = 0
        # quitamos remote_cat si no se usa
        data.pop("remote_cat", None)

    # 4) Predicción con tu pipeline persistido
    out = predict_salary(data)

    # 5) Extraer número y formatear a €
    try:
        val = None
        if hasattr(out, "columns"):
            for col in ["pred_salary_eur", "pred_salary", "pred_salary_usd"]:
                if col in out.columns:
                    val = float(out[col].iloc[0])
                    if col == "pred_salary_usd":
                        val *= 0.92  # conversión simple si solo devuelves USD
                    break
        if val is None:
            val = float(out)
        return format_eur(val)
    except Exception as e:
        print("Error formateando predicción:", e)
        return "—"


# -------------------------------------------------
# 5) Construcción de inputs
# -------------------------------------------------
def _default_or_none(seq, prefer=None):
    if prefer and prefer in seq: return prefer
    return seq[0] if seq else None

inputs = []
for c in EXPECTED_FEATURES:
    label = c.replace("_", " ").title()

    if c == "employment_type":
        labels = list(EMPLOYMENT_TO_LABEL.values())
        inputs.append(gr.Dropdown(choices=labels, label=label,
                                  value=_default_or_none(labels, "Full-time"),
                                  allow_custom_value=False))

    elif c == "experience_level":
        exp_labels = [EXP_CODE_TO_LABEL.get("EN","EN"), EXP_CODE_TO_LABEL.get("MI","MI"),
                      EXP_CODE_TO_LABEL.get("SE","SE"), EXP_CODE_TO_LABEL.get("EX","EX")]
        inputs.append(gr.Dropdown(choices=exp_labels, label="Experience Level",
                                  value=_default_or_none(exp_labels, "Mid (MI)"),
                                  allow_custom_value=False))

    elif c in ("employee_residence", "company_location"):
        inputs.append(gr.Dropdown(choices=COUNTRY_LABELS, label=label,
                                  value=_default_or_none(COUNTRY_LABELS, "Spain"),
                                  allow_custom_value=False))

    elif c == "company_size":
        labels = list(SIZE_TO_LABEL.values())
        inputs.append(gr.Dropdown(choices=labels, label=label,
                                  value=_default_or_none(labels, "Medium"),
                                  allow_custom_value=False))

    elif c == "remote_cat":
        rc_labels = ["Presencial (0%)", "Híbrido (1–99%)", "Remoto (100%)"]
        inputs.append(gr.Dropdown(choices=rc_labels, label="Remote Mode",
                                  value=_default_or_none(rc_labels, "Remoto (100%)"),
                                  allow_custom_value=False))

    elif c == "remote_ratio":
        
        default_val = 100
        inputs.append(gr.Number(label=label, value=default_val))

    elif str(X_train.get(c, pd.Series(dtype=object)).dtype) == "object":
        vals = sorted(list(map(str, X_train[c].dropna().unique())))[:100] if c in X_train.columns else []
        if not vals: vals = ["N/A"]
        inputs.append(gr.Dropdown(choices=vals, label=label, value=vals[0]))

    else:
        default_val = float(np.nanmedian(X_train[c])) if c in X_train.columns else 0.0
        inputs.append(gr.Number(label=label, value=default_val))

# -------------------------------------------------
# 6) Interface: mapear posicionales 
# -------------------------------------------------
# --- construir inputs EXACTAMENTE en el orden de EXPECTED_FEATURES ---
def _make_component_for(feat):
    import gradio as gr
    # Ajusta componentes por nombre de feature (usa tus mapeos existentes)
    if feat == "employment_type":
        return gr.Dropdown(choices=list(EMPLOYMENT_TO_LABEL.values()),
                           value=list(EMPLOYMENT_TO_LABEL.values())[0],
                           label="employment_type")
    if feat == "company_size":
        return gr.Dropdown(choices=list(SIZE_TO_LABEL.values()),
                           value=list(SIZE_TO_LABEL.values())[0],
                           label="company_size")
    if feat in ("employee_residence", "company_location"):
        # COUNTRY_LABELS ya lo tienes creado con pycountry/fallback
        default_lbl = COUNTRY_LABELS[0] if COUNTRY_LABELS else "Spain"
        return gr.Dropdown(choices=COUNTRY_LABELS, value=default_lbl, label=feat)
    if feat == "experience_level":
        return gr.Dropdown(choices=list(EXP_CODE_TO_LABEL.values()),
                           value=list(EXP_CODE_TO_LABEL.values())[1],
                           label="experience_level")
    if feat == "remote_ratio":
        return gr.Slider(minimum=0, maximum=100, step=1, value=100, label="remote_ratio")
    # Heurística: categórica vs numérica
    if feat in X_train.columns and X_train[feat].dtype == "object":
        ch = sorted(X_train[feat].dropna().astype(str).unique().tolist())
        val = ch[0] if ch else ""
        return gr.Dropdown(choices=ch[:2000], value=val, allow_custom_value=True, label=feat)
    elif feat in X_train.columns:
        return gr.Number(value=float(np.nanmedian(pd.to_numeric(X_train[feat], errors="coerce"))),
                         label=feat)
    else:
        return gr.Textbox(value="", label=feat)

inputs = [_make_component_for(f) for f in EXPECTED_FEATURES]

# --- wrapper que mapea valores 
def _call(*vals):
    data = dict(zip(EXPECTED_FEATURES, vals))
    return _predict_interface(**data)

# CSS: más ancho y menos espacio vertical
CSS = """
.gradio-container { max-width: 1600px !important; margin: 0 auto !important; }
.gradio-container .gr-row { gap: 10px !important; }
.gradio-container .gr-column { gap: 10px !important; }
.gr-textbox input, .gr-number input { height: 48px; font-size: 1.05rem; }
"""

with gr.Blocks(title="Estimador de Salario (TFM)", css=CSS) as demo:
    gr.Markdown("## Estimador de Salario (TFM)\nIntroduce el perfil; internamente se mapean etiquetas a los códigos del modelo.")

    with gr.Row():
        # ---------- Inputs en 3 columnas ----------
        with gr.Column(scale=3):
            inputs_grid = []
            with gr.Row():
                col1 = gr.Column()
                col2 = gr.Column()
                col3 = gr.Column()
            cols = [col1, col2, col3]
            for i, feat in enumerate(EXPECTED_FEATURES):
                with cols[i % 3]:
                    inputs_grid.append(_make_component_for(feat))

        # ---------- Salida a la derecha ----------
        with gr.Column(scale=2):
            out_box = gr.Textbox(label="Predicción salario (€)")
            btn = gr.Button("Predecir", variant="primary")
            btn.click(
                lambda *vals: _predict_interface(**dict(zip(EXPECTED_FEATURES, vals))),
                inputs=inputs_grid,
                outputs=out_box
            )


demo.launch()



* Running on local URL:  http://127.0.0.1:7871
* To create a public link, set `share=True` in `launch()`.




In [67]:
# Exportar EL .ipynb a HTML legible (sin código)
from pathlib import Path
import nbformat
from nbconvert import HTMLExporter

NB_NAME = "TFM_Demo.ipynb"   
nb_path = Path(NB_NAME).resolve()
assert nb_path.exists(), f"No existe: {nb_path}"

nb = nbformat.read(nb_path, as_version=4)

exp = HTMLExporter()
exp.exclude_input = True
exp.exclude_output_prompt = True
exp.exclude_input_prompt = True
exp.embed_images = True

body, _ = exp.from_notebook_node(nb)

out_dir = Path("exports"); out_dir.mkdir(parents=True, exist_ok=True)
out_file = out_dir / f"{nb_path.stem}_export.html"
out_file.write_text(body, encoding="utf-8")
print("OK ->", out_file.resolve())


OK -> C:\Users\x12ms\TFM\exports\TFM_Demo_export.html
