In [1]:
import pandas as pd 
import os
os.chdir("../")

In [2]:
df = pd.read_csv(r"data\tech_salaries_filtered_no_others.csv")

In [3]:
df.cantidad_de_personas_en_tu_organizacion.unique()

array(['De 201 a 500 personas', 'De 1001 a 2000 personas',
       'De 2 a 10 personas', 'Más de 10000 personas',
       'De 51 a 100 personas', 'De 2001a 5000 personas',
       'De 11  a 50  personas', 'De 101 a 200 personas',
       'De 5001 a 10000 personas', 'De 501 a 1000 personas',
       '1 (solamente yo)'], dtype=object)

In [4]:
df['contrato'] = df['contrato'].replace({
    'Tercerizado (trabajo a través de consultora o agencia)': 'Contractor',
    'Freelance': 'Contractor',
    "Participación societaria en una cooperativa": 'Contractor',
})

In [5]:
df["seniority"].unique()

array(['Semi-Senior', 'Senior', 'Junior', 'Manager or Above'],
      dtype=object)

In [6]:
df.head().to_dict()

{'dedicacion': {0: 'Full-Time',
  1: 'Part-Time',
  2: 'Full-Time',
  3: 'Full-Time',
  4: 'Full-Time'},
 'contrato': {0: 'Staff (planta permanente)',
  1: 'Staff (planta permanente)',
  2: 'Staff (planta permanente)',
  3: 'Contractor',
  4: 'Staff (planta permanente)'},
 'salario': {0: 3952805.0,
  1: 1606000.0,
  2: 4000000.0,
  3: 3000000.0,
  4: 3953142.06},
 'anos_de_experiencia': {0: 3, 1: 5, 2: 25, 3: 18, 4: 15},
 'antiguedad_en_la_empresa_actual': {0: 3, 1: 2, 2: 3, 3: 8, 4: 5},
 'anos_en_el_puesto_actual': {0: 1, 1: 2, 2: 3, 3: 6, 4: 4},
 'cuantas_personas_tenes_a_cargo': {0: 2, 1: 0, 2: 5, 3: 0, 4: 3},
 'cantidad_de_personas_en_tu_organizacion': {0: 'De 201 a 500 personas',
  1: 'De 1001 a 2000 personas',
  2: 'De 2 a 10 personas',
  3: 'De 2 a 10 personas',
  4: 'De 201 a 500 personas'},
 'modalidad_de_trabajo': {0: '100% remoto',
  1: '100% remoto',
  2: '100% remoto',
  3: '100% remoto',
  4: '100% remoto'},
 'edad': {0: 29, 1: 25, 2: 50, 3: 41, 4: 47},
 'seniority': {0: 

In [93]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
import numpy as np
from typing_extensions import Self
from typing import Any
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, MultiTaskElasticNetCV, LassoLars, BayesianRidge, ARDRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score  #

In [94]:
Array = pd.DataFrame | pd.Series | np.ndarray

In [95]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list[str]):
        self.columns = columns
        
    def fit(self, X: pd.DataFrame, y: Array | None = None) -> Self:
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        return X[self.columns].values


In [96]:
class SeniorityTransformer(BaseEstimator, TransformerMixin):
    seniority_mapping: dict[str, int] = {
        'Junior': 1, 
        'Semi-Senior': 2, 
        'Senior': 3, 
        'Manager or Above': 4
    }
    
    def fit(self, X: Array, y: Array | None = None) -> Self:
        return self
    
    def transform(self, X: Array) -> np.ndarray:
        if isinstance(X, pd.DataFrame):
            values = X.iloc[:, 0].values
        else:
            values = X.flatten()
        
        result = np.array([self.seniority_mapping.get(x, 0) for x in values])
        return result.reshape(-1, 1)

In [97]:
class OrganizationSizeTransformer(BaseEstimator, TransformerMixin):
    size_mapping = {
        '1 (solamente yo)': 1,
        'De 2 a 10 personas': 6,  # promedio de 2 y 10
        'De 11 a 50 personas': 30,
        'De 51 a 100 personas': 75,
        'De 101 a 200 personas': 150,
        'De 201 a 500 personas': 350,
        'De 501 a 1000 personas': 750,
        'De 1001 a 2000 personas': 1500,
        'De 2001a 5000 personas': 3500,
        'De 5001 a 10000 personas': 7500,
        'Más de 10000 personas': 15000
    }
    
    def fit(self, X: Array, y: Array | None = None) -> Self:
        return self
    
    def transform(self, X: Array) -> np.ndarray:
        if isinstance(X, pd.DataFrame):
            values = X.iloc[:, 0].values
        else:
            values = X.flatten()
        
        result = np.array([self.size_mapping.get(x, 0) for x in values])
        return result.reshape(-1, 1) 

In [98]:
df.columns

Index(['dedicacion', 'contrato', 'salario', 'anos_de_experiencia',
       'antiguedad_en_la_empresa_actual', 'anos_en_el_puesto_actual',
       'cuantas_personas_tenes_a_cargo',
       'cantidad_de_personas_en_tu_organizacion', 'modalidad_de_trabajo',
       'edad', 'seniority', 'marvin_rol'],
      dtype='object')

In [99]:
class SalaryPredictionPipeline:
    """Pipeline completa para la predicción de salarios."""
    
    def __init__(self, model: Any | None = None):
        self.model = model
        self.pipeline: Pipeline | None = None
        self.categorical_columns: list[str] = [
            'dedicacion', 'contrato', 'cantidad_de_personas_en_tu_organizacion',
            'modalidad_de_trabajo', 'seniority', 'marvin_rol'
        ]
        self.numerical_columns: list[str] = [
            'anos_de_experiencia', 'antiguedad_en_la_empresa_actual',
            'anos_en_el_puesto_actual', 'cuantas_personas_tenes_a_cargo', 'edad'
        ]
        self.target_column: str = 'salario'
        
    def build_pipeline(self):
        # Transformadores modificados para garantizar consistencia
        numerical_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.numerical_columns)),
            ('scaler', StandardScaler())
        ])
        
        categorical_basic_cols = ['dedicacion', 'contrato', 'modalidad_de_trabajo']
        categorical_basic_pipeline = Pipeline([
            ('selector', DataFrameSelector(categorical_basic_cols)),
            ('encoder', OneHotEncoder(sparse_output=False))
        ])
        
        seniority_pipeline = Pipeline([
            ('selector', DataFrameSelector(['seniority'])),
            ('transformer', SeniorityTransformer())
        ])
        
        org_size_pipeline = Pipeline([
            ('selector', DataFrameSelector(['cantidad_de_personas_en_tu_organizacion'])),
            ('transformer', OrganizationSizeTransformer())
        ])
        
        preprocessor = ColumnTransformer(
            transformers=[
                ('numerical', numerical_pipeline, self.numerical_columns),
                ('categorical_basic', categorical_basic_pipeline, categorical_basic_cols),
                ('seniority', seniority_pipeline, ['seniority']),
                ('org_size', org_size_pipeline, ['cantidad_de_personas_en_tu_organizacion'])
            ],
            remainder='drop'
        )
        
        if self.model:
            self.pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('model', self.model)
            ])
        else:
            self.pipeline = Pipeline([
                ('preprocessor', preprocessor)
            ])
            
        return self.pipeline
    
    def fit(self, X: pd.DataFrame, y: Array) -> Self:
        if self.pipeline is None:
            self.build_pipeline()
        
        if X.isna().any().any():
            print(f"Advertencia: El dataset tiene valores NaN. La pipeline intentará manejarlos.")
            
        self.pipeline.fit(X, y)
        return self
    
    def transform(self, X: pd.DataFrame) -> np.ndarray:
        if self.pipeline is None:
            raise ValueError("La pipeline no ha sido construida o entrenada")
            
        return self.pipeline.transform(X)
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Realiza predicciones utilizando el modelo entrenado."""
        if self.pipeline is None or self.model is None:
            raise ValueError("La pipeline completa con modelo no está disponible")
            
        return self.pipeline.predict(X)

In [100]:
X = df.drop('salario', axis=1)
y = df['salario']

pipeline = SalaryPredictionPipeline()
transformed_data = pipeline.build_pipeline().fit_transform(X)

In [101]:
y_log = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)


In [102]:
regressor = LinearRegression()

In [103]:
def evaluate_model(model: Any, X_test: pd.DataFrame, y_test: pd.Series) -> dict[str, float]:
    y_pred = model.predict(X_test)
    
    y_test_orig = np.expm1(y_test)
    y_pred_orig = np.expm1(y_pred)
    
    metrics = {
        'r2': r2_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2_original': r2_score(y_test_orig, y_pred_orig),
        'rmse_original': np.sqrt(mean_squared_error(y_test_orig, y_pred_orig)),
        'mae_original': mean_absolute_error(y_test_orig, y_pred_orig)
    }
    
    print("Métricas de evaluación del modelo:")
    print(f"R² (escala log): {metrics['r2']:.4f}")
    print(f"RMSE (escala log): {metrics['rmse']:.4f}")
    print(f"MAE (escala log): {metrics['mae']:.4f}")
    print(f"R² (escala original): {metrics['r2_original']:.4f}")
    print(f"RMSE (escala original): {metrics['rmse_original']:.2f}")
    print(f"MAE (escala original): {metrics['mae_original']:.2f}")
    
    return metrics

In [104]:
regressor = LinearRegression()


def get_metrics_by_model(model: Any) -> None:
    pipeline = SalaryPredictionPipeline(model=model)
    model_pipeline = pipeline.build_pipeline()

    model_pipeline_fitted = model_pipeline.fit(X_train, y_train)

    y_pred = model_pipeline_fitted.predict(X_test)

    y_test_orig = np.expm1(y_test)
    y_pred_orig = np.expm1(y_pred)

    metrics = {
        'r2': r2_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2_original': r2_score(y_test_orig, y_pred_orig),
        'rmse_original': np.sqrt(mean_squared_error(y_test_orig, y_pred_orig)),
        'mae_original': mean_absolute_error(y_test_orig, y_pred_orig)
    }

    print("Métricas de evaluación del modelo:")
    print(f"R² (escala log): {metrics['r2']:.4f}")
    print(f"RMSE (escala log): {metrics['rmse']:.4f}")
    print(f"MAE (escala log): {metrics['mae']:.4f}")
    print(f"R² (escala original): {metrics['r2_original']:.4f}")
    print(f"RMSE (escala original): {metrics['rmse_original']:.2f}")
    print(f"MAE (escala original): {metrics['mae_original']:.2f}")

### Modelos Lineales

In [105]:
get_metrics_by_model(regressor)

Métricas de evaluación del modelo:
R² (escala log): 0.2802
RMSE (escala log): 0.4156
MAE (escala log): 0.3353
R² (escala original): 0.2152
RMSE (escala original): 1101893.49
MAE (escala original): 825785.65




In [106]:
ridge = RidgeCV()
get_metrics_by_model(ridge)

Métricas de evaluación del modelo:
R² (escala log): 0.2810
RMSE (escala log): 0.4153
MAE (escala log): 0.3354
R² (escala original): 0.2154
RMSE (escala original): 1101734.79
MAE (escala original): 825851.49




In [107]:
lasso = LassoCV()
get_metrics_by_model(lasso)



Métricas de evaluación del modelo:
R² (escala log): -0.0009
RMSE (escala log): 0.4900
MAE (escala log): 0.4053
R² (escala original): -0.0425
RMSE (escala original): 1269965.48
MAE (escala original): 977625.16


In [108]:
elastic_net = ElasticNetCV()
get_metrics_by_model(elastic_net)

Métricas de evaluación del modelo:
R² (escala log): -0.0009
RMSE (escala log): 0.4900
MAE (escala log): 0.4053
R² (escala original): -0.0425
RMSE (escala original): 1269965.48
MAE (escala original): 977625.16




In [109]:
lasso_lars = LassoLars()
get_metrics_by_model(lasso_lars)



Métricas de evaluación del modelo:
R² (escala log): -0.0009
RMSE (escala log): 0.4900
MAE (escala log): 0.4053
R² (escala original): -0.0425
RMSE (escala original): 1269978.57
MAE (escala original): 977633.04


In [110]:
lasso_lars = BayesianRidge()
get_metrics_by_model(lasso_lars)

Métricas de evaluación del modelo:
R² (escala log): 0.2809
RMSE (escala log): 0.4154
MAE (escala log): 0.3353
R² (escala original): 0.2154
RMSE (escala original): 1101751.92
MAE (escala original): 825832.67




In [111]:
adr = ARDRegression()
get_metrics_by_model(adr)



Métricas de evaluación del modelo:
R² (escala log): 0.2813
RMSE (escala log): 0.4153
MAE (escala log): 0.3357
R² (escala original): 0.2152
RMSE (escala original): 1101870.24
MAE (escala original): 826552.92


### Modelos no lineales (y pipeline no lineal)

In [112]:
class SalaryPredictionPipeline:
    """Pipeline completa para la predicción de salarios."""
    
    def __init__(self, model: Any | None = None):
        self.model = model
        self.pipeline: Pipeline | None = None
        self.categorical_columns: list[str] = [
            'dedicacion', 'contrato', 'cantidad_de_personas_en_tu_organizacion',
            'modalidad_de_trabajo', 'seniority', 'marvin_rol'
        ]
        self.numerical_columns: list[str] = [
            'anos_de_experiencia', 'antiguedad_en_la_empresa_actual',
            'anos_en_el_puesto_actual', 'cuantas_personas_tenes_a_cargo', 'edad'
        ]
        self.target_column: str = 'salario'
        
    def build_pipeline(self):
        # Transformadores modificados para garantizar consistencia
        numerical_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.numerical_columns)),
            ('scaler', StandardScaler())
        ])
        
        categorical_basic_cols = ['dedicacion', 'contrato', 'modalidad_de_trabajo']
        categorical_basic_pipeline = Pipeline([
            ('selector', DataFrameSelector(categorical_basic_cols)),
            ('encoder', OneHotEncoder(sparse_output=False))
        ])
        
        seniority_pipeline = Pipeline([
            ('selector', DataFrameSelector(['seniority'])),
            ('transformer', SeniorityTransformer())
        ])
        
        org_size_pipeline = Pipeline([
            ('selector', DataFrameSelector(['cantidad_de_personas_en_tu_organizacion'])),
            ('transformer', OrganizationSizeTransformer())
        ])
        
        preprocessor = ColumnTransformer(
            transformers=[
                ('numerical', numerical_pipeline, self.numerical_columns),
                ('categorical_basic', categorical_basic_pipeline, categorical_basic_cols),
                ('seniority', seniority_pipeline, ['seniority']),
                ('org_size', org_size_pipeline, ['cantidad_de_personas_en_tu_organizacion'])
            ],
            remainder='drop'
        )
        
        if self.model is not None:
            self.pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('feature_selection', SelectFromModel(estimator=RandomForestRegressor())),
                ('model', self.model)
            ])
        else:
            self.pipeline = Pipeline([
                ('preprocessor', preprocessor)
            ])
            
        return self.pipeline
    
    def fit(self, X: pd.DataFrame, y: Array) -> Self:
        if self.pipeline is None:
            self.build_pipeline()
        
        if X.isna().any().any():
            print(f"Advertencia: El dataset tiene valores NaN. La pipeline intentará manejarlos.")
            
        self.pipeline.fit(X, y)
        return self
    
    def transform(self, X: pd.DataFrame) -> np.ndarray:
        if self.pipeline is None:
            raise ValueError("La pipeline no ha sido construida o entrenada")
            
        return self.pipeline.transform(X)
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Realiza predicciones utilizando el modelo entrenado."""
        if self.pipeline is None or self.model is None:
            raise ValueError("La pipeline completa con modelo no está disponible")
            
        return self.pipeline.predict(X)

In [113]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [114]:
non_linear_models = {
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=10),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=150, learning_rate=0.1),
    "XGBoost": XGBRegressor(objective='reg:squarederror', n_estimators=150),
    "SVM RBF": SVR(kernel='rbf', C=1.0, epsilon=0.1)
}

In [115]:
def get_metrics_by_model(model: Any, use_original_scale: bool = True) -> None:
    pipeline = SalaryPredictionPipeline(model=model)
    model_pipeline = pipeline.build_pipeline()
    
    # Validación cruzada para modelos no lineales
    cv_scores = cross_val_score(
        model_pipeline,
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error'
    )
    
    model_pipeline_fitted = model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline_fitted.predict(X_test)

    # Cálculo de métricas mejorado
    metrics = {
        'cv_rmse_mean': abs(cv_scores.mean()),
        'cv_rmse_std': cv_scores.std(),
        'r2': model_pipeline_fitted.score(X_test, y_test),
        'rmse': np.sqrt(mean_squared_error(y_test, y_pred))
    }
    
    if use_original_scale:
        y_test_orig = np.expm1(y_test)
        y_pred_orig = np.expm1(y_pred)
        metrics.update({
            'r2_original': r2_score(y_test_orig, y_pred_orig),
            'rmse_original': np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
        })
    
    print(f"\nMétricas para {model.__class__.__name__}:")
    print(f"CV RMSE: {metrics['cv_rmse_mean']:.4f} (±{metrics['cv_rmse_std']:.4f})")
    print(f"Test R²: {metrics['r2']:.4f}")
    print(f"Test RMSE: {metrics['rmse']:.4f}")
    if use_original_scale:
        print(f"Test R² (original): {metrics['r2_original']:.4f}")
        print(f"Test RMSE (original): {metrics['rmse_original']:.2f}")

In [116]:
for model_name, model in non_linear_models.items():
    print(f"\n{'='*40}")
    print(f"Evaluando {model_name}")
    get_metrics_by_model(model)
    print(f"{'='*40}")


Evaluando Random Forest





Métricas para RandomForestRegressor:
CV RMSE: 0.4186 (±0.0132)
Test R²: 0.2768
Test RMSE: 0.4165
Test R² (original): 0.2303
Test RMSE (original): 1091220.10

Evaluando Gradient Boosting





Métricas para GradientBoostingRegressor:
CV RMSE: 0.4104 (±0.0114)
Test R²: 0.2668
Test RMSE: 0.4194
Test R² (original): 0.2073
Test RMSE (original): 1107395.89

Evaluando XGBoost





Métricas para XGBRegressor:
CV RMSE: 0.4664 (±0.0196)
Test R²: 0.0410
Test RMSE: 0.4797
Test R² (original): -0.0259
Test RMSE (original): 1259801.75

Evaluando SVM RBF





Métricas para SVR:
CV RMSE: 0.4895 (±0.0081)
Test R²: -0.0062
Test RMSE: 0.4913
Test R² (original): -0.0480
Test RMSE (original): 1273319.18


In [119]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

# 1. Definir los espacios de búsqueda para cada modelo
param_distributions = {
    "Random Forest": {
        'model__n_estimators': randint(100, 500),
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': randint(2, 20),
        'model__max_features': ['sqrt', 'log2', None]
    },
    "Gradient Boosting": {
        'model__n_estimators': randint(100, 500),
        'model__learning_rate': loguniform(1e-3, 1e0),
        'model__max_depth': randint(3, 10),
        'model__subsample': uniform(0.5, 0.5)
    },
    "XGBoost": {
        'model__n_estimators': randint(100, 500),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__max_depth': randint(3, 10),
        'model__colsample_bytree': uniform(0.5, 0.5),
        'model__gamma': uniform(0, 0.5)
    },
    "SVM RBF": {
        'model__C': loguniform(1e0, 1e3),
        'model__gamma': loguniform(1e-4, 1e-1),
        'model__epsilon': uniform(0.01, 0.5)
    }
}

# 2. Función para realizar la búsqueda
def tune_hyperparameters(model_name, base_model, X, y):
    # Crear pipeline
    pipeline = SalaryPredictionPipeline(model=base_model).build_pipeline()
    
    # Configurar búsqueda
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions[model_name],
        n_iter=50,
        cv=5,
        scoring='neg_root_mean_squared_error',
        random_state=42,
        n_jobs=-1
    )
    
    # Ejecutar búsqueda
    search.fit(X, y)
    
    return search

# 3. Ejecutar para cada modelo
best_models = {}
for model_name, model in non_linear_models.items():
    print(f"\n=== Optimizando {model_name} ===")
    search_result = tune_hyperparameters(model_name, model, X_train, y_train)
    
    # Guardar mejor modelo
    best_models[model_name] = {
        'model': search_result.best_estimator_,
        'params': search_result.best_params_,
        'score': search_result.best_score_
    }
    
    print(f"Mejores parámetros ({model_name}):")
    print(search_result.best_params_)
    print(f"RMSE promedio (CV): {-search_result.best_score_:.4f}")

# 4. Función para evaluar los mejores modelos
def evaluate_best_models(best_models_dict, X_test, y_test):
    results = {}
    for name, model_info in best_models_dict.items():
        model = model_info['model']
        y_pred = model.predict(X_test)
        
        results[name] = {
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            'R2': r2_score(y_test, y_pred),
            'Params': model_info['params']
        }
    return pd.DataFrame(results).T

# 5. Evaluación final
final_results = evaluate_best_models(best_models, X_test, y_test)
print("\nResultados Finales:")
print(final_results.sort_values('RMSE'))


=== Optimizando Random Forest ===
Mejores parámetros (Random Forest):
{'model__max_depth': 10, 'model__max_features': 'log2', 'model__min_samples_split': 19, 'model__n_estimators': 317}
RMSE promedio (CV): 0.4089

=== Optimizando Gradient Boosting ===
Mejores parámetros (Gradient Boosting):
{'model__learning_rate': np.float64(0.03102740950912839), 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': np.float64(0.6424202471887338)}
RMSE promedio (CV): 0.4079

=== Optimizando XGBoost ===
Mejores parámetros (XGBoost):
{'model__colsample_bytree': np.float64(0.918855052953664), 'model__gamma': np.float64(0.33784505851964036), 'model__learning_rate': np.float64(0.23056483577223164), 'model__max_depth': 4, 'model__n_estimators': 250}
RMSE promedio (CV): 0.4080

=== Optimizando SVM RBF ===
Mejores parámetros (SVM RBF):
{'model__C': np.float64(2.2844556850020523), 'model__epsilon': np.float64(0.3666223936114975), 'model__gamma': np.float64(0.019158219548093176)}
RMSE promedio




Resultados Finales:
                       RMSE        R2  \
Gradient Boosting  0.415292  0.281148   
Random Forest      0.415508  0.280397   
XGBoost            0.417443  0.273679   
SVM RBF            0.419149  0.267733   

                                                              Params  
Gradient Boosting  {'model__learning_rate': 0.03102740950912839, ...  
Random Forest      {'model__max_depth': 10, 'model__max_features'...  
XGBoost            {'model__colsample_bytree': 0.918855052953664,...  
SVM RBF            {'model__C': 2.2844556850020523, 'model__epsil...  


In [123]:
gbr = GradientBoostingRegressor(n_estimators=200, max_depth=3, subsample=0.6424202471887338, learning_rate=0.03102740950912839)
get_metrics_by_model(gbr)




Métricas para GradientBoostingRegressor:
CV RMSE: 0.4079 (±0.0090)
Test R²: 0.2822
Test RMSE: 0.4150
Test R² (original): 0.2245
Test RMSE (original): 1095340.41


