In [None]:
import pandas as pd 
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
import numpy as np
from typing_extensions import Self
from typing import Any
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, MultiTaskElasticNetCV, LassoLars, BayesianRidge, ARDRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score  #
# Move the working directory one level up; presumably so the relative "data" paths
# used by the loading cells resolve from the project root (confirm the layout).
# NOTE: the target is relative to the current directory, so every re-run of this
# cell climbs one more level. Restart the kernel before running it again.
os.chdir("../")

In [None]:
# Load the filtered salary survey. os.path.join builds the relative path with the
# platform's separator; a hard-coded backslash only resolves on Windows.
# The following cell reads the same file again before preprocessing it.
df = pd.read_csv(os.path.join("data", "tech_salaries_filtered_no_others.csv"))

In [None]:
# Load the filtered salary survey. os.path.join builds the relative path with the
# platform's separator; a hard-coded backslash only resolves on Windows.
df = pd.read_csv(os.path.join("data", "tech_salaries_filtered_no_others.csv"))

# Fold the outsourced, freelance and cooperative contract types into a single
# 'Contractor' category.
df['contrato'] = df['contrato'].replace({
    'Tercerizado (trabajo a través de consultora o agencia)': 'Contractor',
    'Freelance': 'Contractor',
    "Participación societaria en una cooperativa": 'Contractor',
})

# Features are every column except the target 'salario'.
X = df.drop('salario', axis=1)
y = df['salario']
# The models are trained on log1p(salary); later cells map predictions back
# with np.expm1.
y_log = np.log1p(y)

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

### Modelos Lineales

In [None]:
# Baseline linear model. `regressor` is not created by any earlier cell, so it is
# instantiated here to keep the cell runnable on a fresh kernel.
# NOTE: get_metrics_by_model is defined in a later cell of this notebook; that
# cell has to be executed before this one.
regressor = LinearRegression()
get_metrics_by_model(regressor)

In [None]:
# Evaluate a RidgeCV estimator with its default settings.
ridge = RidgeCV()
get_metrics_by_model(model=ridge)

In [None]:
# Evaluate a LassoCV estimator with its default settings.
lasso = LassoCV()
get_metrics_by_model(model=lasso)

In [None]:
# Evaluate an ElasticNetCV estimator with its default settings.
elastic_net = ElasticNetCV()
get_metrics_by_model(model=elastic_net)

In [None]:
# Evaluate a LassoLars estimator with its default settings.
lasso_lars = LassoLars()
get_metrics_by_model(model=lasso_lars)

In [None]:
# Evaluate a BayesianRidge estimator with its default settings. It gets its own
# name so the LassoLars estimator stored in `lasso_lars` is not overwritten.
bayesian_ridge = BayesianRidge()
get_metrics_by_model(bayesian_ridge)

In [None]:
# Evaluate an ARDRegression estimator with its default settings.
adr = ARDRegression()
get_metrics_by_model(model=adr)

### Modelos no lineales (y pipeline no lineal)

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
# Candidate non-linear regressors, keyed by the label printed during evaluation.
# The ensemble models involve randomness, so they are seeded (same seed as the
# train/test split) to make the reported metrics repeatable across runs.
non_linear_models = {
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, random_state=42),
    "XGBoost": XGBRegressor(objective='reg:squarederror', n_estimators=150, random_state=42),
    "SVM RBF": SVR(kernel='rbf', C=1.0, epsilon=0.1)
}

In [None]:
def get_metrics_by_model(model: Any, use_original_scale: bool = True) -> None:
    """Cross-validate, fit and score ``model``, then print the resulting metrics.

    The estimator is wrapped via ``SalaryPredictionPipeline(model=model).build_pipeline()``
    and evaluated against the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` splits, whose target is on the log1p scale.

    Args:
        model: Estimator handed to ``SalaryPredictionPipeline``.
        use_original_scale: When True, R² and RMSE are additionally reported
            after mapping targets and predictions back with ``np.expm1``.

    Returns:
        None. All results are written to stdout.
    """
    estimator_pipeline = SalaryPredictionPipeline(model=model).build_pipeline()

    # 5-fold cross-validation on the training split. The scorer yields negated
    # RMSE values, hence the abs() applied to their mean further down.
    fold_scores = cross_val_score(
        estimator_pipeline, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error',
    )

    fitted = estimator_pipeline.fit(X_train, y_train)
    test_predictions = fitted.predict(X_test)

    # Metrics on the log1p scale the model was trained on.
    cv_rmse_mean = abs(fold_scores.mean())
    cv_rmse_std = fold_scores.std()
    test_r2 = fitted.score(X_test, y_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

    report_lines = [
        f"\nMétricas para {model.__class__.__name__}:",
        f"CV RMSE: {cv_rmse_mean:.4f} (±{cv_rmse_std:.4f})",
        f"Test R²: {test_r2:.4f}",
        f"Test RMSE: {test_rmse:.4f}",
    ]

    if use_original_scale:
        # Undo the log1p transform so the errors are expressed in salary units.
        actual_salary = np.expm1(y_test)
        predicted_salary_values = np.expm1(test_predictions)
        r2_original = r2_score(actual_salary, predicted_salary_values)
        rmse_original = np.sqrt(mean_squared_error(actual_salary, predicted_salary_values))
        report_lines.append(f"Test R² (original): {r2_original:.4f}")
        report_lines.append(f"Test RMSE (original): {rmse_original:.2f}")

    # Nothing is printed until every metric has been computed successfully.
    for line in report_lines:
        print(line)

In [None]:
# Run the shared evaluation routine on every candidate, framed by separator lines.
for model_name in non_linear_models:
    model = non_linear_models[model_name]
    print(f"\n{'='*40}")
    print(f"Evaluando {model_name}")
    get_metrics_by_model(model)
    print(f"{'='*40}")

### Best Model

In [None]:
# Gradient boosting configuration evaluated as the best model. The non-round
# hyperparameters presumably come from an earlier search (not shown here).
# subsample < 1 draws a random row subset for each tree, so the estimator is
# seeded to keep the reported metrics and the final prediction repeatable.
gbr = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=3,
    subsample=0.6424202471887338,
    learning_rate=0.03102740950912839,
    random_state=42,
)
get_metrics_by_model(gbr)

### Nuevo enfoque

In [None]:

# Sample respondent profile used to request a single salary prediction.
user_input = dict(
    # Categorical answers
    dedicacion='Full-Time',
    contrato='Staff (planta permanente)',
    cantidad_de_personas_en_tu_organizacion='Más de 10000 personas',
    modalidad_de_trabajo='100% remoto',
    seniority='Semi-Senior',
    # Numeric answers
    marvin_rol=4,
    anos_de_experiencia=2,
    antiguedad_en_la_empresa_actual=2,
    anos_en_el_puesto_actual=2,
    cuantas_personas_tenes_a_cargo=0,
    edad=24,
)

In [None]:
def predict_salary(user_data: dict, pipeline: SalaryPredictionPipeline) -> float:
    """Predict the salary for one respondent profile.

    Args:
        user_data: Mapping of feature name to value for a single person; it is
            turned into a one-row DataFrame.
        pipeline: Object whose ``predict`` accepts that DataFrame and returns
            an indexable sequence of predictions on the log1p scale.

    Returns:
        The first prediction mapped back to the salary scale with ``np.expm1``.
    """
    single_row = pd.DataFrame([user_data])
    log_scale_predictions = pipeline.predict(single_row)
    return np.expm1(log_scale_predictions[0])

In [None]:
# Wrap the `gbr` estimator from the "Best Model" cell in the project pipeline and
# fit it on the training split (log1p-scale target).
# NOTE(review): SalaryPredictionPipeline is not defined in the visible cells. This
# assumes it exposes fit()/predict() directly, in addition to the build_pipeline()
# used by get_metrics_by_model. Confirm against the class definition.
pipeline = SalaryPredictionPipeline(model=gbr)
pipeline.fit(X_train, y_train)

In [None]:
# Estimate the salary for the sample profile and print it with thousands
# separators and two decimals.
predicted_salary = predict_salary(user_data=user_input, pipeline=pipeline)
print(f"Salario estimado: ${predicted_salary:,.2f} ARS")