In [1]:
import os

# Move to the project root so that `app` is importable and relative paths resolve.
# The guard makes the cell idempotent: an unconditional chdir("..") climbs one
# more directory every time the cell is re-run.
# NOTE(review): assumes the project root is the directory that contains the
# `app` package imported by this notebook — confirm.
if not os.path.isdir("app"):
    os.chdir("..")

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Any
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, MultiTaskElasticNetCV, LassoLars, BayesianRidge, ARDRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score  
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import random
from collections import Counter
from app.pipelines import SalaryPredictionPipeline

In [3]:
# Load the survey data through a project-relative path instead of an absolute,
# machine-specific one, so the notebook runs on any checkout.
# NOTE(review): assumes the working directory is the project root (set by the
# first cell) and that it contains data/tech_salaries_filtered_no_others.csv — confirm.
DATA_PATH = os.path.join("data", "tech_salaries_filtered_no_others.csv")
df = pd.read_csv(DATA_PATH)

# Collapse outsourced, freelance and cooperative-partner contracts into a
# single 'Contractor' category; other contract values are left untouched.
df['contrato'] = df['contrato'].replace({
    'Tercerizado (trabajo a través de consultora o agencia)': 'Contractor',
    'Freelance': 'Contractor',
    "Participación societaria en una cooperativa": 'Contractor',
})
# Disabled experiment: exclude part-time respondents.
#df = df[df["dedicacion"] != "Part-Time"]
df["marvin_rol"] = df["marvin_rol"].astype("category")


In [4]:
#df.drop(columns=["dedicacion"], inplace=True)

In [5]:
def remove_outliers_iqr(df, num_columns, threshold=1.5):
    """Return a copy of ``df`` without rows that are IQR outliers in any given column.

    For every column in ``num_columns`` the fences
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` are computed on the full
    input frame; a row is kept only if it lies inside the (inclusive) fences
    of *every* listed column. A per-column report and a final summary are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    num_columns : iterable of str
        Names of the numeric columns to check.
    threshold : float, default 1.5
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` that keeps the original index labels.

    Notes
    -----
    Comparisons against NaN are False, so rows with a missing value in any
    checked column are dropped and counted as outliers.
    """
    rows_original = len(df)

    # Explicit bool dtype keeps the mask usable even when the frame is empty.
    valid_rows = pd.Series(True, index=df.index, dtype=bool)

    for col in num_columns:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Inclusive on both ends, same as `>= lower & <= upper`.
        col_mask = values.between(lower_bound, upper_bound)

        outliers_count = (~col_mask).sum()
        print(f"Columna '{col}': {outliers_count} outliers detectados")
        print(f"  - Límite inferior: {lower_bound:.2f}")
        print(f"  - Límite superior: {upper_bound:.2f}")

        valid_rows = valid_rows & col_mask

    df_clean = df.loc[valid_rows].copy()

    rows_removed = rows_original - len(df_clean)
    # Guard the percentage: an empty input used to raise ZeroDivisionError here.
    removed_share = rows_removed / rows_original if rows_original else 0.0
    print(f"\nSe eliminaron {rows_removed} filas con outliers ({removed_share:.2%} del dataset)")
    print(f"Dataset original: {rows_original} filas")
    print(f"Dataset limpio: {len(df_clean)} filas")

    return df_clean

In [6]:
# Drop every row that is an IQR outlier in any of these four numeric columns.
# NOTE(review): `df` is overwritten with its own filtered version, so re-running
# this cell on its own filters again with quartiles recomputed on the reduced data.
df = remove_outliers_iqr(df,['salario', 'anos_de_experiencia', 'antiguedad_en_la_empresa_actual','edad'])

Columna 'salario': 60 outliers detectados
  - Límite inferior: -1018862.80
  - Límite superior: 5964771.33
Columna 'anos_de_experiencia': 90 outliers detectados
  - Límite inferior: -10.50
  - Límite superior: 25.50
Columna 'antiguedad_en_la_empresa_actual': 357 outliers detectados
  - Límite inferior: -3.50
  - Límite superior: 8.50
Columna 'edad': 45 outliers detectados
  - Límite inferior: 12.50
  - Límite superior: 56.50

Se eliminaron 469 filas con outliers (14.06% del dataset)
Dataset original: 3335 filas
Dataset limpio: 2866 filas


In [7]:
# Features are every column except the salary; the target is modelled as log(1 + salary).
y = df['salario']
X = df.drop(columns='salario')
y_log = np.log1p(y)

# Hold out 15% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.15,
    random_state=42,
)

In [8]:
# Training features and the (log-scale) target side by side in one frame.
train_set = pd.concat(objs=[X_train, y_train], axis="columns")

In [9]:
# Show how unbalanced the seniority classes are before oversampling.
print("Distribución original de seniority:")
print(train_set['seniority'].value_counts())
print("\n")

cat_columns = [
    'dedicacion',
    'contrato',
    'cantidad_de_personas_en_tu_organizacion',
    'modalidad_de_trabajo',
    'seniority',
    'marvin_rol',
]
# NOTE(review): 'salario' (the log-scale target at this point) is part of the
# numeric block, so SMOTE also interpolates target values for synthetic rows.
num_columns = [
    'salario',
    'anos_de_experiencia',
    'antiguedad_en_la_empresa_actual',
    'anos_en_el_puesto_actual',
    'cuantas_personas_tenes_a_cargo',
    'edad',
]

# Relative frequency of each category per categorical column (seniority excluded:
# it is the class label being balanced).
cat_value_freqs = {
    col: train_set[col].value_counts(normalize=True).to_dict()
    for col in cat_columns
    if col != 'seniority'
}

# One-hot encode the categoricals so SMOTE receives a purely numeric matrix.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(train_set[cat_columns])
cat_encoded_train_set = pd.DataFrame(
    cat_encoded,
    columns=encoder.get_feature_names_out(cat_columns),
)

# Indexes are reset on both parts so the column-wise concat lines up by position.
X = pd.concat(
    [
        train_set[num_columns].reset_index(drop=True),
        cat_encoded_train_set.reset_index(drop=True),
    ],
    axis=1,
)
y = train_set['seniority']

# Oversample the minority seniority classes.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


# Dedicated, seeded generator: the global `random` module is never seeded in this
# notebook, which made the balanced training set differ on every run.
_SYNTHETIC_RNG = random.Random(42)


def generate_synthetic_cat_value(column_name, freqs=None, rng=None):
    """Draw one category for ``column_name`` according to its observed frequencies.

    Parameters
    ----------
    column_name : str
        Column whose category distribution is sampled.
    freqs : dict, optional
        Mapping ``column -> {category: weight}``. Defaults to the module-level
        ``cat_value_freqs``.
    rng : random.Random, optional
        Random generator to draw from. Defaults to a module-level generator
        seeded with 42, so a full re-run yields the same draws.

    Returns
    -------
    One of the categories registered for ``column_name``.

    Raises
    ------
    KeyError
        If ``column_name`` has no entry in ``freqs``.

    Notes
    -----
    The draw depends only on the column's marginal distribution, not on any
    other value of the row it will be placed in.
    """
    if freqs is None:
        freqs = cat_value_freqs
    if rng is None:
        rng = _SYNTHETIC_RNG

    column_freqs = freqs[column_name]
    options = list(column_freqs.keys())
    weights = list(column_freqs.values())
    return rng.choices(options, weights=weights)[0]


# Numeric columns are taken straight from the SMOTE output.
num_resampled = X_resampled[num_columns].copy()
cat_columns_encoded = [col for col in X_resampled.columns if col not in num_columns]

train_set_balanced = pd.DataFrame(
    {col: num_resampled[col].values for col in num_columns}
)
train_set_balanced['seniority'] = y_resampled

# The first len(train_set) rows keep their real categorical values, taken by
# position from train_set; every row after that gets a category sampled from
# the column's observed frequencies.
# NOTE(review): the one-hot values SMOTE produced for these rows are not used,
# so the categoricals of synthetic rows are unrelated to their numeric values
# and target — confirm this is intended.
n_balanced = len(train_set_balanced)
n_real = min(len(train_set), n_balanced)
for col in cat_columns:
    if col == 'seniority':
        continue
    real_values = [train_set[col].iloc[pos] for pos in range(n_real)]
    sampled_values = [
        generate_synthetic_cat_value(col) for _ in range(n_balanced - n_real)
    ]
    train_set_balanced[col] = real_values + sampled_values



Distribución original de seniority:
seniority
Semi-Senior         890
Senior              851
Junior              420
Manager or Above    275
Name: count, dtype: int64




In [10]:
# From here on the training data is the balanced set; this replaces the
# X_train / y_train produced by the original train/test split.
y_train = train_set_balanced['salario']
X_train = train_set_balanced.drop(columns='salario')

In [11]:
from typing import Any

def get_metrics_by_model(model: Any, use_original_scale: bool = True) -> None:
    """Cross-validate, fit and evaluate ``model`` inside a SalaryPredictionPipeline.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Steps, in order: 5-fold cross-validation scored with negative RMSE, a fit
    on the full training data, then prediction and scoring on the test split.
    The metrics are printed; nothing is returned.

    Parameters
    ----------
    model : Any
        Estimator handed to ``SalaryPredictionPipeline(model=...)``.
    use_original_scale : bool, default True
        When True, R² and RMSE are additionally reported after applying
        ``np.expm1`` to the test targets and the predictions.

    Notes
    -----
    NOTE(review): if ``X_train`` is the SMOTE-balanced set, folds can contain
    synthetic rows derived from rows in other folds, which would make the CV
    RMSE optimistic — confirm.
    """
    # build_pipeline() is presumably returning an sklearn-compatible estimator;
    # it is used with cross_val_score / fit / predict / score below.
    estimator = SalaryPredictionPipeline(model=model).build_pipeline()

    fold_scores = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )

    fitted = estimator.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    # Scores are negative RMSE values, hence the abs() on the mean.
    cv_rmse_mean = abs(fold_scores.mean())
    cv_rmse_std = fold_scores.std()
    test_r2 = fitted.score(X_test, y_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    report = [
        f"\nMétricas para {model.__class__.__name__}:",
        f"CV RMSE: {cv_rmse_mean:.4f} (±{cv_rmse_std:.4f})",
        f"Test R²: {test_r2:.4f}",
        f"Test RMSE: {test_rmse:.4f}",
    ]

    if use_original_scale:
        # Undo the log1p transform before measuring the error.
        actual = np.expm1(y_test)
        predicted = np.expm1(predictions)
        r2_original = r2_score(actual, predicted)
        rmse_original = np.sqrt(mean_squared_error(actual, predicted))
        report.append(f"Test R² (original): {r2_original:.4f}")
        report.append(f"Test RMSE (original): {rmse_original:.2f}")

    for line in report:
        print(line)

In [12]:
# subsample < 1 makes gradient boosting stochastic; a fixed random_state keeps
# the reported metrics reproducible across runs.
gbr = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=3,
    subsample=0.6424202471887338,
    learning_rate=0.03102740950912839,
    random_state=42,
)
get_metrics_by_model(gbr)




Métricas para GradientBoostingRegressor:
CV RMSE: 0.3715 (±0.0313)
Test R²: 0.3589
Test RMSE: 0.3928
Test R² (original): 0.2997
Test RMSE (original): 999913.59




In [13]:
# Random forest baseline with default hyperparameters. The variable keeps its
# historical name `lr` even though it holds a random forest.
# random_state is fixed so the reported metrics are reproducible.
lr = RandomForestRegressor(random_state=42)
get_metrics_by_model(lr)




Métricas para RandomForestRegressor:
CV RMSE: 0.3934 (±0.0527)
Test R²: 0.2559
Test RMSE: 0.4232
Test R² (original): 0.2231
Test RMSE (original): 1053218.73




In [41]:
# Search space for the GradientBoostingRegressor grid search
# (3 * 2 * 2 * 3 * 2 * 2 * 1 * 2 * 2 = 576 combinations).
param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[4, 5],
    subsample=[0.7, 0.8],
    learning_rate=[0.001, 0.01, 0.03],      # more options
    min_samples_split=[5, 10],              # extended range
    loss=['absolute_error', 'squared_error'],
    criterion=['squared_error'],
    n_iter_no_change=[5, 10],               # early stopping
    validation_fraction=[0.1, 0.2],
)

In [47]:
import joblib
from datetime import date
from pathlib import Path

# Final model. NOTE(review): the hyperparameters are hard-coded and appear to
# mirror a grid-search result recorded in this notebook — verify they are current.
# random_state is fixed because subsample < 1 and the early-stopping validation
# split are both random; without it the persisted artifact is not reproducible.
gbr = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=4,
    subsample=0.7,
    learning_rate=0.03,
    loss='squared_error',
    criterion='squared_error',
    min_samples_split=5,
    n_iter_no_change=10,
    validation_fraction=0.2,
    random_state=42,
)
pipeline = SalaryPredictionPipeline(gbr)
pipeline.build_pipeline()
pipeline.fit(X_train, y_train)

# Artifacts are written under ./models, relative to the current working directory.
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(exist_ok=True)

joblib.dump(pipeline, MODEL_PATH / "salary_pipeline_v3.pkl")

metadata = {
    "model_version": "3.0",
    # Stamp the actual day of training (YYYY-MM-DD) instead of a literal that
    # goes stale whenever the cell is re-run.
    "training_date": date.today().isoformat(),
    "features": pipeline.numerical_columns + pipeline.categorical_columns
}

joblib.dump(metadata, MODEL_PATH / "metadata_v3.pkl")

['models\\metadata_v3.pkl']

In [42]:
# Fit a pipeline built with SalaryPredictionPipeline's default arguments and use
# it to transform both splits, so the grid search can run on the raw estimator.
pipeline = SalaryPredictionPipeline()
pipeline.build_pipeline()
pipeline.fit(X_train, y_train)

# Train is transformed first, then test, same as two separate statements.
X_train_transformed, X_test_transformed = (
    pipeline.transform(split) for split in (X_train, X_test)
)



In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Seed the estimator: the grid includes subsample < 1 and early stopping, both
# random, so an unseeded search can select different "best" parameters per run.
gbr = GradientBoostingRegressor(random_state=42)

# NOTE(review): X_train_transformed comes from a pipeline fitted on the whole
# training set, so each CV fold sees preprocessing fitted on its validation
# rows — confirm this is acceptable.
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # can be switched to 'r2' or another metric
    cv=5,
    n_jobs=-1,  # parallelise across all available cores
    verbose=2
)

grid_search.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


In [44]:
# Report the winning configuration and its mean CV score (negative MSE), then
# keep the refitted best estimator for the test-set evaluation.
for label, value in (
    ("Mejores hiperparámetros:", grid_search.best_params_),
    ("Mejor score (MSE negativo):", grid_search.best_score_),
):
    print(label, value)

best_model = grid_search.best_estimator_

Mejores hiperparámetros: {'criterion': 'squared_error', 'learning_rate': 0.03, 'loss': 'squared_error', 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 200, 'n_iter_no_change': 5, 'subsample': 0.7, 'validation_fraction': 0.2}
Mejor score (MSE negativo): -0.12932888752367788


In [45]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the tuned model on the held-out split; targets are still on the log1p scale.
y_pred = best_model.predict(X_test_transformed)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("MSE en test:", test_mse)
print("R² en test:", test_r2)

MSE en test: 0.14144561485015936
R² en test: 0.4123910666401448


In [46]:
# Undo the log1p transform and report the test RMSE in salary units.
y_test_orig, y_pred_orig = np.expm1(y_test), np.expm1(y_pred)
mse_original_scale = mean_squared_error(y_test_orig, y_pred_orig)
np.sqrt(mse_original_scale)


np.float64(958394.1937960862)

Mejores hiperparámetros: {'learning_rate': 0.03, 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300, 'subsample': 0.7}

np.float64(961021.059179427)

MSE en test: 0.14239679074316042
R² en test: 0.4084395871084813

Mejores hiperparámetros: {'criterion': 'squared_error', 'learning_rate': 0.03, 'loss': 'squared_error', 'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 300, 'n_iter_no_change': 10, 'subsample': 0.7, 'validation_fraction': 0.2