In [1]:
import os
os.chdir("..")

In [2]:
from app.pipelines import SalaryPredictionPipeline
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from app.salary_predictor import SalaryPredictor

In [4]:
df = pd.read_csv(r"/workspace/data/tech_salaries_filtered_no_others.csv")
df['contrato'] = df['contrato'].replace({
    'Tercerizado (trabajo a través de consultora o agencia)': 'Contractor',
    'Freelance': 'Contractor',
    "Participación societaria en una cooperativa": 'Contractor',
})
X = df.drop('salario', axis=1)
y = df['salario']
y_log = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.15, random_state=42
)

In [11]:
df.dedicacion.unique()

array(['Full-Time', 'Part-Time'], dtype=object)

In [None]:
gbr = GradientBoostingRegressor(n_estimators=200, max_depth=3, subsample=0.6424202471887338, learning_rate=0.03102740950912839)

In [None]:
gbr = GradientBoostingRegressor(n_estimators=200, max_depth=3, subsample=0.6424202471887338, learning_rate=0.03102740950912839)
pipeline = SalaryPredictionPipeline(gbr)
pipeline.build_pipeline()
pipeline.fit(X_train, y_train)

<app.pipelines.salary.SalaryPredictionPipeline at 0x7f649d0e0920>

In [None]:
import joblib
from pathlib import Path

# Guardar todo el objeto pipeline
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(exist_ok=True)

joblib.dump(pipeline, MODEL_PATH / "salary_pipeline_v1.pkl")

# Opcional: Guardar metadata adicional
metadata = {
    "model_version": "1.0",
    "training_date": "2024-02-20",
    "features": pipeline.numerical_columns + pipeline.categorical_columns
}

joblib.dump(metadata, MODEL_PATH / "metadata.pkl")

['models/metadata.pkl']

In [None]:
# predictor.py
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

class SalaryPredictor:
    def __init__(self):
        self.model = None
        self.metadata = None
        self.load_model()
    
    def load_model(self, model_path="models/salary_pipeline_v1.pkl"):
        self.model = joblib.load(model_path)
        self.metadata = joblib.load(Path(model_path).parent / "metadata.pkl")
    
    def predict(self, input_data: dict) -> dict:
        df = pd.DataFrame([input_data])
        log_pred = self.model.predict(df)
        return {
            "prediccion_log": float(log_pred[0]),
            "salario_estimado": float(np.expm1(log_pred[0])),
            "moneda": "MXN",
            "version_modelo": self.metadata["model_version"]
        }

In [None]:
df.modalidad_de_trabajo.unique

<bound method Series.unique of 0                         100% remoto
1                         100% remoto
2                         100% remoto
3                         100% remoto
4                         100% remoto
                    ...              
3330    Híbrido (presencial y remoto)
3331    Híbrido (presencial y remoto)
3332                      100% remoto
3333    Híbrido (presencial y remoto)
3334    Híbrido (presencial y remoto)
Name: modalidad_de_trabajo, Length: 3335, dtype: object>

In [3]:
from app.data_models import SalaryRequest

In [4]:
# Input del usuario (mismo formato que espera la pipeline)
user_data = {
    'dedicacion': 'Full-Time',
    'contrato': 'Contractor',
    'cantidad_de_personas_en_tu_organizacion': 'De 201 a 500 personas',
    'modalidad_de_trabajo': '100% remoto',
    'seniority': 'Junior',
    'marvin_rol': 2,
    'anos_de_experiencia': 0,
    'antiguedad_en_la_empresa_actual': 0,
    'anos_en_el_puesto_actual': 1,
    'cuantas_personas_tenes_a_cargo': 0,
    'edad': 20
}

## Predicción
#salario_predicho = predict_salary(user_data)
#print(f"Salario estimado: ${salario_predicho:,.2f} pesos")

In [5]:
sr = SalaryRequest.model_validate(user_data)

In [6]:
SalaryPredictor().predict(sr.model_dump())



PredictionResponse(prediccion_log=13.956762528289618, salario_estimado=1151713.8093824012, moneda='ARS', version_modelo='1.0')