In [1]:
import pandas as pd 
import os
# Switch the working directory to the parent folder; presumably this is the project
# root that holds the "data" directory read by the next cell — TODO confirm.
# NOTE(review): a relative chdir is not idempotent. Re-running this cell without
# restarting the kernel moves up one more directory each time.
os.chdir("../")

In [2]:
# Load the salary survey extract, relative to the working directory set in the first cell.
# The path is assembled with os.path.join so the separator is correct on every OS;
# a hard-coded backslash only resolves on Windows.
df = pd.read_csv(os.path.join("data", "tech_salaries_filtered_no_others.csv"))

In [8]:
# Show the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,dedicacion,contrato,salario,anos_de_experiencia,antiguedad_en_la_empresa_actual,anos_en_el_puesto_actual,cuantas_personas_tenes_a_cargo,cantidad_de_personas_en_tu_organizacion,modalidad_de_trabajo,edad,seniority,marvin_rol
0,Full-Time,Staff (planta permanente),3952805.00,3,3,1,2,De 201 a 500 personas,100% remoto,29,Semi-Senior,5
1,Part-Time,Staff (planta permanente),1606000.00,5,2,2,0,De 1001 a 2000 personas,100% remoto,25,Semi-Senior,4
2,Full-Time,Staff (planta permanente),4000000.00,25,3,3,5,De 2 a 10 personas,100% remoto,50,Senior,3
3,Full-Time,Contractor,3000000.00,18,8,6,0,De 2 a 10 personas,100% remoto,41,Senior,3
4,Full-Time,Staff (planta permanente),3953142.06,15,5,4,3,De 201 a 500 personas,100% remoto,47,Senior,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3330,Full-Time,Staff (planta permanente),2100000.00,4,2,2,0,De 2001a 5000 personas,Híbrido (presencial y remoto),36,Semi-Senior,2
3331,Full-Time,Contractor,1600000.00,1,1,1,0,De 1001 a 2000 personas,Híbrido (presencial y remoto),23,Junior,4
3332,Full-Time,Contractor,3000000.00,3,3,3,3,De 101 a 200 personas,100% remoto,28,Semi-Senior,3
3333,Full-Time,Staff (planta permanente),5000000.00,20,15,4,1,De 201 a 500 personas,Híbrido (presencial y remoto),58,Senior,3


In [6]:
# Fold the outsourced, freelance and cooperative contract types into the single
# 'Contractor' label; every other value in the column is left as it is.
df['contrato'] = df['contrato'].replace(
    dict.fromkeys(
        [
            'Tercerizado (trabajo a través de consultora o agencia)',
            'Freelance',
            "Participación societaria en una cooperativa",
        ],
        'Contractor',
    )
)

In [11]:
# List the distinct seniority labels present in the data.
df["seniority"].unique()

array(['Semi-Senior', 'Senior', 'Junior', 'Manager or Above'],
      dtype=object)

In [4]:
# Dump the first five rows as a {column: {row_index: value}} dictionary.
df.head().to_dict()

{'dedicacion': {0: 'Full-Time',
  1: 'Part-Time',
  2: 'Full-Time',
  3: 'Full-Time',
  4: 'Full-Time'},
 'contrato': {0: 'Staff (planta permanente)',
  1: 'Staff (planta permanente)',
  2: 'Staff (planta permanente)',
  3: 'Freelance',
  4: 'Staff (planta permanente)'},
 'salario': {0: 3952805.0,
  1: 1606000.0,
  2: 4000000.0,
  3: 3000000.0,
  4: 3953142.06},
 'anos_de_experiencia': {0: 3, 1: 5, 2: 25, 3: 18, 4: 15},
 'antiguedad_en_la_empresa_actual': {0: 3, 1: 2, 2: 3, 3: 8, 4: 5},
 'anos_en_el_puesto_actual': {0: 1, 1: 2, 2: 3, 3: 6, 4: 4},
 'cuantas_personas_tenes_a_cargo': {0: 2, 1: 0, 2: 5, 3: 0, 4: 3},
 'cantidad_de_personas_en_tu_organizacion': {0: 'De 201 a 500 personas',
  1: 'De 1001 a 2000 personas',
  2: 'De 2 a 10 personas',
  3: 'De 2 a 10 personas',
  4: 'De 201 a 500 personas'},
 'modalidad_de_trabajo': {0: '100% remoto',
  1: '100% remoto',
  2: '100% remoto',
  3: '100% remoto',
  4: '100% remoto'},
 'edad': {0: 29, 1: 25, 2: 50, 3: 41, 4: 47},
 'seniority': {0: '

In [20]:
from datetime import datetime
from typing import Any

import joblib
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from typing_extensions import Self

In [21]:
# Type alias for the array-like containers used in the fit/transform signatures below.
Array = pd.DataFrame | pd.Series | np.ndarray

In [22]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a fixed list of DataFrame columns.

    Parameters
    ----------
    columns : list[str]
        Names of the columns to keep, in output order.
    """

    def __init__(self, columns: list[str]):
        self.columns = columns

    def fit(self, X: pd.DataFrame, y: Array | None = None) -> Self:
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` restricted to ``self.columns``.

        Raises
        ------
        KeyError
            If any configured column is missing from ``X``.
        """
        return X[self.columns]

    def get_feature_names_out(self, input_features: Any = None) -> np.ndarray:
        """Return the selected column names.

        A Pipeline can only report its output feature names when every step
        implements this method, so it is required for feature-name
        introspection to work. ``input_features`` is accepted for API
        compatibility and ignored, because the output names are always
        ``self.columns``.
        """
        return np.asarray(self.columns, dtype=object)


In [23]:
class SeniorityTransformer(BaseEstimator, TransformerMixin):
    """Encode the seniority label as an ordinal integer.

    The input is treated as a single ``seniority`` column. Labels that are not
    keys of ``seniority_mapping`` come out as NaN.
    """

    # Ordinal rank assigned to each seniority label.
    seniority_mapping: dict[str, int] = {
        'Junior': 1,
        'Semi-Senior': 2,
        'Senior': 3,
        'Manager or Above': 4
    }

    def fit(self, X: Array, y: Array | None = None) -> Self:
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X: Array) -> pd.DataFrame:
        """Map seniority labels to their ordinal rank.

        Parameters
        ----------
        X : Array
            A single column of seniority labels. A DataFrame input must carry
            a column named ``seniority``.

        Returns
        -------
        pd.DataFrame
            One column, ``seniority_numeric``, with the same index as the
            frame built from ``X``.
        """
        # Building a new frame never mutates the caller's data, so no copy is needed.
        frame = pd.DataFrame(X, columns=['seniority'])
        encoded = frame['seniority'].map(self.seniority_mapping)
        return encoded.to_frame(name='seniority_numeric')

    def get_feature_names_out(self, input_features: Any = None) -> np.ndarray:
        """Return the name of the single output column.

        A Pipeline can only report its output feature names when every step
        implements this method. ``input_features`` is accepted for API
        compatibility and ignored.
        """
        return np.asarray(['seniority_numeric'], dtype=object)

In [24]:
class OrganizationSizeTransformer(BaseEstimator, TransformerMixin):
    """Convert the organisation-size bracket label into a representative head count.

    The input is treated as a single ``cantidad_de_personas_en_tu_organizacion``
    column. Labels that are not keys of ``size_mapping`` come out as NaN.
    """

    # Each bracket maps to (approximately) the midpoint of its range.
    size_mapping: dict[str, int] = {
        '1 (solamente yo)': 1,
        'De 2 a 10 personas': 6,  # midpoint of 2 and 10
        'De 11 a 50 personas': 30,
        'De 51 a 100 personas': 75,
        'De 101 a 200 personas': 150,
        'De 201 a 500 personas': 350,
        'De 501 a 1000 personas': 750,
        'De 1001 a 2000 personas': 1500,
        # The missing space in "2001a" is intentional: the key must match the
        # label exactly as it appears in the raw data.
        'De 2001a 5000 personas': 3500,
        'De 5001 a 10000 personas': 7500,
        # Open-ended bracket. NOTE(review): 15000 looks like an arbitrary stand-in — confirm.
        'Más de 10000 personas': 15000
    }

    def fit(self, X: Array, y: Array | None = None) -> Self:
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X: Array) -> pd.DataFrame:
        """Map organisation-size labels to their numeric value.

        Parameters
        ----------
        X : Array
            A single column of size labels. A DataFrame input must carry a
            column named ``cantidad_de_personas_en_tu_organizacion``.

        Returns
        -------
        pd.DataFrame
            One column, ``org_size_numeric``, with the same index as the
            frame built from ``X``.
        """
        source_column = 'cantidad_de_personas_en_tu_organizacion'
        # Building a new frame never mutates the caller's data, so no copy is needed.
        frame = pd.DataFrame(X, columns=[source_column])
        encoded = frame[source_column].map(self.size_mapping)
        return encoded.to_frame(name='org_size_numeric')

    def get_feature_names_out(self, input_features: Any = None) -> np.ndarray:
        """Return the name of the single output column.

        A Pipeline can only report its output feature names when every step
        implements this method. ``input_features`` is accepted for API
        compatibility and ignored.
        """
        return np.asarray(['org_size_numeric'], dtype=object)


In [28]:


class SalaryPredictionPipeline:
    """Complete pipeline for salary prediction.

    Wraps a scikit-learn ``Pipeline`` made of a ``ColumnTransformer``
    preprocessing step and, when a model is supplied, a final ``'model'`` step.
    """

    def __init__(self, model: Any | None = None):
        """Store the optional estimator and the column configuration.

        Parameters
        ----------
        model : Any | None
            Estimator appended as the final ``'model'`` step. When ``None``
            the pipeline only preprocesses.
        """
        self.model = model
        # Built lazily by build_pipeline() / fit(), or restored by load_pipeline().
        self.pipeline: Pipeline | None = None
        self.categorical_columns: list[str] = [
            'dedicacion', 'contrato', 'cantidad_de_personas_en_tu_organizacion',
            'modalidad_de_trabajo', 'seniority'
        ]
        self.numerical_columns: list[str] = [
            'anos_de_experiencia', 'antiguedad_en_la_empresa_actual',
            'anos_en_el_puesto_actual', 'cuantas_personas_tenes_a_cargo', 'edad', 'marvin_rol'
        ]
        self.target_column: str = 'salario'

    def build_pipeline(self) -> Pipeline:
        """Assemble the preprocessing (and optional model) pipeline.

        The result is stored on ``self.pipeline`` and also returned.
        Numerical columns are median-imputed and standardised; the basic
        categorical columns are mode-imputed and one-hot encoded; seniority
        and organisation size are mode-imputed and mapped to numbers by their
        dedicated transformers.
        """
        numerical_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.numerical_columns)),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_basic_cols = ['dedicacion', 'contrato', 'modalidad_de_trabajo']
        categorical_basic_pipeline = Pipeline([
            ('selector', DataFrameSelector(categorical_basic_cols)),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        seniority_pipeline = Pipeline([
            ('selector', DataFrameSelector(['seniority'])),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('transformer', SeniorityTransformer())
        ])
        org_size_pipeline = Pipeline([
            ('selector', DataFrameSelector(['cantidad_de_personas_en_tu_organizacion'])),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('transformer', OrganizationSizeTransformer())
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ('numerical', numerical_pipeline, self.numerical_columns),
                ('categorical_basic', categorical_basic_pipeline, categorical_basic_cols),
                ('seniority', seniority_pipeline, ['seniority']),
                ('org_size', org_size_pipeline, ['cantidad_de_personas_en_tu_organizacion'])
            ]
        )

        steps: list[tuple[str, Any]] = [('preprocessor', preprocessor)]
        # Compare against None instead of relying on truthiness: estimators that
        # define __len__ (ensembles, pipelines) can be falsy or even raise when
        # bool() is applied to them before fitting.
        if self.model is not None:
            steps.append(('model', self.model))
        self.pipeline = Pipeline(steps)

        return self.pipeline

    def fit(self, X: pd.DataFrame, y: Array) -> Self:
        """Fit the pipeline on the given data, building it first if needed."""
        if self.pipeline is None:
            self.build_pipeline()

        self.pipeline.fit(X, y)
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Apply the pipeline's transformations without predicting.

        Raises
        ------
        ValueError
            If the pipeline has not been built or loaded yet.
        """
        if self.pipeline is None:
            raise ValueError("La pipeline no ha sido construida o entrenada")

        # Pipeline.transform needs every step, including the last, to expose
        # transform(). A plain predictor as the final step does not, so in that
        # case only the steps before the model are applied.
        if self.model is not None and not hasattr(self.model, "transform"):
            return self.pipeline[:-1].transform(X)
        return self.pipeline.transform(X)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict with the trained model.

        Raises
        ------
        ValueError
            If there is no pipeline or no model attached.
        """
        if self.pipeline is None or self.model is None:
            raise ValueError("La pipeline completa con modelo no está disponible")

        return self.pipeline.predict(X)

    def save_pipeline(self, filepath: str | None = None):
        """Persist the pipeline to disk with joblib and return the path used.

        Parameters
        ----------
        filepath : str | None
            Destination file. When omitted, a timestamped file name in the
            current working directory is generated.

        Raises
        ------
        ValueError
            If there is no pipeline to save.
        """
        if self.pipeline is None:
            raise ValueError("No hay pipeline para guardar")

        if filepath is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = f"salary_prediction_pipeline_{timestamp}.joblib"

        directory = os.path.dirname(filepath)
        if directory:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(directory, exist_ok=True)

        joblib.dump(self.pipeline, filepath)
        return filepath

    @classmethod
    def load_pipeline(cls, filepath: str) -> Self:
        """Load a pipeline from a file written by ``save_pipeline``.

        The ``model`` attribute is restored from the final step when that step
        is named ``'model'``; otherwise it is left as ``None``.
        """
        pipeline_instance = cls()
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code. Only load files that come from a trusted source.
        pipeline_instance.pipeline = joblib.load(filepath)

        last_name, last_step = pipeline_instance.pipeline.steps[-1]
        pipeline_instance.model = last_step if last_name == 'model' else None

        return pipeline_instance

    def get_feature_names(self) -> list[str]:
        """Return the output feature names of the preprocessing step.

        Raises
        ------
        ValueError
            If the pipeline has not been built yet.
        """
        if self.pipeline is None:
            raise ValueError("La pipeline no ha sido construida")

        preprocessor = self.pipeline.named_steps['preprocessor']
        # get_feature_names_out() returns an ndarray; convert to match the annotation.
        return list(preprocessor.get_feature_names_out())


In [None]:
# Separate the salary target from the feature columns.
y = df['salario']
X = df.drop(columns=['salario'])

# No model is supplied, so the built pipeline holds only the preprocessing
# step; fit it on the features and keep the transformed matrix.
pipeline = SalaryPredictionPipeline()
transformed_data = (
    pipeline
    .build_pipeline()
    .fit_transform(X)
)