**RETO INGENIERÍA DE CARACTERÍSTICAS**


In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [None]:
# Relative path of the raw employee CSV; it is passed to pd.read_csv in the loading cell.
Empleados = "empleadosRETO.csv"

**Leyendo archivo cargado de Empleados**

In [None]:
# Load the raw employee table and show a quick overview of it:
# dimensions, the first rows, and the full list of column names.
EmpleadosAttrition = pd.read_csv(Empleados)

filas_columnas = EmpleadosAttrition.shape
print("Shape:", filas_columnas)
display(EmpleadosAttrition.head())
print("\nColumnas:\n", list(EmpleadosAttrition.columns))

Shape: (400, 30)


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition
0,50,Travel_Rarely,Research & Development,1 km,2,Medical,1,997,4,Male,...,22,4,3,80,32,1,2,4,1,No
1,36,Travel_Rarely,Research & Development,6 km,2,Medical,1,178,2,Male,...,20,4,4,80,7,0,3,2,0,No
2,21,Travel_Rarely,Sales,7 km,1,Marketing,1,1780,2,Male,...,13,3,2,80,1,3,3,0,1,Yes
3,52,Travel_Rarely,Research & Development,7 km,4,Life Sciences,1,1118,2,Male,...,19,3,4,80,18,4,3,6,4,No
4,33,Travel_Rarely,Research & Development,15 km,1,Medical,1,582,2,Male,...,12,3,4,80,15,2,4,6,7,Yes



Columnas:
 ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'HiringDate', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'Attrition']


**Elimina columnas irrelevantes**

In [None]:
# Strip stray whitespace from the header names so the labels below match exactly.
EmpleadosAttrition.columns = EmpleadosAttrition.columns.str.strip()

# Columns this notebook treats as irrelevant for the analysis.
cols_drop_basicas = [
    "EmployeeCount",
    "EmployeeNumber",
    "Over18",
    "StandardHours",
]
EmpleadosAttrition = EmpleadosAttrition.drop(cols_drop_basicas, axis="columns")

print("Shape tras eliminación básica:", EmpleadosAttrition.shape)

Shape tras eliminación básica: (400, 26)


**Crea Year (año de contratación) desde HiringDate**

In [None]:
# Parse the hiring-date strings into timestamps.
# The dates shown in the data are month/day/year, so the format is stated
# explicitly instead of relying on `infer_datetime_format`, which is deprecated
# in pandas 2.x and only produces a warning there.
# errors="coerce" turns values that do not match the format into NaT instead of raising.
EmpleadosAttrition["HiringDate_dt"] = pd.to_datetime(
    EmpleadosAttrition["HiringDate"],
    format="%m/%d/%Y",
    errors="coerce",
)

# Nullable Int64 keeps the year an integer even when the date could not be parsed.
EmpleadosAttrition["Year"] = EmpleadosAttrition["HiringDate_dt"].dt.year.astype("Int64")
print(EmpleadosAttrition[["HiringDate", "HiringDate_dt", "Year"]].head())

   HiringDate HiringDate_dt  Year
0  06/06/2013    2013-06-06  2013
1  12/25/2015    2015-12-25  2015
2   2/14/2017    2017-02-14  2017
3   7/29/2010    2010-07-29  2010
4  10/07/2011    2011-10-07  2011


  EmpleadosAttrition["HiringDate_dt"] = pd.to_datetime(


**Crea YearsAtCompany (años en la empresa hasta 2018)**

In [None]:
# Tenure in whole years, measured against the fixed reference year 2018.
# Rows without a hiring year stay missing thanks to the nullable Int64 dtype.
antiguedad = 2018 - EmpleadosAttrition["Year"]
EmpleadosAttrition["YearsAtCompany"] = antiguedad.astype("Int64")

resumen_antiguedad = EmpleadosAttrition[["Year", "YearsAtCompany"]].describe(include="all")
print(resumen_antiguedad)

              Year  YearsAtCompany
count        399.0           399.0
mean   2010.744361        7.255639
std       6.010026        6.010026
min         1985.0             0.0
25%         2008.0             3.0
50%         2013.0             5.0
75%         2015.0            10.0
max         2018.0            33.0


**Renombra DistanceFromHome a DistanceFromHome_km y crea DistanceFromHome entera**

In [None]:
# Keep the original text column (e.g. "15 km") under a new name ...
EmpleadosAttrition = EmpleadosAttrition.rename(
    {"DistanceFromHome": "DistanceFromHome_km"}, axis="columns"
)

# ... and rebuild DistanceFromHome as an integer from the first run of digits.
# Going through float first lets unmatched values become missing before the
# cast to nullable Int64.
distancia_texto = EmpleadosAttrition["DistanceFromHome_km"].astype(str)
distancia_digitos = distancia_texto.str.extract(r"(\d+)", expand=False)
EmpleadosAttrition["DistanceFromHome"] = distancia_digitos.astype(float).astype("Int64")

print(EmpleadosAttrition[["DistanceFromHome_km", "DistanceFromHome"]].head())

  DistanceFromHome_km  DistanceFromHome
0                1 km                 1
1                6 km                 6
2                7 km                 7
3                7 km                 7
4               15 km                15


**Borra Year, HiringDate, HiringDate_dt y DistanceFromHome_km (ya no son útiles)**

In [None]:
# Snapshot of the frame before removing the helper columns.
print(EmpleadosAttrition.shape)
print(EmpleadosAttrition.head(10))

# Source and intermediate columns that the derived features have replaced.
columnas_auxiliares = ["Year", "HiringDate", "DistanceFromHome_km", "HiringDate_dt"]
EmpleadosAttrition = EmpleadosAttrition.drop(columnas_auxiliares, axis="columns")
print("Shape tras borrar derivadas no útiles:", EmpleadosAttrition.shape)

(400, 30)
   Age     BusinessTravel              Department DistanceFromHome_km  \
0   50      Travel_Rarely  Research & Development                1 km   
1   36      Travel_Rarely  Research & Development                6 km   
2   21      Travel_Rarely                   Sales                7 km   
3   52      Travel_Rarely  Research & Development                7 km   
4   33      Travel_Rarely  Research & Development               15 km   
5   47      Travel_Rarely  Research & Development                4 km   
6   22      Travel_Rarely  Research & Development                1 km   
7   33         Non-Travel                   Sales               16 km   
8   40      Travel_Rarely                   Sales                4 km   
9   27  Travel_Frequently                   Sales                2 km   

   Education EducationField  EnvironmentSatisfaction  Gender  JobInvolvement  \
0          2        Medical                        4    Male               3   
1          2        Medica

**Tabla informativa: sueldo promedio por departamento**

In [None]:
# Average MonthlyIncome per Department, returned as a flat two-column table.
# This runs before MonthlyIncome is rescaled, so the values are in original units.
ingreso_por_depto = EmpleadosAttrition.groupby("Department")["MonthlyIncome"].mean()
SueldoPromedioDepto = ingreso_por_depto.reset_index()

# Both names refer to the same table.
SueldoPromedio = SueldoPromedioDepto
display(SueldoPromedio)

Unnamed: 0,Department,MonthlyIncome
0,Human Resources,6239.888889
1,Research & Development,6804.149813
2,Sales,7188.25


**Escala MonthlyIncome entre 0 y 1 (Min-Max)**

In [None]:
scaler = MinMaxScaler()
EmpleadosAttrition["MonthlyIncome"] = scaler.fit_transform(EmpleadosAttrition[["MonthlyIncome"]])

print(EmpleadosAttrition["MonthlyIncome"].describe())

count    400.000000
mean       0.311196
std        0.258308
min        0.000000
25%        0.106939
50%        0.222711
75%        0.462745
max        1.000000
Name: MonthlyIncome, dtype: float64


**Convierte variables categóricas a numéricas (One-Hot / binaria)**

In [None]:
# 1) Encode the target as 0/1.
# Any label other than "Yes"/"No" (including a missing value, or the 0/1 left
# behind when this cell is run twice) maps to NaN. Fail early with the offending
# labels instead of letting the integer cast raise an opaque error.
attrition_binaria = EmpleadosAttrition["Attrition"].map({"Yes": 1, "No": 0})
sin_mapear = attrition_binaria.isna()
if sin_mapear.any():
    etiquetas_invalidas = EmpleadosAttrition.loc[sin_mapear, "Attrition"].unique().tolist()
    raise ValueError(
        f"Valores de Attrition no reconocidos (se esperaba 'Yes'/'No'): {etiquetas_invalidas}"
    )
EmpleadosAttrition["Attrition"] = attrition_binaria.astype(int)

# 2) One-hot encode every remaining object-typed (categorical) column.
# drop_first=False keeps one indicator column per category level.
cat_cols = EmpleadosAttrition.select_dtypes(include=["object"]).columns.tolist()
print("Categóricas a codificar:", cat_cols)

EmpleadosAttrition = pd.get_dummies(EmpleadosAttrition, columns=cat_cols, drop_first=False)

print("Shape tras one-hot:", EmpleadosAttrition.shape)


   Age     BusinessTravel              Department  Education EducationField  \
0   50      Travel_Rarely  Research & Development          2        Medical   
1   36      Travel_Rarely  Research & Development          2        Medical   
2   21      Travel_Rarely                   Sales          1      Marketing   
3   52      Travel_Rarely  Research & Development          4  Life Sciences   
4   33      Travel_Rarely  Research & Development          1        Medical   
5   47      Travel_Rarely  Research & Development          3  Life Sciences   
6   22      Travel_Rarely  Research & Development          2  Life Sciences   
7   33         Non-Travel                   Sales          3  Life Sciences   
8   40      Travel_Rarely                   Sales          4      Marketing   
9   27  Travel_Frequently                   Sales          1  Life Sciences   

   EnvironmentSatisfaction  Gender  JobInvolvement  JobLevel  \
0                        4    Male               3         4   
1 

**Correlación lineal de cada variable respecto a Attrition**

In [None]:
# Pearson correlation of every numeric column with the target,
# ordered from most positive to most negative.
matriz_correlacion = EmpleadosAttrition.corr(numeric_only=True)
corr_con_attrition = matriz_correlacion["Attrition"].sort_values(ascending=False)
display(corr_con_attrition)

Unnamed: 0,Attrition
Attrition,1.0
OverTime_Yes,0.324777
MaritalStatus_Single,0.205849
JobRole_Sales Representative,0.191294
EducationField_Technical Degree,0.129104
JobRole_Laboratory Technician,0.125264
Department_Sales,0.066116
DistanceFromHome,0.052732
EducationField_Human Resources,0.043404
BusinessTravel_Travel_Rarely,0.042755


**Selecciona variables con correlación absoluta >= 0.1 y crea EmpleadosAttritionFinal**

In [None]:
# Minimum absolute correlation with Attrition for a column to be kept.
umbral = 0.1

# The filter uses the absolute value, so strong negative correlations qualify too.
supera_umbral = corr_con_attrition.abs() >= umbral
cols_seleccionadas = corr_con_attrition[supera_umbral].index.tolist()

# Make sure the target itself is always part of the selection.
if "Attrition" not in cols_seleccionadas:
    cols_seleccionadas.append("Attrition")

# Independent copy, so later edits do not touch the full frame.
EmpleadosAttritionFinal = EmpleadosAttrition.loc[:, cols_seleccionadas].copy()

print("Columnas seleccionadas:", len(cols_seleccionadas))
print("Shape EmpleadosAttritionFinal:", EmpleadosAttritionFinal.shape)
print(EmpleadosAttrition.shape)


Columnas seleccionadas: 20
Shape EmpleadosAttritionFinal: (400, 20)
(400, 47)


**PCA sobre EmpleadosAttritionFinal (numpy array)**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Feature matrix for the PCA: every selected column except the target.
# These definitions used to be commented out below their first use, so the
# cell raised NameError on a fresh kernel (Restart & Run All).
X_final = EmpleadosAttritionFinal.drop(columns=["Attrition"])
y_final = EmpleadosAttritionFinal["Attrition"].values

# PCA does not accept missing values, so fill any gaps with the column median first.
imputer = SimpleImputer(strategy="median")
X_final_imp = imputer.fit_transform(X_final)

# With no n_components given, PCA keeps every component; the number
# needed for the variance target is chosen in the next cell.
pca = PCA()
EmpleadosAttritionPCA = pca.fit_transform(X_final_imp)  # numpy array

print("Shape PCA:", EmpleadosAttritionPCA.shape)
print("Varianza explicada acumulada (primeros 10):", np.cumsum(pca.explained_variance_ratio_)[:10])

Shape PCA: (400, 19)
Varianza explicada acumulada (primeros 10): [0.63385263 0.87519676 0.9539784  0.97497033 0.98148841 0.98789897
 0.99088165 0.9936744  0.99569756 0.99700986]


**Agrega el mínimo número de componentes para explicar 80% de varianza (C0, C1, …)**

In [None]:
# Smallest number of leading components whose cumulative explained variance
# reaches 80%. argmax returns the first True position (0-based), hence the +1.
var_acum = np.cumsum(pca.explained_variance_ratio_)
k = int(np.argmax(var_acum >= 0.80) + 1)  # minimum k for >= 80%

print("Componentes necesarios para >=80%:", k, " (var_acum =", var_acum[k-1], ")")

# Track the generated component names explicitly. Detecting them with
# startswith("C") would also capture any ordinary feature whose name begins
# with "C" and move it in among the components.
components = [f"C{i}" for i in range(k)]
EmpleadosAttritionFinal = EmpleadosAttritionFinal.assign(
    **{nombre: EmpleadosAttritionPCA[:, i] for i, nombre in enumerate(components)}
)

# Final column order: target first (when present), then the original
# features, then the principal components.
cols = EmpleadosAttritionFinal.columns.tolist()
cols_sin_components = [c for c in cols if c not in components and c != "Attrition"]
if "Attrition" in cols:
    nuevo_orden = ["Attrition"] + cols_sin_components + components
else:
    nuevo_orden = cols_sin_components + components

EmpleadosAttritionFinal = EmpleadosAttritionFinal[nuevo_orden]
print("Shape final:", EmpleadosAttritionFinal.shape)


Componentes necesarios para >=80%: 2  (var_acum = 0.8751967579946857 )
Shape final: (400, 22)


**Guarda el CSV final EmpleadosAttritionFinal.csv (sin índice)**

In [None]:
# Write the final feature table to disk; index=False leaves the row index out of the file.
archivo_salida = "EmpleadosAttritionFinal.csv"
EmpleadosAttritionFinal.to_csv(archivo_salida, index=False)
print("Archivo generado: EmpleadosAttritionFinal.csv")

Archivo generado: EmpleadosAttritionFinal.csv
