In [35]:
# Importamos las librerías que necesitamos

# Tratamiento de datos
import pandas as pd
import numpy as np

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Ocultamos las advertencias para evitar mensajes innecesarios
import warnings
warnings.filterwarnings("ignore")

# Configuración
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [36]:
# Lo primero que hacemos es cargar el dataframe que vamos a usar
df = pd.read_csv("spaces_hr_raw_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30.0,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,,1971,"199990,00$",,,1
2,2,42,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,"41669,33$",1,,No,11,30,4,,0,220.0,3,,22,,11,15,,1981,"192320,00$",ManaGER - Research & Development,,1
3,3,47,No,travel_rarely,1771.404762,,2,4,Medical,1,4,1,1,,3,4,ReseArCH DIrECtOr,3,Married,"14307,50$","37199,50$",3,Y,,19,30,2,Full Time,2,,2,,20,,5,6,"14307,50$",1976,"171690,00$",,,False
4,4,46,No,,1582.771346,,3,3,Technical Degree,1,5,1,1,,4,4,sAleS EXECUtIve,1,Divorced,"12783,92$","33238,20$",2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,"12783,92$",1977,,,,0


In [37]:
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'over18', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager',
       'sameasmonthlyincome', 'datebirth', 'salary', 'roledepartament',
       'numberchildren', 'remotework'],
      dtype='object')

In [38]:
df = df.drop(columns=['numberchildren', 'over18', 'yearsincurrentrole'])

In [39]:
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearssincelastpromotion',
       'yearswithcurrmanager', 'sameasmonthlyincome', 'datebirth', 'salary',
       'roledepartament', 'remotework'],
      dtype='object')

In [40]:
def exploracion_basica(df):
    print("=== EXPLORACIÓN BÁSICA DEL DATASET ===")
    print("\nPrimeras filas:")
    print(df.head())
    print("\nÚltimas filas:")
    print(df.tail())
    print("\nNúmero de filas y columnas:")
    print(df.shape)
    print("\nInformación general del DataFrame:")
    print(df.info())
    print("\nEstadísticas descriptivas:")
    print(df.describe())
    print("\nColumnas:")
    print(list(df.columns))

In [41]:
exploracion_basica(df)

=== EXPLORACIÓN BÁSICA DEL DATASET ===

Primeras filas:
   Unnamed: 0 age attrition businesstravel    dailyrate  \
0           0  51        No            NaN  2015.722222   
1           1  52        No            NaN  2063.388889   
2           2  42        No  travel_rarely  1984.253968   
3           3  47        No  travel_rarely  1771.404762   
4           4  46        No            NaN  1582.771346   

                 department  distancefromhome  education    educationfield  \
0                       NaN                 6          3               NaN   
1                       NaN                 1          4     Life Sciences   
2   Research & Development                  4          2  Technical Degree   
3                       NaN                 2          4           Medical   
4                       NaN                 3          3  Technical Degree   

   employeecount  employeenumber  environmentsatisfaction  gender  hourlyrate  \
0              1               1       

In [42]:
# Variables con valores nulos
def nulls(df,count=0,share=0):
    
    nulls = df.isnull().sum()
    nulls_share = df.isnull().sum()/df.shape[0]*100
    with_nulls_share = nulls_share[nulls_share > 0]
    
    if count == 1:
        print('Count of nulls')
        with_nulls = nulls[nulls > 0]
        print (with_nulls.sort_values(ascending=False))
        print('-----------------------------------')
    
    if share == 1:
        print('share of nulls')
        print (with_nulls_share.sort_values(ascending=False))
    
    nulls_list = with_nulls_share.to_frame(name='perc_nulos').reset_index().rename(columns={'index': 'var'})
    
    return nulls_list

In [43]:
nulls(df,1,1)

Count of nulls
department             1366
roledepartament        1366
hourlyrate             1267
businesstravel          801
educationfield          774
overtime                696
maritalstatus           675
totalworkingyears       549
monthlyincome           489
sameasmonthlyincome     489
standardhours           351
salary                  285
performancerating       200
worklifebalance         114
dtype: int64
-----------------------------------
share of nulls
department             81.406436
roledepartament        81.406436
hourlyrate             75.506555
businesstravel         47.735399
educationfield         46.126341
overtime               41.477950
maritalstatus          40.226460
totalworkingyears      32.717521
monthlyincome          29.141836
sameasmonthlyincome    29.141836
standardhours          20.917759
salary                 16.984505
performancerating      11.918951
worklifebalance         6.793802
dtype: float64


Unnamed: 0,var,perc_nulos
0,businesstravel,47.735399
1,department,81.406436
2,educationfield,46.126341
3,hourlyrate,75.506555
4,maritalstatus,40.22646
5,monthlyincome,29.141836
6,overtime,41.47795
7,performancerating,11.918951
8,standardhours,20.917759
9,totalworkingyears,32.717521


In [44]:
def detectar_duplicados(df):
    numero_duplicados = df.duplicated().sum()
    print(f"Filas duplicadas completas en el dataset: {numero_duplicados}")
    if numero_duplicados > 0:
        print("Aquí tienes las filas duplicadas:")
        print(df[df.duplicated()])

In [45]:
detectar_duplicados(df)

Filas duplicadas completas en el dataset: 0


### Exploramos columna por columna

In [46]:
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearssincelastpromotion',
       'yearswithcurrmanager', 'sameasmonthlyincome', 'datebirth', 'salary',
       'roledepartament', 'remotework'],
      dtype='object')

### 'age'

In [47]:
df['age'].value_counts()

age
35              88
31              88
34              86
29              82
36              79
32              66
30              65
38              64
33              63
40              60
28              54
37              54
27              54
45              50
42              50
41              48
39              46
26              46
43              42
46              38
44              34
50              34
47              30
24              29
25              28
49              26
55              25
51              22
48              22
53              20
54              20
52              18
56              18
21              17
22              17
58              16
23              15
59              11
20              11
18               9
19               9
57               6
60               5
thirty-two       2
twenty-four      2
thirty           1
fifty-eight      1
fifty-two        1
twenty-six       1
fifty-five       1
thirty-seven     1
thirty-six       1
forty-se

In [48]:
# transformamos las edades escritas en palabras a números
words_to_numbers = {
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'twenty-one': 21,
    'twenty-two': 22,
    'twenty-three': 23,
    'twenty-four': 24,
    'twenty-five': 25,
    'twenty-six': 26,
    'twenty-seven': 27,
    'twenty-eight': 28,
    'twenty-nine': 29,
    'thirty': 30,
    'thirty-one': 31,
    'thirty-two': 32,
    'thirty-three': 33,
    'thirty-four': 34,
    'thirty-five': 35,
    'thirty-six': 36,
    'thirty-seven': 37,
    'forty-seven': 47,
    'fifty-two': 52,
    'fifty-five': 55,
    'fifty-eight': 58
}


In [49]:
# reemplazamos palabras por números, y convertimos la columna a tipo numérico
df['age'] = df['age'].replace(words_to_numbers)
df['age'] = pd.to_numeric(df['age'], errors='coerce')

print(df['age'].dtype)
print(df['age'].value_counts().sort_index())

int64
age
18     9
19     9
20    11
21    17
22    17
23    15
24    31
25    28
26    47
27    54
28    54
29    82
30    66
31    89
32    68
33    63
34    86
35    88
36    80
37    55
38    64
39    46
40    60
41    48
42    50
43    42
44    34
45    50
46    38
47    31
48    22
49    26
50    34
51    22
52    19
53    20
54    20
55    26
56    18
57     6
58    17
59    11
60     5
Name: count, dtype: int64


### 'attrition'

In [50]:
df['attrition'].value_counts()

attrition
No     1406
Yes     272
Name: count, dtype: int64

In [51]:
df['attrition'] = df['attrition'].str.lower()
print(df['attrition'].value_counts())

attrition
no     1406
yes     272
Name: count, dtype: int64


### 'businesstravel'

In [52]:
df['businesstravel'].value_counts()

businesstravel
travel_rarely        616
travel_frequently    168
non-travel            93
Name: count, dtype: int64

In [53]:
df['businesstravel'] = df['businesstravel'].str.replace(' ', '_', regex=False)
df['businesstravel'] = df['businesstravel'].str.replace('-', '_', regex=False).str.lower()
print(df['businesstravel'].value_counts())

businesstravel
travel_rarely        616
travel_frequently    168
non_travel            93
Name: count, dtype: int64


### 'dailyrate'

In [54]:
df['dailyrate'].value_counts()

dailyrate
556.256661     326
290.035510     308
1032.487286    150
1582.771346     55
1973.984127     38
              ... 
320.047619       1
470.063492       1
436.428571       1
501.325397       1
2004.785714      1
Name: count, Length: 673, dtype: int64

In [55]:
# reducimos a dos decimales los valores de la columna 'dailyrate'
df['dailyrate'] = df['dailyrate'].round(2)
print(df['dailyrate'].value_counts())

dailyrate
556.26     326
290.04     308
1032.49    150
1582.77     55
1973.98     38
          ... 
320.05       1
470.06       1
436.43       1
501.33       1
2004.79      1
Name: count, Length: 673, dtype: int64


### 'department'

In [56]:
df['department'].value_counts()

department
Research & Development     203
Sales                       93
Human Resources             16
Name: count, dtype: int64

In [57]:
df['department'].unique()

array([nan, ' Research & Development ', ' Sales ', ' Human Resources '],
      dtype=object)

In [58]:
df['department'] = df['department'].apply(
    lambda x: x.strip().replace(' ', '_').lower() if pd.notna(x) else x
)

print(df['department'].value_counts())

department
research_&_development    203
sales                      93
human_resources            16
Name: count, dtype: int64


In [59]:
df['department'].isnull().sum()

1366

In [60]:
df['department'].unique()

array([nan, 'research_&_development', 'sales', 'human_resources'],
      dtype=object)

### 'distancefromhome'

In [61]:
df['distancefromhome'].value_counts()

distancefromhome
 2     228
 1     209
 9      89
 10     86
 8      85
      ... 
-21      2
-34      2
-43      2
-39      1
-40      1
Name: count, Length: 69, dtype: int64

In [62]:
# pasamos la distancia a positivos, ya que hay valores negativos
df['distancefromhome'] = df['distancefromhome'].abs()
print(df['distancefromhome'].value_counts())

distancefromhome
2     228
1     209
10     92
9      89
8      85
7      82
3      79
5      65
4      64
6      63
24     37
16     37
12     33
13     33
11     33
18     32
15     31
25     31
26     30
29     30
23     30
14     28
19     23
20     23
17     22
21     22
22     21
28     20
27     18
47      8
35      8
38      7
31      6
36      6
42      6
45      6
48      5
37      5
46      5
32      4
41      4
30      4
33      3
44      3
34      2
49      2
43      2
39      1
40      1
Name: count, dtype: int64


### 'education'

In [63]:
df['education'].value_counts()

education
3    649
4    461
2    322
1    186
5     60
Name: count, dtype: int64

### 'educationfield'

In [64]:
df['educationfield'].value_counts()

educationfield
Life Sciences       367
Medical             286
Marketing           106
Technical Degree     70
Other                63
Human Resources      12
Name: count, dtype: int64

In [65]:
df['educationfield'] = (
    df['educationfield']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['educationfield'].value_counts())

educationfield
nan                 774
life_sciences       367
medical             286
marketing           106
technical_degree     70
other                63
human_resources      12
Name: count, dtype: int64


### 'employeecount'

In [66]:
df['employeecount'].value_counts()

employeecount
1    1678
Name: count, dtype: int64

In [67]:
# no aporta información útil, la eliminamos
df = df.drop(columns='employeecount')

### 'employeenumber'

In [68]:
df['employeenumber'].value_counts()

employeenumber
300     2
159     2
271     2
1158    2
191     2
       ..
547     1
546     1
545     1
544     1
1614    1
Name: count, Length: 1614, dtype: int64

In [70]:
df = df.drop(columns='Unnamed: 0')

In [71]:
df[df.duplicated(subset='employeenumber', keep=False)].sort_values('employeenumber')
# si sacamos la columna Unnamed: 0 podemos identificar las filas duplicadas - creo
    # voy a chequear -


Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,remotework
8,41,no,,1712.18,,2,5,,9,2,1,,3,4,mANAGEr,1,Married,"13829,17$","35955,83$",7,No,16,30,2,Full Time,1,220,2,30,18,11,8,"13829,17$",1982,"165950,00$",,True
1656,41,no,,1712.18,,2,5,,9,2,1,,3,4,mANAGEr,1,Married,"13829,17$","35955,83$",7,No,16,30,2,Full Time,1,220,2,30,18,11,8,"13829,17$",1982,"165950,00$",,True
60,36,no,,610.17,,5,2,,61,4,0,,3,2,lAboratORy TeChNiCiaN,2,Single,"4928,33$","12813,67$",8,No,16,30,4,Full Time,0,160,3,40,13,3,7,"4928,33$",1987,"59140,00$",,1
1652,36,no,,610.17,,5,2,,61,4,0,,3,2,lAboratORy TeChNiCiaN,2,Single,"4928,33$","12813,67$",8,No,16,30,4,Full Time,0,160,3,40,13,3,7,"4928,33$",1987,"59140,00$",,1
1676,47,no,travel_rarely,1032.49,,4,3,life_sciences,76,3,1,,2,3,maNufACTURING DIREctOr,2,Divorced,"8339,32$","21682,23$",8,Yes,12,,3,Part Time,1,,4,30,22,14,10,"8339,32$",1976,"100071,84$",,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,48,no,travel_rarely,417.96,,2,5,,1568,2,1,,3,2,sALES EXEcuTIVe,4,Married,"3375,83$","8777,17$",2,No,14,30,1,Part Time,1,140,2,30,9,6,7,"3375,83$",1975,"40510,00$",,True
1657,45,no,travel_rarely,495.75,,2,3,other,1569,4,0,,3,2,sAles executiVe,2,,,"10410,83$",0,No,19,30,2,Part Time,1,,3,40,8,3,7,,1978,"48050,00$",,False
1568,45,no,travel_rarely,495.75,,2,3,other,1569,4,0,,3,2,sAles executiVe,2,,,"10410,83$",0,No,19,30,2,Part Time,1,,3,40,8,3,7,,1978,"48050,00$",,False
1671,31,no,,556.26,,12,3,medical,1608,4,1,,3,2,HEaltHCarE REPreSENtAtIve,4,,,"11681,39$",0,,11,30,3,Part Time,2,100,2,10,9,8,5,,1992,"53914,11$",,True


In [73]:
df = df.drop_duplicates()

In [74]:
df['employeenumber'].is_unique

True

In [None]:
df[df.duplicated(subset='employeenumber', keep=False)].sort_values('employeenumber')
#obs . age = más de 100 borrarlos o emparejarlos con la edad menor de la base 
# y luego buscar los duplicados
#age=unnamed

In [None]:
print(f"Número de employeenumber duplicados: {df.duplicated(subset='employeenumber').sum()}")


### 'environmentsatisfaction'

In [None]:
df['environmentsatisfaction'].value_counts()

In [None]:
# detectamos valores no válidos, deberían ser del 1 al 4

df['environmentsatisfaction'] = df['environmentsatisfaction'].apply(
    lambda x: np.nan if x>4 else x
)

print(df['environmentsatisfaction'].value_counts())
# puede ser que acá esté el problema - cambiamos los que no están en el rango por nan, 
# no los sacamos de la muestra 

In [None]:
df['department'].isnull().sum()

In [None]:
# sustituimos los valores numéricos por texto para una mejor comprensión
satisfaction_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

df['environmentsatisfaction'] = df['environmentsatisfaction'].replace(satisfaction_map)
print(df['environmentsatisfaction'].value_counts())

In [None]:
df['environmentsatisfaction'] = (
    df['environmentsatisfaction']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['environmentsatisfaction'].value_counts())

### 'gender'

In [None]:
df['gender'].value_counts()

In [None]:
# típicamente, 0 es femenino y 1 es masculino. Aún así, comprobamos con la media de dailyrate, que suele ser más alta en hombres que en mujeres
df.groupby('gender')['dailyrate'].mean()


In [None]:
gender_map = {
    0: 'Female',
    1: 'Male'
}

df['gender'] = df['gender'].replace(gender_map)
df['gender'].value_counts()

In [None]:
df['gender'] = df['gender'].astype(str).str.lower()
print(df['gender'].value_counts())

### 'hourlyrate'

In [None]:
df['hourlyrate'].value_counts()

In [None]:
df['hourlyrate'] = df['hourlyrate'].round(2)

### 'jobinvolvement'

In [None]:
df['jobinvolvement'].value_counts()

In [None]:
# sustituimos los valores numéricos por texto para una mejor comprensión
jobinvolvement_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

df['jobinvolvement'] = df['jobinvolvement'].replace(jobinvolvement_map)
print(df['jobinvolvement'].value_counts())

In [None]:
df['jobinvolvement'] = (
    df['jobinvolvement']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['jobinvolvement'].value_counts())

### 'joblevel'

In [None]:
df['joblevel'].value_counts()

In [None]:
# comprobamos la media de dailyrate por joblevel, para establecer una relación entre el nivel de trabajo y la remuneración
df.groupby('joblevel')['dailyrate'].mean()


In [None]:
joblevel_map = {
    1: 'Entry Level',
    2: 'Intermediate',
    3: 'Senior',
    4: 'Manager',
    5: 'Executive'
}
df['joblevel'] = df['joblevel'].replace(joblevel_map)
print(df['joblevel'].value_counts())

In [None]:
df['joblevel'] = (
    df['joblevel']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['joblevel'].value_counts())

### 'jobrole'

In [None]:
df['jobrole'].value_counts()

In [None]:
df['jobrole'] = df['jobrole'].str.strip().str.lower().str.title()
df['jobrole'].value_counts()

In [None]:
df['jobrole'] = (
    df['jobrole']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['jobrole'].value_counts())

### 'jobsatisfaction'

In [None]:
df['jobsatisfaction'].value_counts()

In [None]:
satisfaction_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

df['jobsatisfaction'] = df['jobsatisfaction'].replace(satisfaction_map)
print(df['jobsatisfaction'].value_counts())

In [None]:
df['jobsatisfaction'] = (
    df['jobsatisfaction']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['jobsatisfaction'].value_counts())

### 'maritalstatus'

In [None]:
df['maritalstatus'].value_counts()

In [None]:
df['maritalstatus'] = df['maritalstatus'].str.strip().str.lower()
print(df['maritalstatus'].value_counts())

### 'monthlyincome'

In [None]:
df['monthlyincome'].value_counts()

In [None]:
# transformamos la columna 'monthlyincome' para que sea numérica, eliminando el símbolo de dólar y la coma
df['monthlyincome'] = df['monthlyincome'].str.replace('$', '', regex=False)
df['monthlyincome'] = df['monthlyincome'].str.replace(',', '.', regex=False)
df['monthlyincome'] = pd.to_numeric(df['monthlyincome'], errors='coerce')

In [None]:
print(df['monthlyincome'].dtype)
df['monthlyincome'].describe()
df['monthlyincome'].value_counts()

### 'monthlyrate'

In [None]:
df['monthlyrate'].value_counts()

In [None]:
# pasamos a str para corregir símbolos y puntos, luego convertimos a numérico
df['monthlyrate'] = df['monthlyrate'].astype(str)
df['monthlyrate'] = df['monthlyrate'].str.replace('$', '', regex=False)
df['monthlyrate'] = df['monthlyrate'].str.replace(',', '.', regex=False)
df['monthlyrate'] = pd.to_numeric(df['monthlyrate'], errors='coerce')
print(df['monthlyrate'].dtype)

In [None]:
df['monthlyrate'].value_counts()

### 'numcompaniesworked'

In [None]:
df['numcompaniesworked'].value_counts()

### 'overtime'

In [None]:
df['overtime'].value_counts()

In [None]:
df['overtime'] = df['overtime'].str.strip().str.lower()
print(df['overtime'].value_counts())

### 'percentsalaryhike'

In [None]:
# porcentaje de aumento salarial
df['percentsalaryhike'].value_counts()

### 'performancerating'

In [None]:
df['performancerating'].value_counts()

In [None]:
# puede servir para descartar una hipótesis, por ejemplo: "¿El desempeño influye en que una persona se vaya o esté satisfecha?”

### 'standardhours'

In [None]:
df['standardhours'].value_counts()

In [None]:
df['standardhours'] = (
    df['standardhours']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['standardhours'].value_counts())

### 'relationshipsatisfaction'

In [None]:
rel_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}
df['relationshipsatisfaction'] = df['relationshipsatisfaction'].replace(rel_map)
print(df['relationshipsatisfaction'].value_counts())

In [None]:
df['relationshipsatisfaction'] = (
    df['relationshipsatisfaction']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['relationshipsatisfaction'].value_counts())

### 'stockoptionlevel'

In [None]:
df[ 'stockoptionlevel'].value_counts()

In [None]:
# Esta variable indica el nivel de opciones sobre acciones que tiene asignadas cada persona. Es decir, si reciben parte de su compensación en acciones de la empresa. Los niveles más altos suelen asignarse a empleados de mayor nivel o con mejor rendimiento.
# 0: No stock options / 1-3: Diferentes niveles de acciones asignadas

### 'totalworkingyears'

In [None]:
df['totalworkingyears'].value_counts()

In [None]:
df['totalworkingyears'].isna().sum()

In [None]:
df['totalworkingyears'] = (
    df['totalworkingyears']
    .astype(str)
    .replace('<NA>', np.nan)                   
    .str.replace(',', '.', regex=False)        
    .astype(float)
    .astype('Int64')                           
)
print(df['totalworkingyears'].dtype)
print(df['totalworkingyears'].value_counts())

### 'trainingtimeslastyear'

In [None]:
df['trainingtimeslastyear'].value_counts()

### 'worklifebalance'

In [None]:
df['worklifebalance'].value_counts()

In [None]:
df['worklifebalance'].isna().sum()

In [None]:
df['worklifebalance'] = df['worklifebalance'].str.replace(',', '.').astype(float)

df['worklifebalance'] = df['worklifebalance'].replace({
    1.0: 'very_low',
    2.0: 'low',
    3.0: 'good',
    4.0: 'excellent'
})
print(df['worklifebalance'].value_counts())

### 'yearsatcompany'

In [None]:
df['yearsatcompany'].value_counts()

### 'yearssincelastpromotion'

In [None]:
df['yearssincelastpromotion'].value_counts()

### 'yearswithcurrmanager'

In [None]:
df['yearswithcurrmanager'].value_counts()

### 'sameasmonthlyincome'

In [None]:
df['sameasmonthlyincome'].value_counts()

In [None]:
# eliminamos porque es una copia de 'monthlyincome'
df.drop(columns='sameasmonthlyincome', inplace=True)

### 'datebirth'

In [None]:
df['datebirth'].value_counts()

In [None]:
df['datebirth'].describe()

### 'salary'

In [None]:
df['salary'].value_counts()

In [None]:
# nos aseguramos de que sea tipo texto
df['salary'] = df['salary'].astype(str)

# eliminamos el símbolo $ y reemplazamos la coma por punto
df['salary'] = df['salary'].str.replace('$', '', regex=False)
df['salary'] = df['salary'].str.replace(',', '.', regex=False)

# convertimos a número decimal (float)
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
print(df['salary'].dtype)
print(df['salary'].value_counts())


### 'roledepartament'

In [None]:
df['roledepartament'].value_counts()

In [None]:
# ya tenemos las columnas 'jobrole' y 'department', así que eliminamos esta columna, que tiene un alto porcentaje de nulos
# df.drop(columns='roledepartament', inplace=True)

### 'remotework'

In [None]:
df['remotework'].value_counts()

In [None]:
# Convertimos todo a string para estandarizar
df['remotework'] = df['remotework'].astype(str).str.strip().str.lower()

# Reemplazamos todas las variantes por 'yes' o 'no'
df['remotework'] = df['remotework'].replace({
    '1': 'yes',
    'true': 'yes',
    'yes': 'yes',
    '0': 'no',
    'false': 'no'
})
print(df['remotework'].value_counts())

In [None]:
df = df.drop(columns=['Unnamed: 0'])

In [None]:
df.columns

In [None]:
df = df.rename(columns={'businesstravel': 'business_travel', 
                        'dailyrate': 'daily_rate',
                        'distancefromhome': 'distance_from_home', 
                        'educationfield': 'education_field',
                        'employeecount': 'employee_count',
                        'employeenumber': 'employee_number',
                        'environmentsatisfaction': 'environment_satisfaction',
                        'hourlyrate': 'hourly_rate',
                        'jobinvolvement': 'job_involvement',
                        'joblevel': 'job_level',
                        'jobrole': 'job_role',
                        'jobsatisfaction': 'job_satisfaction',
                        'maritalstatus': 'marital_status',
                        'monthlyincome': 'monthly_income',
                        'monthlyrate': 'monthly_rate',
                        'numcompaniesworked': 'num_companies_worked',
                        'percentsalaryhike': 'percent_salary_hike',
                        'performancerating': 'performance_rating',
                        'relationshipsatisfaction': 'relationship_satisfaction',
                        'standardhours': 'standard_hours',
                        'stockoptionlevel': 'stock_option_level',
                        'totalworkingyears': 'total_working_years',
                        'trainingtimeslastyear': 'training_times_last_year',
                        'worklifebalance': 'work_life_balance',
                        'yearsatcompany': 'years_at_company',
                        'yearsincurrentrole': 'yearsincurrentrole',
                        'yearssincelastpromotion': 'years_since_last_promotion',
                        'yearswithcurrmanager': 'years_with_curr_manager',
                        'datebirth': 'date_birth', 
                        'roledepartament': 'role_departament',
                        'remotework': 'remote_work'})

In [None]:
df.columns

In [None]:
nulls(df,1,1)

In [None]:
df['department'].unique()

In [None]:
df.columns