In [58]:
# Importamos las librerías que necesitamos

# Tratamiento de datos
import pandas as pd
import numpy as np

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Ocultamos las advertencias para evitar mensajes innecesarios
import warnings
warnings.filterwarnings("ignore")

# Configuración
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [59]:
# Lo primero que hacemos es cargar el dataframe que vamos a usar
df = pd.read_csv("spaces_hr_raw_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30.0,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,,1971,"199990,00$",,,1
2,2,42,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,"41669,33$",1,,No,11,30,4,,0,220.0,3,,22,,11,15,,1981,"192320,00$",ManaGER - Research & Development,,1
3,3,47,No,travel_rarely,1771.404762,,2,4,Medical,1,4,1,1,,3,4,ReseArCH DIrECtOr,3,Married,"14307,50$","37199,50$",3,Y,,19,30,2,Full Time,2,,2,,20,,5,6,"14307,50$",1976,"171690,00$",,,False
4,4,46,No,,1582.771346,,3,3,Technical Degree,1,5,1,1,,4,4,sAleS EXECUtIve,1,Divorced,"12783,92$","33238,20$",2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,"12783,92$",1977,,,,0


In [60]:
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'over18', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager',
       'sameasmonthlyincome', 'datebirth', 'salary', 'roledepartament',
       'numberchildren', 'remotework'],
      dtype='object')

In [61]:
df = df.drop(columns=['numberchildren', 'over18', 'yearsincurrentrole'])

In [62]:
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearssincelastpromotion',
       'yearswithcurrmanager', 'sameasmonthlyincome', 'datebirth', 'salary',
       'roledepartament', 'remotework'],
      dtype='object')

In [63]:
def exploracion_basica(df):
    print("=== EXPLORACIÓN BÁSICA DEL DATASET ===")
    print("\nPrimeras filas:")
    print(df.head())
    print("\nÚltimas filas:")
    print(df.tail())
    print("\nNúmero de filas y columnas:")
    print(df.shape)
    print("\nInformación general del DataFrame:")
    print(df.info())
    print("\nEstadísticas descriptivas:")
    print(df.describe())
    print("\nColumnas:")
    print(list(df.columns))

In [64]:
exploracion_basica(df)

=== EXPLORACIÓN BÁSICA DEL DATASET ===

Primeras filas:
   Unnamed: 0 age attrition businesstravel    dailyrate  \
0           0  51        No            NaN  2015.722222   
1           1  52        No            NaN  2063.388889   
2           2  42        No  travel_rarely  1984.253968   
3           3  47        No  travel_rarely  1771.404762   
4           4  46        No            NaN  1582.771346   

                 department  distancefromhome  education    educationfield  \
0                       NaN                 6          3               NaN   
1                       NaN                 1          4     Life Sciences   
2   Research & Development                  4          2  Technical Degree   
3                       NaN                 2          4           Medical   
4                       NaN                 3          3  Technical Degree   

   employeecount  employeenumber  environmentsatisfaction  gender  hourlyrate  \
0              1               1       

In [65]:
# Variables con valores nulos
def nulls(df,count=0,share=0):
    
    nulls = df.isnull().sum()
    nulls_share = df.isnull().sum()/df.shape[0]*100
    with_nulls_share = nulls_share[nulls_share > 0]
    
    if count == 1:
        print('Count of nulls')
        with_nulls = nulls[nulls > 0]
        print (with_nulls.sort_values(ascending=False))
        print('-----------------------------------')
    
    if share == 1:
        print('share of nulls')
        print (with_nulls_share.sort_values(ascending=False))
    
    nulls_list = with_nulls_share.to_frame(name='perc_nulos').reset_index().rename(columns={'index': 'var'})
    
    return nulls_list

In [66]:
nulls(df,1,1)

Count of nulls
department             1366
roledepartament        1366
hourlyrate             1267
businesstravel          801
educationfield          774
overtime                696
maritalstatus           675
totalworkingyears       549
monthlyincome           489
sameasmonthlyincome     489
standardhours           351
salary                  285
performancerating       200
worklifebalance         114
dtype: int64
-----------------------------------
share of nulls
department             81.406436
roledepartament        81.406436
hourlyrate             75.506555
businesstravel         47.735399
educationfield         46.126341
overtime               41.477950
maritalstatus          40.226460
totalworkingyears      32.717521
monthlyincome          29.141836
sameasmonthlyincome    29.141836
standardhours          20.917759
salary                 16.984505
performancerating      11.918951
worklifebalance         6.793802
dtype: float64


Unnamed: 0,var,perc_nulos
0,businesstravel,47.735399
1,department,81.406436
2,educationfield,46.126341
3,hourlyrate,75.506555
4,maritalstatus,40.22646
5,monthlyincome,29.141836
6,overtime,41.47795
7,performancerating,11.918951
8,standardhours,20.917759
9,totalworkingyears,32.717521


In [67]:
def detectar_duplicados(df):
    numero_duplicados = df.duplicated().sum()
    print(f"Filas duplicadas completas en el dataset: {numero_duplicados}")
    if numero_duplicados > 0:
        print("Aquí tienes las filas duplicadas:")
        print(df[df.duplicated()])

In [68]:
detectar_duplicados(df)

Filas duplicadas completas en el dataset: 0


### Exploramos columna por columna

In [69]:
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearssincelastpromotion',
       'yearswithcurrmanager', 'sameasmonthlyincome', 'datebirth', 'salary',
       'roledepartament', 'remotework'],
      dtype='object')

In [70]:
df.drop(columns='Unnamed: 0', inplace=True)

### 'age'

In [71]:
df['age'].value_counts()

age
35              88
31              88
34              86
29              82
36              79
32              66
30              65
38              64
33              63
40              60
28              54
37              54
27              54
45              50
42              50
41              48
39              46
26              46
43              42
46              38
44              34
50              34
47              30
24              29
25              28
49              26
55              25
51              22
48              22
53              20
54              20
52              18
56              18
21              17
22              17
58              16
23              15
59              11
20              11
18               9
19               9
57               6
60               5
thirty-two       2
twenty-four      2
thirty           1
fifty-eight      1
fifty-two        1
twenty-six       1
fifty-five       1
thirty-seven     1
thirty-six       1
forty-se

In [56]:
# transformamos las edades escritas en palabras a números
words_to_numbers = {
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'twenty-one': 21,
    'twenty-two': 22,
    'twenty-three': 23,
    'twenty-four': 24,
    'twenty-five': 25,
    'twenty-six': 26,
    'twenty-seven': 27,
    'twenty-eight': 28,
    'twenty-nine': 29,
    'thirty': 30,
    'thirty-one': 31,
    'thirty-two': 32,
    'thirty-three': 33,
    'thirty-four': 34,
    'thirty-five': 35,
    'thirty-six': 36,
    'thirty-seven': 37,
    'forty-seven': 47,
    'fifty-two': 52,
    'fifty-five': 55,
    'fifty-eight': 58
}


In [57]:
# reemplazamos palabras por números, y convertimos la columna a tipo numérico
df['age'] = df['age'].replace(words_to_numbers)
df['age'] = pd.to_numeric(df['age'], errors='coerce')

print(df['age'].dtype)
print(df['age'].value_counts().sort_index())

int64
age
18     9
19     9
20    11
21    17
22    17
23    15
24    31
25    28
26    47
27    54
28    54
29    82
30    66
31    89
32    68
33    63
34    86
35    88
36    80
37    55
38    64
39    46
40    60
41    48
42    50
43    42
44    34
45    50
46    38
47    31
48    22
49    26
50    34
51    22
52    19
53    20
54    20
55    26
56    18
57     6
58    17
59    11
60     5
Name: count, dtype: int64


### 'attrition'

In [18]:
df['attrition'].value_counts()

attrition
No     1406
Yes     272
Name: count, dtype: int64

In [19]:
df['attrition'] = df['attrition'].str.lower()
print(df['attrition'].value_counts())

attrition
no     1406
yes     272
Name: count, dtype: int64


### 'businesstravel'

In [20]:
df['businesstravel'].value_counts()

businesstravel
travel_rarely        616
travel_frequently    168
non-travel            93
Name: count, dtype: int64

In [21]:
df['businesstravel'] = df['businesstravel'].str.replace(' ', '_', regex=False)
df['businesstravel'] = df['businesstravel'].str.replace('-', '_', regex=False).str.lower()
print(df['businesstravel'].value_counts())

businesstravel
travel_rarely        616
travel_frequently    168
non_travel            93
Name: count, dtype: int64


### 'dailyrate'

In [22]:
df['dailyrate'].value_counts()

dailyrate
556.256661     326
290.035510     308
1032.487286    150
1582.771346     55
1973.984127     38
              ... 
320.047619       1
470.063492       1
436.428571       1
501.325397       1
2004.785714      1
Name: count, Length: 673, dtype: int64

In [23]:
# reducimos a dos decimales los valores de la columna 'dailyrate'
df['dailyrate'] = df['dailyrate'].round(2)
print(df['dailyrate'].value_counts())

dailyrate
556.26     326
290.04     308
1032.49    150
1582.77     55
1973.98     38
          ... 
320.05       1
470.06       1
436.43       1
501.33       1
2004.79      1
Name: count, Length: 673, dtype: int64


### 'department'

In [24]:
df['department'].value_counts()

department
Research & Development     203
Sales                       93
Human Resources             16
Name: count, dtype: int64

In [25]:
df['department'].unique()

array([nan, ' Research & Development ', ' Sales ', ' Human Resources '],
      dtype=object)

In [26]:
df['department'] = df['department'].apply(
    lambda x: x.strip().replace(' ', '_').lower() if pd.notna(x) else x
)

print(df['department'].value_counts())

department
research_&_development    203
sales                      93
human_resources            16
Name: count, dtype: int64


In [27]:
df['department'].isnull().sum()

1366

In [28]:
df['department'].unique()

array([nan, 'research_&_development', 'sales', 'human_resources'],
      dtype=object)

### 'distancefromhome'

In [29]:
df['distancefromhome'].value_counts()

distancefromhome
 2     228
 1     209
 9      89
 10     86
 8      85
      ... 
-21      2
-34      2
-43      2
-39      1
-40      1
Name: count, Length: 69, dtype: int64

In [30]:
# pasamos la distancia a positivos, ya que hay valores negativos
df['distancefromhome'] = df['distancefromhome'].abs()
print(df['distancefromhome'].value_counts())

distancefromhome
2     228
1     209
10     92
9      89
8      85
7      82
3      79
5      65
4      64
6      63
24     37
16     37
12     33
13     33
11     33
18     32
15     31
25     31
26     30
29     30
23     30
14     28
19     23
20     23
17     22
21     22
22     21
28     20
27     18
47      8
35      8
38      7
31      6
36      6
42      6
45      6
48      5
37      5
46      5
32      4
41      4
30      4
33      3
44      3
34      2
49      2
43      2
39      1
40      1
Name: count, dtype: int64


### 'education'

In [31]:
df['education'].value_counts()

education
3    649
4    461
2    322
1    186
5     60
Name: count, dtype: int64

### 'educationfield'

In [32]:
df['educationfield'].value_counts()

educationfield
Life Sciences       367
Medical             286
Marketing           106
Technical Degree     70
Other                63
Human Resources      12
Name: count, dtype: int64

In [33]:
df['educationfield'] = (
    df['educationfield']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['educationfield'].value_counts())

educationfield
nan                 774
life_sciences       367
medical             286
marketing           106
technical_degree     70
other                63
human_resources      12
Name: count, dtype: int64


### 'employeecount'

In [34]:
df['employeecount'].value_counts()

employeecount
1    1678
Name: count, dtype: int64

In [35]:
# no aporta información útil, la eliminamos
df = df.drop(columns='employeecount')

### 'employeenumber'

In [36]:
df['employeenumber'].value_counts()

employeenumber
300     2
159     2
271     2
1158    2
191     2
       ..
547     1
546     1
545     1
544     1
1614    1
Name: count, Length: 1614, dtype: int64

In [37]:
df[df.duplicated(subset='employeenumber', keep=False)].sort_values('employeenumber')


Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,remotework
8,41,no,,1712.18,,2,5,,9,2,1,,3,4,mANAGEr,1,Married,"13829,17$","35955,83$",7,No,16,30,2,Full Time,1,220,2,30,18,11,8,"13829,17$",1982,"165950,00$",,True
1656,41,no,,1712.18,,2,5,,9,2,1,,3,4,mANAGEr,1,Married,"13829,17$","35955,83$",7,No,16,30,2,Full Time,1,220,2,30,18,11,8,"13829,17$",1982,"165950,00$",,True
60,36,no,,610.17,,5,2,,61,4,0,,3,2,lAboratORy TeChNiCiaN,2,Single,"4928,33$","12813,67$",8,No,16,30,4,Full Time,0,160,3,40,13,3,7,"4928,33$",1987,"59140,00$",,1
1652,36,no,,610.17,,5,2,,61,4,0,,3,2,lAboratORy TeChNiCiaN,2,Single,"4928,33$","12813,67$",8,No,16,30,4,Full Time,0,160,3,40,13,3,7,"4928,33$",1987,"59140,00$",,1
1676,47,no,travel_rarely,1032.49,,4,3,life_sciences,76,3,1,,2,3,maNufACTURING DIREctOr,2,Divorced,"8339,32$","21682,23$",8,Yes,12,,3,Part Time,1,,4,30,22,14,10,"8339,32$",1976,"100071,84$",,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,48,no,travel_rarely,417.96,,2,5,,1568,2,1,,3,2,sALES EXEcuTIVe,4,Married,"3375,83$","8777,17$",2,No,14,30,1,Part Time,1,140,2,30,9,6,7,"3375,83$",1975,"40510,00$",,True
1657,45,no,travel_rarely,495.75,,2,3,other,1569,4,0,,3,2,sAles executiVe,2,,,"10410,83$",0,No,19,30,2,Part Time,1,,3,40,8,3,7,,1978,"48050,00$",,False
1568,45,no,travel_rarely,495.75,,2,3,other,1569,4,0,,3,2,sAles executiVe,2,,,"10410,83$",0,No,19,30,2,Part Time,1,,3,40,8,3,7,,1978,"48050,00$",,False
1671,31,no,,556.26,,12,3,medical,1608,4,1,,3,2,HEaltHCarE REPreSENtAtIve,4,,,"11681,39$",0,,11,30,3,Part Time,2,100,2,10,9,8,5,,1992,"53914,11$",,True


In [38]:
df = df.drop_duplicates()

In [39]:
df['employeenumber'].is_unique

True

In [40]:
df[df.duplicated(subset='employeenumber', keep=False)].sort_values('employeenumber')

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,remotework


In [41]:
print(f"Número de employeenumber duplicados: {df.duplicated(subset='employeenumber').sum()}")

Número de employeenumber duplicados: 0


### 'environmentsatisfaction'

In [176]:
df['environmentsatisfaction'].value_counts()

environmentsatisfaction
4     460
3     459
1     298
2     297
12      7
35      6
13      6
14      5
24      5
47      5
36      4
48      4
41      4
46      4
42      4
45      3
11      3
22      3
17      3
18      3
20      3
25      3
27      3
37      2
29      2
19      2
38      2
31      2
15      2
16      2
26      1
39      1
10      1
49      1
21      1
28      1
33      1
43      1
Name: count, dtype: int64

In [177]:
# detectamos valores no válidos, deberían ser del 1 al 4

df['environmentsatisfaction'] = df['environmentsatisfaction'].apply(
    lambda x: np.nan if x>4 else x
)

print(df['environmentsatisfaction'].value_counts())
# puede ser que acá esté el problema - cambiamos los que no están en el rango por nan, 
# no los sacamos de la muestra 

environmentsatisfaction
4.0    460
3.0    459
1.0    298
2.0    297
Name: count, dtype: int64


In [178]:
df['department'].isnull().sum()

1312

In [179]:
# sustituimos los valores numéricos por texto para una mejor comprensión
satisfaction_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

df['environmentsatisfaction'] = df['environmentsatisfaction'].replace(satisfaction_map)
print(df['environmentsatisfaction'].value_counts())

environmentsatisfaction
Very High    460
High         459
Low          298
Medium       297
Name: count, dtype: int64


In [180]:
df['environmentsatisfaction'] = (
    df['environmentsatisfaction']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['environmentsatisfaction'].value_counts())

environmentsatisfaction
very_high    460
high         459
low          298
medium       297
nan          100
Name: count, dtype: int64


### 'gender'

In [181]:
df['gender'].value_counts()

gender
0    971
1    643
Name: count, dtype: int64

In [182]:
# típicamente, 0 es femenino y 1 es masculino. Aún así, comprobamos con la media de dailyrate, que suele ser más alta en hombres que en mujeres
df.groupby('gender')['dailyrate'].mean()


gender
0    663.301771
1    679.191711
Name: dailyrate, dtype: float64

In [183]:
gender_map = {
    0: 'Female',
    1: 'Male'
}

df['gender'] = df['gender'].replace(gender_map)
df['gender'].value_counts()

gender
Female    971
Male      643
Name: count, dtype: int64

In [184]:
df['gender'] = df['gender'].astype(str).str.lower()
print(df['gender'].value_counts())

gender
female    971
male      643
Name: count, dtype: int64


### 'hourlyrate'

In [185]:
df['hourlyrate'].value_counts()

hourlyrate
36.254439     75
69.532083     74
129.060911    38
197.846418    14
246.748016     8
              ..
59.957341      1
28.205357      1
54.914683      1
227.487103     1
133.159722     1
Name: count, Length: 194, dtype: int64

In [186]:
df['hourlyrate'] = df['hourlyrate'].round(2)

### 'jobinvolvement'

In [187]:
df['jobinvolvement'].value_counts()

jobinvolvement
3    955
2    406
4    164
1     89
Name: count, dtype: int64

In [188]:
# sustituimos los valores numéricos por texto para una mejor comprensión
jobinvolvement_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

df['jobinvolvement'] = df['jobinvolvement'].replace(jobinvolvement_map)
print(df['jobinvolvement'].value_counts())

jobinvolvement
High         955
Medium       406
Very High    164
Low           89
Name: count, dtype: int64


In [189]:
df['jobinvolvement'] = (
    df['jobinvolvement']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['jobinvolvement'].value_counts())

jobinvolvement
high         955
medium       406
very_high    164
low           89
Name: count, dtype: int64


### 'joblevel'

In [190]:
df['joblevel'].value_counts()

joblevel
2    597
1    586
3    242
4    113
5     76
Name: count, dtype: int64

In [191]:
# comprobamos la media de dailyrate por joblevel, para establecer una relación entre el nivel de trabajo y la remuneración
df.groupby('joblevel')['dailyrate'].mean()

joblevel
1     290.037526
2     556.258325
3    1032.489174
4    1582.770354
5    1973.982237
Name: dailyrate, dtype: float64

In [192]:
joblevel_map = {
    1: 'Entry Level',
    2: 'Intermediate',
    3: 'Senior',
    4: 'Manager',
    5: 'Executive'
}
df['joblevel'] = df['joblevel'].replace(joblevel_map)
print(df['joblevel'].value_counts())

joblevel
Intermediate    597
Entry Level     586
Senior          242
Manager         113
Executive        76
Name: count, dtype: int64


In [193]:
df['joblevel'] = (
    df['joblevel']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['joblevel'].value_counts())

joblevel
intermediate    597
entry_level     586
senior          242
manager         113
executive        76
Name: count, dtype: int64


### 'jobrole'

In [194]:
df['jobrole'].value_counts()

jobrole
mANager                       4
ManageR                       3
ManagEr                       3
mAnaGeR                       3
MANAgER                       3
                             ..
ResEArch ScieNTiST            1
HealthcARE RePreSENtAtiVe     1
ReSearcH scIEntist            1
LAbOrATOry techNicIan         1
mAnUfactURInG DiRECTOr        1
Name: count, Length: 1579, dtype: int64

In [195]:
df['jobrole'] = df['jobrole'].str.strip().str.lower().str.title()
df['jobrole'].value_counts()

jobrole
Sales Executive              369
Research Scientist           314
Laboratory Technician        278
Manufacturing Director       158
Healthcare Representative    149
Manager                      111
Sales Representative          90
Research Director             88
Human Resources               57
Name: count, dtype: int64

In [196]:
df['jobrole'] = (
    df['jobrole']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['jobrole'].value_counts())

jobrole
sales_executive              369
research_scientist           314
laboratory_technician        278
manufacturing_director       158
healthcare_representative    149
manager                      111
sales_representative          90
research_director             88
human_resources               57
Name: count, dtype: int64


### 'jobsatisfaction'

In [197]:
df['jobsatisfaction'].value_counts()

jobsatisfaction
4    514
3    481
1    317
2    302
Name: count, dtype: int64

In [198]:
satisfaction_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

df['jobsatisfaction'] = df['jobsatisfaction'].replace(satisfaction_map)
print(df['jobsatisfaction'].value_counts())

jobsatisfaction
Very High    514
High         481
Low          317
Medium       302
Name: count, dtype: int64


In [199]:
df['jobsatisfaction'] = (
    df['jobsatisfaction']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['jobsatisfaction'].value_counts())

jobsatisfaction
very_high    514
high         481
low          317
medium       302
Name: count, dtype: int64


### 'maritalstatus'

In [201]:
df['maritalstatus'].value_counts()

maritalstatus
Married     404
Single      325
Divorced    188
Marreid      35
divorced     11
Name: count, dtype: int64

In [203]:
df['maritalstatus'] = df['maritalstatus'].str.strip().str.lower()
df['maritalstatus'] = df['maritalstatus'].replace({'marreid': 'married'})
print(df['maritalstatus'].value_counts())

maritalstatus
married     439
single      325
divorced    199
Name: count, dtype: int64


### 'monthlyincome'

In [204]:
df['monthlyincome'].value_counts()

monthlyincome
4492,84$     224
2342,59$     219
8339,32$      98
12783,92$     42
15943,72$     24
            ... 
5415,83$       1
4994,17$       1
3806,67$       1
13920,00$      1
3705,83$       1
Name: count, Length: 493, dtype: int64

In [205]:
# transformamos la columna 'monthlyincome' para que sea numérica, eliminando el símbolo de dólar y la coma
df['monthlyincome'] = df['monthlyincome'].str.replace('$', '', regex=False)
df['monthlyincome'] = df['monthlyincome'].str.replace(',', '.', regex=False)
df['monthlyincome'] = pd.to_numeric(df['monthlyincome'], errors='coerce')

In [206]:
print(df['monthlyincome'].dtype)
df['monthlyincome'].describe()
df['monthlyincome'].value_counts()

float64


monthlyincome
4492.84     224
2342.59     219
8339.32      98
12783.92     42
15943.72     24
           ... 
5415.83       1
4994.17       1
3806.67       1
13920.00      1
3705.83       1
Name: count, Length: 493, dtype: int64

### 'monthlyrate'

In [207]:
df['monthlyrate'].value_counts()

monthlyrate
11681,39$    317
6090,75$     295
21682,23$    139
33238,20$     55
41453,67$     37
            ... 
6721,00$       1
9871,33$       1
9165,00$       1
10527,83$      1
42100,50$      1
Name: count, Length: 673, dtype: int64

In [208]:
# pasamos a str para corregir símbolos y puntos, luego convertimos a numérico
df['monthlyrate'] = df['monthlyrate'].astype(str)
df['monthlyrate'] = df['monthlyrate'].str.replace('$', '', regex=False)
df['monthlyrate'] = df['monthlyrate'].str.replace(',', '.', regex=False)
df['monthlyrate'] = pd.to_numeric(df['monthlyrate'], errors='coerce')
print(df['monthlyrate'].dtype)

float64


In [209]:
df['monthlyrate'].value_counts()

monthlyrate
11681.39    317
6090.75     295
21682.23    139
33238.20     55
41453.67     37
           ... 
6721.00       1
9871.33       1
9165.00       1
10527.83      1
42100.50      1
Name: count, Length: 673, dtype: int64

### 'numcompaniesworked'

In [210]:
df['numcompaniesworked'].value_counts()

numcompaniesworked
1    573
0    226
3    169
4    157
2    156
7     84
6     73
5     66
9     59
8     51
Name: count, dtype: int64

### 'overtime'

In [211]:
df['overtime'].value_counts()

overtime
No     682
Yes    256
Name: count, dtype: int64

In [212]:
df['overtime'] = df['overtime'].str.strip().str.lower()
print(df['overtime'].value_counts())

overtime
no     682
yes    256
Name: count, dtype: int64


### 'percentsalaryhike'

In [213]:
# porcentaje de aumento salarial
df['percentsalaryhike'].value_counts()

percentsalaryhike
11    232
13    230
12    225
14    220
15    110
18     98
17     88
16     86
19     82
20     60
22     59
21     51
23     29
24     25
25     19
Name: count, dtype: int64

### 'performancerating'

In [214]:
df['performancerating'].value_counts()

performancerating
3,0    1205
4,0     214
Name: count, dtype: int64

### 'standardhours'

In [216]:
df['standardhours'].value_counts()

standardhours
Part Time    888
Full Time    388
Name: count, dtype: int64

In [217]:
df['standardhours'] = (
    df['standardhours']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['standardhours'].value_counts())

standardhours
part_time    888
full_time    388
nan          338
Name: count, dtype: int64


### 'relationshipsatisfaction'

In [218]:
rel_map = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}
df['relationshipsatisfaction'] = df['relationshipsatisfaction'].replace(rel_map)
print(df['relationshipsatisfaction'].value_counts())

relationshipsatisfaction
High         504
Very High    468
Medium       339
Low          303
Name: count, dtype: int64


In [219]:
df['relationshipsatisfaction'] = (
    df['relationshipsatisfaction']
    .astype(str)                      
    .str.strip('_')                   
    .str.replace('_', ' ', regex=False)  
    .str.strip()                      
    .str.lower()                     
    .str.replace(' ', '_', regex=False)  
)
print(df['relationshipsatisfaction'].value_counts())

relationshipsatisfaction
high         504
very_high    468
medium       339
low          303
Name: count, dtype: int64


### 'stockoptionlevel'

In [220]:
df[ 'stockoptionlevel'].value_counts()

stockoptionlevel
0    687
1    666
2    172
3     89
Name: count, dtype: int64

In [221]:
# Esta variable indica el nivel de opciones sobre acciones que tiene asignadas cada persona. Es decir, si reciben parte de su compensación en acciones de la empresa. Los niveles más altos suelen asignarse a empleados de mayor nivel o con mejor rendimiento.
# 0: No stock options / 1-3: Diferentes niveles de acciones asignadas

### 'totalworkingyears'

In [222]:
df['totalworkingyears'].value_counts()

totalworkingyears
10,0    144
8,0      86
6,0      84
9,0      69
5,0      66
7,0      56
4,0      54
1,0      53
12,0     34
3,0      32
14,0     30
13,0     30
11,0     29
15,0     28
16,0     28
20,0     28
18,0     27
21,0     23
17,0     22
2,0      21
22,0     18
19,0     17
24,0     14
23,0     13
28,0     13
26,0      8
0,0       8
29,0      6
36,0      6
25,0      6
33,0      6
37,0      5
27,0      5
31,0      4
30,0      3
32,0      3
35,0      3
40,0      3
34,0      2
38,0      1
Name: count, dtype: int64

In [223]:
df['totalworkingyears'].isna().sum()

526

In [224]:
df['totalworkingyears'] = (
    df['totalworkingyears']
    .astype(str)
    .replace('<NA>', np.nan)                   
    .str.replace(',', '.', regex=False)        
    .astype(float)
    .astype('Int64')                           
)
print(df['totalworkingyears'].dtype)
print(df['totalworkingyears'].value_counts())

Int64
totalworkingyears
10    144
8      86
6      84
9      69
5      66
7      56
4      54
1      53
12     34
3      32
14     30
13     30
11     29
15     28
16     28
20     28
18     27
21     23
17     22
2      21
22     18
19     17
24     14
23     13
28     13
26      8
0       8
29      6
36      6
25      6
33      6
37      5
27      5
31      4
30      3
32      3
35      3
40      3
34      2
38      1
Name: count, dtype: Int64


### 'trainingtimeslastyear'

In [225]:
df['trainingtimeslastyear'].value_counts()

trainingtimeslastyear
2    598
3    534
4    137
5    136
1     77
6     72
0     60
Name: count, dtype: int64

### 'worklifebalance'

In [226]:
df['worklifebalance'].value_counts()

worklifebalance
3,0    913
2,0    359
4,0    155
1,0     79
Name: count, dtype: int64

In [227]:
df['worklifebalance'].isna().sum()

108

In [228]:
df['worklifebalance'] = df['worklifebalance'].str.replace(',', '.').astype(float)

df['worklifebalance'] = df['worklifebalance'].replace({
    1.0: 'very_low',
    2.0: 'low',
    3.0: 'good',
    4.0: 'excellent'
})
print(df['worklifebalance'].value_counts())

worklifebalance
good         913
low          359
excellent    155
very_low      79
Name: count, dtype: int64


### 'yearsatcompany'

In [229]:
df['yearsatcompany'].value_counts()

yearsatcompany
5     208
1     171
3     141
2     141
10    133
7     115
4     114
8     106
9      94
6      78
0      44
11     36
20     29
13     26
15     21
14     19
22     17
12     15
18     15
16     14
21     14
19     12
17      9
24      7
25      5
33      5
26      4
32      3
27      3
31      3
36      3
29      2
23      2
40      2
34      1
37      1
30      1
Name: count, dtype: int64

### 'yearssincelastpromotion'

In [230]:
df['yearssincelastpromotion'].value_counts()

yearssincelastpromotion
0     625
1     384
2     177
7      93
4      67
3      62
5      53
6      37
11     26
8      20
9      18
15     15
12     11
13     10
14     10
10      6
Name: count, dtype: int64

### 'yearswithcurrmanager'

In [231]:
df['yearswithcurrmanager'].value_counts()

yearswithcurrmanager
2     380
0     270
7     267
3     148
8     115
4     104
1      84
9      70
5      36
10     31
6      30
11     22
12     20
13     16
17      8
15      5
14      5
16      3
Name: count, dtype: int64

### 'sameasmonthlyincome'

In [232]:
df['sameasmonthlyincome'].value_counts()

sameasmonthlyincome
4492,84$     224
2342,59$     219
8339,32$      98
12783,92$     42
15943,72$     24
            ... 
5415,83$       1
4994,17$       1
3806,67$       1
13920,00$      1
3705,83$       1
Name: count, Length: 493, dtype: int64

In [233]:
# eliminamos porque es una copia de 'monthlyincome'
df.drop(columns='sameasmonthlyincome', inplace=True)

### 'datebirth'

In [234]:
df['datebirth'].value_counts()

datebirth
1988    84
1992    83
1989    83
1994    78
1987    75
1991    65
1985    64
1993    64
1990    61
1983    60
1986    55
1996    54
1995    53
1981    50
1978    48
1982    46
1997    46
1984    45
1980    40
1977    38
1979    34
1973    31
1999    27
1976    27
1998    27
1968    25
1974    25
1969    20
1972    20
1970    20
1975    20
1971    19
1967    17
2001    16
2002    16
1965    15
2000    15
2003    11
1964    10
2004     9
2005     8
1963     5
1966     5
Name: count, dtype: int64

In [235]:
df['datebirth'].describe()

count    1614.000000
mean     1986.076208
std         9.101332
min      1963.000000
25%      1980.000000
50%      1987.000000
75%      1993.000000
max      2005.000000
Name: datebirth, dtype: float64

### 'salary'

In [236]:
df['salary'].value_counts()

salary
53914,11$     261
28111,13$     244
100071,84$    115
153407,07$     45
191324,62$     27
             ... 
42210,00$       1
129360,00$      1
53680,00$       1
31960,00$       1
42440,00$       1
Name: count, Length: 583, dtype: int64

In [237]:
# nos aseguramos de que sea tipo texto
df['salary'] = df['salary'].astype(str)

# eliminamos el símbolo $ y reemplazamos la coma por punto
df['salary'] = df['salary'].str.replace('$', '', regex=False)
df['salary'] = df['salary'].str.replace(',', '.', regex=False)

# convertimos a número decimal (float)
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
print(df['salary'].dtype)
print(df['salary'].value_counts())


float64
salary
53914.11     261
28111.13     244
100071.84    115
153407.07     45
191324.62     27
            ... 
42210.00       1
129360.00      1
53680.00       1
31960.00       1
42440.00       1
Name: count, Length: 583, dtype: int64


### 'roledepartament'

In [238]:
df['roledepartament'].value_counts()

roledepartament
MaNAgeR  -  Sales                                        2
ManaGER  -  Research & Development                       1
ReseaRch scIENTisT  -  Research & Development            1
ManufacTURInG DIRECtOR  -  Research & Development        1
hEalthCaRe reprEseNTaTiVe  -  Research & Development     1
                                                        ..
saLES eXEcUTiVE  -  Sales                                1
mANUfacTURiNG dIRectOR  -  Research & Development        1
huMAn ResOurces  -  Human Resources                      1
HUMAN ResoURCeS  -  Human Resources                      1
sAleS EXECUtIvE  -  Sales                                1
Name: count, Length: 301, dtype: int64

In [239]:
# ya tenemos las columnas 'jobrole' y 'department', así que eliminamos esta columna, que tiene un alto porcentaje de nulos
df.drop(columns='roledepartament', inplace=True)

### 'remotework'

In [240]:
df['remotework'].value_counts()

remotework
1        360
True     345
0        309
False    305
Yes      295
Name: count, dtype: int64

In [241]:
# Convertimos todo a string para estandarizar
df['remotework'] = df['remotework'].astype(str).str.strip().str.lower()

# Reemplazamos todas las variantes por 'yes' o 'no'
df['remotework'] = df['remotework'].replace({
    '1': 'yes',
    'true': 'yes',
    'yes': 'yes',
    '0': 'no',
    'false': 'no'
})
print(df['remotework'].value_counts())

remotework
yes    1000
no      614
Name: count, dtype: int64


In [242]:
df.columns

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeenumber',
       'environmentsatisfaction', 'gender', 'hourlyrate', 'jobinvolvement',
       'joblevel', 'jobrole', 'jobsatisfaction', 'maritalstatus',
       'monthlyincome', 'monthlyrate', 'numcompaniesworked', 'overtime',
       'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
       'standardhours', 'stockoptionlevel', 'totalworkingyears',
       'trainingtimeslastyear', 'worklifebalance', 'yearsatcompany',
       'yearssincelastpromotion', 'yearswithcurrmanager', 'datebirth',
       'salary', 'remotework'],
      dtype='object')

In [243]:
df = df.rename(columns={'businesstravel': 'business_travel', 
                        'dailyrate': 'daily_rate',
                        'distancefromhome': 'distance_from_home', 
                        'educationfield': 'education_field',
                        'employeecount': 'employee_count',
                        'employeenumber': 'employee_number',
                        'environmentsatisfaction': 'environment_satisfaction',
                        'hourlyrate': 'hourly_rate',
                        'jobinvolvement': 'job_involvement',
                        'joblevel': 'job_level',
                        'jobrole': 'job_role',
                        'jobsatisfaction': 'job_satisfaction',
                        'maritalstatus': 'marital_status',
                        'monthlyincome': 'monthly_income',
                        'monthlyrate': 'monthly_rate',
                        'numcompaniesworked': 'num_companies_worked',
                        'percentsalaryhike': 'percent_salary_hike',
                        'performancerating': 'performance_rating',
                        'relationshipsatisfaction': 'relationship_satisfaction',
                        'standardhours': 'standard_hours',
                        'stockoptionlevel': 'stock_option_level',
                        'totalworkingyears': 'total_working_years',
                        'trainingtimeslastyear': 'training_times_last_year',
                        'worklifebalance': 'work_life_balance',
                        'yearsatcompany': 'years_at_company',
                        'yearsincurrentrole': 'yearsincurrentrole',
                        'yearssincelastpromotion': 'years_since_last_promotion',
                        'yearswithcurrmanager': 'years_with_curr_manager',
                        'datebirth': 'date_birth', 
                        'roledepartament': 'role_departament',
                        'remotework': 'remote_work'})

In [244]:
df.columns

Index(['age', 'attrition', 'business_travel', 'daily_rate', 'department',
       'distance_from_home', 'education', 'education_field', 'employee_number',
       'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement',
       'job_level', 'job_role', 'job_satisfaction', 'marital_status',
       'monthly_income', 'monthly_rate', 'num_companies_worked', 'overtime',
       'percent_salary_hike', 'performance_rating',
       'relationship_satisfaction', 'standard_hours', 'stock_option_level',
       'total_working_years', 'training_times_last_year', 'work_life_balance',
       'years_at_company', 'years_since_last_promotion',
       'years_with_curr_manager', 'date_birth', 'salary', 'remote_work'],
      dtype='object')

In [245]:
nulls(df,1,1)

Count of nulls
department             1312
hourly_rate            1210
business_travel         772
overtime                676
marital_status          651
total_working_years     526
monthly_income          468
salary                  274
performance_rating      195
work_life_balance       108
dtype: int64
-----------------------------------
share of nulls
department             81.288724
hourly_rate            74.969021
business_travel        47.831475
overtime               41.883519
marital_status         40.334572
total_working_years    32.589839
monthly_income         28.996283
salary                 16.976456
performance_rating     12.081784
work_life_balance       6.691450
dtype: float64


Unnamed: 0,var,perc_nulos
0,business_travel,47.831475
1,department,81.288724
2,hourly_rate,74.969021
3,marital_status,40.334572
4,monthly_income,28.996283
5,overtime,41.883519
6,performance_rating,12.081784
7,total_working_years,32.589839
8,work_life_balance,6.69145
9,salary,16.976456


In [246]:
df['department'].unique()

array([nan, 'research_&_development', 'sales', 'human_resources'],
      dtype=object)

In [247]:
df.columns

Index(['age', 'attrition', 'business_travel', 'daily_rate', 'department',
       'distance_from_home', 'education', 'education_field', 'employee_number',
       'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement',
       'job_level', 'job_role', 'job_satisfaction', 'marital_status',
       'monthly_income', 'monthly_rate', 'num_companies_worked', 'overtime',
       'percent_salary_hike', 'performance_rating',
       'relationship_satisfaction', 'standard_hours', 'stock_option_level',
       'total_working_years', 'training_times_last_year', 'work_life_balance',
       'years_at_company', 'years_since_last_promotion',
       'years_with_curr_manager', 'date_birth', 'salary', 'remote_work'],
      dtype='object')

In [248]:
df.to_csv("df_transformado.csv", index=False)