In [5]:
# Import the libraries we need

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Hide warnings to avoid unnecessary messages
# NOTE(review): "ignore" with no category filter silences every warning, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")

# Configuration
pd.set_option('display.max_columns', None) # so that every column of a DataFrame is displayed

In [6]:
# First, load the dataframe we are going to work with
# NOTE(review): the 'Unnamed: 0' column looks like a row index that was saved into the CSV — confirm,
# and if so consider reading the file with index_col=0.
df = pd.read_csv("spaces_hr_raw_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30.0,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,,1971,"199990,00$",,,1
2,2,42,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,"41669,33$",1,,No,11,30,4,,0,220.0,3,,22,,11,15,,1981,"192320,00$",ManaGER - Research & Development,,1
3,3,47,No,travel_rarely,1771.404762,,2,4,Medical,1,4,1,1,,3,4,ReseArCH DIrECtOr,3,Married,"14307,50$","37199,50$",3,Y,,19,30,2,Full Time,2,,2,,20,,5,6,"14307,50$",1976,"171690,00$",,,False
4,4,46,No,,1582.771346,,3,3,Technical Degree,1,5,1,1,,4,4,sAleS EXECUtIve,1,Divorced,"12783,92$","33238,20$",2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,"12783,92$",1977,,,,0


In [7]:
def shape(df):
    """Print the dimensions of ``df`` followed by its column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to report on; its ``shape`` must have two entries.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print('Shape')
    dims = df.shape
    print(dims)
    n_rows, n_cols = dims[0], dims[1]
    print(f"The number of rows is {n_rows}, and the number of columns is {n_cols}")
    print('---------------------')
    # Header first, then the Index repr of the column labels.
    for item in ('The columns are:', df.columns):
        print(item)

In [8]:
# Print the dimensions and column labels of the loaded frame
shape(df)

Shape
(1678, 42)
The number of rows is 1678, and the number of columns is 42
---------------------
The columns are:
Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'over18', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager',
       'sameasmonthlyincome', 'datebirth', 'salary', 'roledepartament',
       'numberchildren', 'remotework'],
      dtype='object')


In [9]:
# Column-by-column overview: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1678 entries, 0 to 1677
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1678 non-null   int64  
 1   age                       1678 non-null   object 
 2   attrition                 1678 non-null   object 
 3   businesstravel            877 non-null    object 
 4   dailyrate                 1678 non-null   float64
 5   department                312 non-null    object 
 6   distancefromhome          1678 non-null   int64  
 7   education                 1678 non-null   int64  
 8   educationfield            904 non-null    object 
 9   employeecount             1678 non-null   int64  
 10  employeenumber            1678 non-null   int64  
 11  environmentsatisfaction   1678 non-null   int64  
 12  gender                    1678 non-null   int64  
 13  hourlyrate                411 non-null    float64
 14  jobinvol

In [10]:
# Key variables: environmentsatisfaction, jobinvolvement
# Explanatory variables: monthlyincome, overtime, worklifebalance, remotework, yearsatcompany, yearsincurrentrole, yearssincelastpromotion, percentsalaryhike, gender, salary, dailyrate, hourlyrate, standardhours,
#     numberchildren, businesstravel, distancefromhome, education, educationfield, age, trainingtimeslastyear, stockoptionlevel
# Outcome variables: attrition, jobsatisfaction
# Identifier variables: employeenumber

In [11]:
# Column labels of df (the same listing that shape(df) prints)
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'over18', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearsincurrentrole',
       'yearssincelastpromotion', 'yearswithcurrmanager',
       'sameasmonthlyincome', 'datebirth', 'salary', 'roledepartament',
       'numberchildren', 'remotework'],
      dtype='object')

In [12]:
# Summary statistics of the numeric columns, transposed so that each variable is one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,1678.0,838.5,484.541192,0.0,419.25,838.5,1257.75,1677.0
dailyrate,1678.0,668.079714,470.787298,104.103175,290.03551,556.256661,971.956349,2063.388889
distancefromhome,1678.0,4.504172,14.652066,-49.0,2.0,5.0,11.0,29.0
education,1678.0,2.932658,1.02427,1.0,2.0,3.0,4.0,5.0
employeecount,1678.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
employeenumber,1678.0,809.859952,467.084867,1.0,403.25,813.5,1215.75,1614.0
environmentsatisfaction,1678.0,4.264005,6.912695,1.0,2.0,3.0,4.0,49.0
gender,1678.0,0.398689,0.489774,0.0,0.0,0.0,1.0,1.0
hourlyrate,411.0,83.140768,57.272101,13.012897,36.254439,69.532083,116.987103,255.963294
jobinvolvement,1678.0,2.740763,0.710359,1.0,2.0,3.0,3.0,4.0


In [13]:
# Percentage of missing values in each column
df.isna().sum().div(df.shape[0]).mul(100)

Unnamed: 0                    0.000000
age                           0.000000
attrition                     0.000000
businesstravel               47.735399
dailyrate                     0.000000
department                   81.406436
distancefromhome              0.000000
education                     0.000000
educationfield               46.126341
employeecount                 0.000000
employeenumber                0.000000
environmentsatisfaction       0.000000
gender                        0.000000
hourlyrate                   75.506555
jobinvolvement                0.000000
joblevel                      0.000000
jobrole                       0.000000
jobsatisfaction               0.000000
maritalstatus                40.226460
monthlyincome                29.141836
monthlyrate                   0.000000
numcompaniesworked            0.000000
over18                       55.899881
overtime                     41.477950
percentsalaryhike             0.000000
performancerating        

In [14]:
# Variables with null values
def nulls(df,count=0,share=0):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    count : int, default 0
        When equal to 1, print the number of nulls per affected column,
        largest first, followed by a separator line.
    share : int, default 0
        When equal to 1, print the percentage of nulls per affected column,
        largest first.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, in the column order
        of ``df``: ``var`` holds the column label and ``perc_nulos`` the
        percentage of rows that are null.
    """
    # Count once; the percentage is derived from the same counts.
    null_counts = df.isnull().sum()
    null_share = null_counts / df.shape[0] * 100
    share_with_nulls = null_share[null_share > 0]

    if count == 1:
        print('Count of nulls')
        print (null_counts[null_counts > 0].sort_values(ascending=False))
        print('-----------------------------------')

    if share == 1:
        print('share of nulls')
        print (share_with_nulls.sort_values(ascending=False))

    nulls_list = share_with_nulls.to_frame(name='perc_nulos')
    if nulls_list.index.nlevels == 1:
        # Name the index before resetting it. Renaming the default 'index'
        # column afterwards misses the case where the frame's columns axis
        # already carries a name, which left the result without a 'var' column.
        nulls_list = nulls_list.rename_axis('var')

    return nulls_list.reset_index()

In [15]:
# Print both null reports (counts and percentages) and display the returned summary table
nulls(df,1,1)

Count of nulls
numberchildren         1678
yearsincurrentrole     1643
roledepartament        1366
department             1366
hourlyrate             1267
over18                  938
businesstravel          801
educationfield          774
overtime                696
maritalstatus           675
totalworkingyears       549
monthlyincome           489
sameasmonthlyincome     489
standardhours           351
salary                  285
performancerating       200
worklifebalance         114
dtype: int64
-----------------------------------
share of nulls
numberchildren         100.000000
yearsincurrentrole      97.914184
roledepartament         81.406436
department              81.406436
hourlyrate              75.506555
over18                  55.899881
businesstravel          47.735399
educationfield          46.126341
overtime                41.477950
maritalstatus           40.226460
totalworkingyears       32.717521
monthlyincome           29.141836
sameasmonthlyincome     29.141836
sta

Unnamed: 0,var,perc_nulos
0,businesstravel,47.735399
1,department,81.406436
2,educationfield,46.126341
3,hourlyrate,75.506555
4,maritalstatus,40.22646
5,monthlyincome,29.141836
6,over18,55.899881
7,overtime,41.47795
8,performancerating,11.918951
9,standardhours,20.917759


In [16]:
# Non-null count, number of distinct values, most frequent value and its frequency for department
df['department'].describe()

count                          312
unique                           3
top        Research & Development 
freq                           203
Name: department, dtype: object

In [17]:
# Frequency of each department among the non-null rows
df['department'].value_counts()

department
Research & Development     203
Sales                       93
Human Resources             16
Name: count, dtype: int64

In [18]:
# Frequency of each roledepartament label among the non-null rows
# NOTE(review): the labels in the output use inconsistent letter casing, which appears to split
# otherwise identical role/department pairs into separate entries — normalise case before counting.
df['roledepartament'].value_counts()

roledepartament
rESEArcH SCiEntIst  -  Research & Development            2
MaNAgeR  -  Sales                                        2
hEalthCaRe reprEseNTaTiVe  -  Research & Development     2
SaleS EXeCuTivE  -  Sales                                2
labORAtoRy tEcHNICIAN  -  Research & Development         2
                                                        ..
labORaTORY teChNIcIaN  -  Research & Development         1
saLES eXEcUTiVE  -  Sales                                1
mANUfacTURiNG dIRectOR  -  Research & Development        1
huMAn ResOurces  -  Human Resources                      1
sAleS EXECUtIvE  -  Sales                                1
Name: count, Length: 301, dtype: int64

In [19]:
# Frequency of each yearsincurrentrole value among the non-null rows
# NOTE(review): the values print as '2,0', '7,0', ... — decimal-comma text rather than numbers.
df['yearsincurrentrole'].value_counts()

yearsincurrentrole
2,0     12
7,0      5
0,0      4
4,0      3
1,0      3
11,0     2
6,0      2
3,0      2
13,0     1
12,0     1
Name: count, dtype: int64

In [20]:
# Summary statistics of hourlyrate, computed over its non-null values
df['hourlyrate'].describe()

count    411.000000
mean      83.140768
std       57.272101
min       13.012897
25%       36.254439
50%       69.532083
75%      116.987103
max      255.963294
Name: hourlyrate, dtype: float64

In [13]:
# Number of rows that repeat an earlier row.
# 'Unnamed: 0' is left out of the comparison: in the describe output it runs from 0 to 1677 over
# 1678 rows, so it looks like a per-row counter, and including it would make every row distinct
# and the check always return 0. errors='ignore' keeps the cell working if that column is absent.
df.drop(columns=['Unnamed: 0'], errors='ignore').duplicated().sum()

0

In [21]:
def unique(df):
    """Print and return the distinct column dtypes of ``df`` as strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list of str
        ``str()`` of each distinct dtype, ordered by first appearance
        across the columns.
    """
    seen = []
    for column_dtype in df.dtypes:
        label = str(column_dtype)
        # Keep only the first occurrence so the original column order is preserved.
        if label not in seen:
            seen.append(label)

    print(seen)
    return seen

In [24]:
# Collect the distinct dtype names of df.
# NOTE(review): this rebinds the name `unique` from the function to the list it returns, so
# running this cell a second time calls a list and raises TypeError. A later cell passes
# `unique` as that list, so renaming the variable here means updating that call as well.
unique = unique(df)

['int64', 'object', 'float64']


In [23]:
def description_by_type(df, types_list):
    """Print a transposed ``describe`` table for each dtype in ``types_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    types_list : iterable
        Dtype selectors; each one is passed on its own to
        ``df.describe(include=[...])``.

    Returns
    -------
    None
        For every entry a header line, the summary table and a separator
        line are written to standard output.
    """
    divider = '-----------------------------------------'
    for dtype_name in types_list:
        print('Description of variables of type', dtype_name)
        summary = df.describe(include=[dtype_name]).transpose()
        print(summary)
        print(divider)

In [25]:
# Describe the columns one dtype at a time. `unique` here is the list of dtype names
# assigned by `unique = unique(df)`, not the function of the same name.
description_by_type(df,unique)

Description of variables of type int64
                           count         mean         std     min      25%  \
Unnamed: 0                1678.0   838.500000  484.541192     0.0   419.25   
distancefromhome          1678.0     4.504172   14.652066   -49.0     2.00   
education                 1678.0     2.932658    1.024270     1.0     2.00   
employeecount             1678.0     1.000000    0.000000     1.0     1.00   
employeenumber            1678.0   809.859952  467.084867     1.0   403.25   
environmentsatisfaction   1678.0     4.264005    6.912695     1.0     2.00   
gender                    1678.0     0.398689    0.489774     0.0     0.00   
jobinvolvement            1678.0     2.740763    0.710359     1.0     2.00   
joblevel                  1678.0     2.064362    1.099425     1.0     1.00   
jobsatisfaction           1678.0     2.743147    1.105172     1.0     2.00   
numcompaniesworked        1678.0     2.670441    2.501133     0.0     1.00   
percentsalaryhike        

In [44]:
# Drop three columns; in the null report they are 100 % (numberchildren), 55.9 % (over18)
# and 97.9 % (yearsincurrentrole) null.
# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError because
# the columns are already gone from df.
df = df.drop(columns=['numberchildren', 'over18', 'yearsincurrentrole'], errors='ignore')

In [45]:
# Column labels that remain after the drop
df.columns

Index(['Unnamed: 0', 'age', 'attrition', 'businesstravel', 'dailyrate',
       'department', 'distancefromhome', 'education', 'educationfield',
       'employeecount', 'employeenumber', 'environmentsatisfaction', 'gender',
       'hourlyrate', 'jobinvolvement', 'joblevel', 'jobrole',
       'jobsatisfaction', 'maritalstatus', 'monthlyincome', 'monthlyrate',
       'numcompaniesworked', 'overtime', 'percentsalaryhike',
       'performancerating', 'relationshipsatisfaction', 'standardhours',
       'stockoptionlevel', 'totalworkingyears', 'trainingtimeslastyear',
       'worklifebalance', 'yearsatcompany', 'yearssincelastpromotion',
       'yearswithcurrmanager', 'sameasmonthlyincome', 'datebirth', 'salary',
       'roledepartament', 'remotework'],
      dtype='object')

In [47]:
# (rows, columns) of df after the drop
df.shape

(1678, 39)