# IBM HR Analytics Employee Attrition & Performance

Descubra los factores que conducen a la deserción de los empleados y explore preguntas como "muéstre un desglose de la distancia desde el hogar al trabajo por función laboral y deserción" o "compare el ingreso mensual promedio por educación y deserción". Este es un conjunto de datos ficticios creado por científicos de datos de IBM.

Principales variables:

|º|Education| EnvironmentSatisfaction |JobInvolvement|PerformanceRating|RelationshipSatisfaction|WorkLifeBalance
|:--:|:------:|:------:|:------:|:------:|:------:|:------:|
|1|'Below College'| 'Low'|'Low'|'Low'|'Low'|'Bad'|
|2|'College'|'Medium'|'Medium'|'Good'|'Medium'|'Good'|
|3|'Bachelor'|'High'|'High'|'Excellent'|'High'| 'Better'|
|4|'Master'|'Very High'|'Very High'|'Outstanding'|'Very High'|'Best'|
|5|'Doctor'||





### Importo librerias

In [4]:
import numpy as np
import pandas as pd
import matplotlib as mpl            # Matplotlib
import matplotlib.pyplot as plt     # El modulo Pyplot de Matplotlib
import seaborn as sns               # Seaborn

mpl.style.use('bmh')    # Establecemos un nuevo estilo

from prettytable import PrettyTable

## 1) DATA PROCESS & EXPLORATORY DATA ANALYSIS (EDA)¶


In [2]:
# Cargamos el dataset
df = pd.read_csv('Datos/WA_Fn-UseC-HR-Employee-Attrition.csv')
pd.set_option('max_columns', None)
df.sample(6)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
630,22,No,Travel_Rarely,1230,Research & Development,1,2,Life Sciences,1,872,4,Male,33,2,2,Manufacturing Director,4,Married,4775,19146,6,Y,No,22,4,1,80,2,4,2,1,2,2,2,2
423,30,No,Non-Travel,1398,Sales,22,4,Other,1,567,3,Female,69,3,3,Sales Executive,1,Married,8412,2890,0,Y,No,11,3,3,80,0,10,3,3,9,8,7,8
134,26,No,Travel_Rarely,1355,Human Resources,25,1,Life Sciences,1,177,3,Female,61,3,1,Human Resources,3,Married,2942,8916,1,Y,No,23,4,4,80,1,8,3,3,8,7,5,7
366,41,Yes,Travel_Frequently,143,Sales,4,3,Marketing,1,488,1,Male,56,3,2,Sales Executive,2,Single,9355,9558,1,Y,No,18,3,3,80,0,8,5,3,8,7,7,7
657,29,No,Travel_Rarely,1086,Research & Development,7,1,Medical,1,912,1,Female,62,2,1,Laboratory Technician,4,Divorced,2532,6054,6,Y,No,14,3,3,80,3,8,5,3,4,3,0,3
1304,40,No,Travel_Rarely,750,Research & Development,12,3,Life Sciences,1,1829,2,Female,47,3,2,Healthcare Representative,1,Divorced,4448,10748,2,Y,No,12,3,2,80,1,15,3,3,7,4,7,7


In [5]:
# Generamos una descripcion de cada campo
def df_explore(df):
    #Description table of the main characteristics of each column from a dataset
    print( 'Shape: ', df.shape)
    
    t = PrettyTable(['Column', 
                     'Type',
                     'Non-Null',
                     'Nulls',
                     'Unique',
                     'Example',
                    ])
    
    for c in df.columns:
        t.add_row([c,
                   df[c].dtype,
                   len(df[c])-np.sum(df[c].isna()),
                   np.sum(df[c].isna()),
                   np.count_nonzero(df[c].unique()),
                   df[~df[c].isnull()][c].iloc[0],
                  ]) 
    print(t)
    print()
    return

df_explore(df)

Shape:  (1470, 35)
+--------------------------+--------+----------+-------+--------+-----------------+
|          Column          |  Type  | Non-Null | Nulls | Unique |     Example     |
+--------------------------+--------+----------+-------+--------+-----------------+
|           Age            | int64  |   1470   |   0   |   43   |        41       |
|        Attrition         | object |   1470   |   0   |   2    |       Yes       |
|      BusinessTravel      | object |   1470   |   0   |   3    |  Travel_Rarely  |
|        DailyRate         | int64  |   1470   |   0   |  886   |       1102      |
|        Department        | object |   1470   |   0   |   3    |      Sales      |
|     DistanceFromHome     | int64  |   1470   |   0   |   29   |        1        |
|        Education         | int64  |   1470   |   0   |   5    |        2        |
|      EducationField      | object |   1470   |   0   |   6    |  Life Sciences  |
|      EmployeeCount       | int64  |   1470   |   0   | 

**OBS:** No hay valores nulos

### 1.1: Verifico los valore únicos de algunas variables


In [9]:
# Print unique values for categorical variables:
print('Attrition: ', df['Attrition'].unique(), "\n")
print('BusinessTravel: ', df['BusinessTravel'].unique(), "\n")
print('Department:', df['Department'].unique(), "\n")
print('Education Field:', df['EducationField'].unique(), "\n")          
print('Gender:', df['Gender'].unique(), "\n")
print('JobRole:', df['JobRole'].unique(), "\n")
print('MaritalStatus:', df['MaritalStatus'].unique(), "\n")
print('Over18:', df['Over18'].unique(), "\n")
print('OverTime:', df['OverTime'].unique(), "\n")

Attrition:  ['Yes' 'No'] 

BusinessTravel:  ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel'] 

Department: ['Sales' 'Research & Development' 'Human Resources'] 

Education Field: ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources'] 

Gender: ['Female' 'Male'] 

JobRole: ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources'] 

MaritalStatus: ['Single' 'Married' 'Divorced'] 

Over18: ['Y'] 

OverTime: ['Yes' 'No'] 



In [84]:
# Generamos una pretty tabla con valores unicos

def df_unique_val_col(df, col_list = [] ):
    #Description table of the main characteristics of each column from a dataset
    
    #Cuento la máxima cantidad de valores unicos que tienen las variables/columnas
    cant_table_col = 0
    for i in col_list:
        l = len(df[i].unique())
        if cant_table_col < l:
            cant_table_col = l
    
    # Creo la lista de campos "value_x" para armar la prettytable
    A = ['Columna']
    for i in range(9):
        A.append('Value_'+str(i+1))
        
    B = []
    for c in df[L]:
        B.append(df[c].unique())
        #print(B)
        B = []
        
        dg = pd.DataFrame(index = df[col_list].columns,
                          columns = A)    
        
        for c in dg.index:
            dg[c] = 


        
    print()
    return dg 



In [85]:
# lista de prueba
L = ['Attrition','BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']
len(L)

9

In [86]:
df_unique_val_col(df,L)




Unnamed: 0,Columna,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9
Attrition,,,,,,,,,,
BusinessTravel,,,,,,,,,,
Department,,,,,,,,,,
EducationField,,,,,,,,,,
Gender,,,,,,,,,,
JobRole,,,,,,,,,,
MaritalStatus,,,,,,,,,,
Over18,,,,,,,,,,
OverTime,,,,,,,,,,


In [70]:
# probando
A = []
for c in df[L]:
    A.append(df[c].unique())
    print(A)
    A = []


[array(['Yes', 'No'], dtype=object)]
[array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], dtype=object)]
[array(['Sales', 'Research & Development', 'Human Resources'], dtype=object)]
[array(['Life Sciences', 'Other', 'Medical', 'Marketing',
       'Technical Degree', 'Human Resources'], dtype=object)]
[array(['Female', 'Male'], dtype=object)]
[array(['Sales Executive', 'Research Scientist', 'Laboratory Technician',
       'Manufacturing Director', 'Healthcare Representative', 'Manager',
       'Sales Representative', 'Research Director', 'Human Resources'],
      dtype=object)]
[array(['Single', 'Married', 'Divorced'], dtype=object)]
[array(['Y'], dtype=object)]
[array(['Yes', 'No'], dtype=object)]


In [118]:
A = []
for i in range(9):
    A.append('Value_'+str(i+1))
        
dg = pd.DataFrame( index = A, columns = df[L].columns)

dg

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
Value_1,,,,,,,,,
Value_2,,,,,,,,,
Value_3,,,,,,,,,
Value_4,,,,,,,,,
Value_5,,,,,,,,,
Value_6,,,,,,,,,
Value_7,,,,,,,,,
Value_8,,,,,,,,,
Value_9,,,,,,,,,


In [120]:
for i in L:
    
    uni = df[i].unique()
    l = len(uni)
    
    for j in range(l):
        dg[i][j] = uni[j]
dg.replace(np.nan, 0)       
dg

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
Value_1,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
Value_2,No,Travel_Frequently,Research & Development,Other,Male,Research Scientist,Married,,No
Value_3,,Non-Travel,Human Resources,Medical,,Laboratory Technician,Divorced,,
Value_4,,,,Marketing,,Manufacturing Director,,,
Value_5,,,,Technical Degree,,Healthcare Representative,,,
Value_6,,,,Human Resources,,Manager,,,
Value_7,,,,,,Sales Representative,,,
Value_8,,,,,,Research Director,,,
Value_9,,,,,,Human Resources,,,


In [123]:
dg.Attrition.loc['Value_6']


nan