In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw CSV into the working frame and preview its first rows.
csv_path = 'Dataset/general_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
df.shape  # (number of rows, number of columns) of the loaded frame

(4410, 24)

In [4]:
# Count missing values in each column.
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

#### Filling Null Values by mean and median

In [5]:
# Impute missing values from the data itself instead of hard-coded numbers,
# so the cell stays correct if the CSV changes.
# TotalWorkingYears -> column mean (about 11.28 on this dataset).
df['TotalWorkingYears'] = df['TotalWorkingYears'].fillna(df['TotalWorkingYears'].mean())
# NumCompaniesWorked -> column median (2 on this dataset).
df['NumCompaniesWorked'] = df['NumCompaniesWorked'].fillna(df['NumCompaniesWorked'].median())

In [6]:
# Re-check the per-column null counts after imputation (nothing is dropped here).

df.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

- No null values are present now.

In [7]:
# Keep only the first occurrence of each fully identical row.

is_repeat = df.duplicated()
df = df[~is_repeat]

In [8]:
df.shape  # compare with the earlier shape to see whether drop_duplicates() removed rows

(4410, 24)

- As the shape remains the same, there are no duplicate rows in this dataset.

In [9]:
df.info()  # per-column dtypes and non-null counts after cleaning

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

### Creating Dummy Variable of Attrition Column

In [10]:
# Encode Attrition as a single 0/1 indicator column named 'Yes' (1 = 'Yes').
# drop_first=True drops the first category level, leaving only 'Yes'.
# dtype=int pins integer output: pandas >= 2.0 defaults to bool, which would
# no longer match the 0/1 values this notebook displays and correlates.
dummy = pd.get_dummies(df['Attrition'], drop_first=True, dtype=int)
dummy

Unnamed: 0,Yes
0,0
1,1
2,0
3,0
4,0
...,...
4405,0
4406,0
4407,0
4408,0


In [11]:
# Append the indicator column to the right of the existing columns.
frames = [df, dummy]
df = pd.concat(frames, axis='columns')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Yes
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,Y,11,8,0,1.0,6,1,0,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,Y,23,8,1,6.0,3,5,1,4,1
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,Y,15,8,3,5.0,2,5,0,3,0
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,Y,11,8,3,13.0,5,8,7,5,0
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,Y,12,8,2,9.0,2,6,0,4,0


In [12]:
# Remove the original text Attrition column; the 'Yes' indicator replaces it.
df = df.drop(columns='Attrition')
df.head()

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Yes
0,51,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,...,Y,11,8,0,1.0,6,1,0,0,0
1,31,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,...,Y,23,8,1,6.0,3,5,1,4,1
2,32,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,...,Y,15,8,3,5.0,2,5,0,3,0
3,38,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,...,Y,11,8,3,13.0,5,8,7,5,0
4,32,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,...,Y,12,8,2,9.0,2,6,0,4,0


In [13]:
# Give the 0/1 indicator column the name of the variable it encodes.
df = df.rename(columns={'Yes': 'Attrition'})

In [14]:
df.head()  # confirm the indicator column is now named 'Attrition'

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,51,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,...,Y,11,8,0,1.0,6,1,0,0,0
1,31,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,...,Y,23,8,1,6.0,3,5,1,4,1
2,32,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,...,Y,15,8,3,5.0,2,5,0,3,0
3,38,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,...,Y,11,8,3,13.0,5,8,7,5,0
4,32,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,...,Y,12,8,2,9.0,2,6,0,4,0


### Importing Pearson Correlation Coefficient

In [15]:
from scipy.stats import pearsonr

### Finding Correlation between Attrition and Age

H0 : There is no correlation between the Attrition and Age of employee

H1 : There is Significant correlation between the Attrition and Age of employee

In [16]:
# Correlation coefficient and p-value for Attrition vs. Age.
stats, p = pearsonr(df['Attrition'], df['Age'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.15920500686577507 and p value is 1.996801615893171e-26


- The value of r lies between 0 and -1 (very close to zero).
- p < 0.05, so the Null Hypothesis is rejected and the Alternative Hypothesis is accepted.
- There is a Weak Negative Correlation between Age and Attrition.
- The Correlation between Age and Attrition is statistically Significant.

### Finding Correlation between Attrition and Distance from Home

H0 : There is no correlation between the Attrition and Distance from Home

H1 : There is Significant correlation between the Attrition and Distance from Home

In [17]:
# Correlation coefficient and p-value for Attrition vs. DistanceFromHome.
stats, p = pearsonr(df['Attrition'], df['DistanceFromHome'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.009730141010179438 and p value is 0.5182860428049617


- The value of r is negative but almost exactly zero, so the sample shows at most a negligible negative association between Distance from Home and Attrition.
- p value > 0.05.
- So, we fail to reject the Null Hypothesis, i.e.,
- There is no statistically significant Correlation between Distance from Home and Attrition.

### Finding Correlation between Attrition and Education

H0 : There is no correlation between the Attrition and Education

H1 : There is Significant correlation between the Attrition and Education

In [18]:
# Correlation coefficient and p-value for Attrition vs. Education.
stats, p = pearsonr(df['Attrition'], df['Education'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.015111167710968734 and p value is 0.3157293177118575


- The value of r is negative but almost exactly zero, so the sample shows at most a negligible negative association between Education and Attrition.
- p value > 0.05.
- So, we fail to reject the Null Hypothesis, i.e.,
- There is no statistically significant Correlation between Education and Attrition.

### Finding Correlation between Attrition and EmployeeID

H0 : There is no correlation between the Attrition and EmployeeID

H1 : There is Significant correlation between the Attrition and EmployeeID

In [19]:
# Correlation coefficient and p-value for Attrition vs. EmployeeID.
stats, p = pearsonr(df['Attrition'], df['EmployeeID'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.004729122995065954 and p value is 0.7535487401882808


- The value of r is negative but almost exactly zero, so the sample shows at most a negligible negative association between EmployeeID and Attrition.
- p value > 0.05.
- So, we fail to reject the Null Hypothesis, i.e.,
- There is no statistically significant Correlation between EmployeeID and Attrition.

### Finding Correlation between Attrition and Job Level

H0 : There is no correlation between the Attrition and Job Level

H1 : There is Significant correlation between the Attrition and Job Level

In [20]:
# Correlation coefficient and p-value for Attrition vs. JobLevel.
stats, p = pearsonr(df['Attrition'], df['JobLevel'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.010289713287495119 and p value is 0.49451717271817114


- The value of r is negative but almost exactly zero, so the sample shows at most a negligible negative association between Job Level and Attrition.
- p value > 0.05.
- So, we fail to reject the Null Hypothesis, i.e.,
- There is no statistically significant Correlation between Job Level and Attrition.

### Finding Correlation between Attrition and Monthly Income

H0 : There is no correlation between the Attrition and Monthly Income

H1 : There is Significant correlation between the Attrition and Monthly Income

In [21]:
# Correlation coefficient and p-value for Attrition vs. MonthlyIncome.
stats, p = pearsonr(df['Attrition'], df['MonthlyIncome'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.031176281698114025 and p value is 0.0384274849060192


- The value of r lies between 0 and -1, so there is a Weak Negative Correlation between Monthly Income and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between Monthly Income and Attrition.

### Finding Correlation between Attrition and NumCompaniesWorked

H0 : There is no correlation between the Attrition and NumCompaniesWorked

H1 : There is Significant correlation between the Attrition and NumCompaniesWorked

In [22]:
# Correlation coefficient and p-value for Attrition vs. NumCompaniesWorked.
stats, p = pearsonr(df['Attrition'], df['NumCompaniesWorked'])
print(f'The r value is {stats} and p value is {p}')

The r value is 0.04206354468973415 and p value is 0.0052093756801436086


- The value of r lies between 0 and 1, so there is a Weak Positive Correlation between NumCompaniesWorked and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between NumCompaniesWorked and Attrition.

### Finding Correlation between Attrition and PercentSalaryHike

H0 : There is no correlation between the Attrition and PercentSalaryHike

H1 : There is Significant correlation between the Attrition and PercentSalaryHike

In [23]:
# Correlation coefficient and p-value for Attrition vs. PercentSalaryHike.
stats, p = pearsonr(df['Attrition'], df['PercentSalaryHike'])
print(f'The r value is {stats} and p value is {p}')

The r value is 0.0325325948910522 and p value is 0.030743386433369824


- The value of r lies between 0 and 1, so there is a Weak Positive Correlation between PercentSalaryHike and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between PercentSalaryHike and Attrition.

### Finding Correlation between Attrition and StockOptionLevel

H0 : There is no correlation between the Attrition and StockOptionLevel

H1 : There is Significant correlation between the Attrition and StockOptionLevel

In [24]:
# Correlation coefficient and p-value for Attrition vs. StockOptionLevel.
stats, p = pearsonr(df['Attrition'], df['StockOptionLevel'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.0068388524032610195 and p value is 0.6498072937477383


- The value of r is negative but almost exactly zero, so the sample shows at most a negligible negative association between Stock Option Level and Attrition.
- p value > 0.05.
- So, we fail to reject the Null Hypothesis, i.e.,
- There is no statistically significant Correlation between Stock Option Level and Attrition.

### Finding Correlation between Attrition and Total Working Years

H0 : There is No Correlation between the Attrition and Total Working Years

H1 : There is Significant Correlation between the Attrition and Total Working Years

In [25]:
# Correlation coefficient and p-value for Attrition vs. TotalWorkingYears.
stats, p = pearsonr(df['Attrition'], df['TotalWorkingYears'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.17011136355964646 and p value is 5.4731597518148054e-30


- The value of r lies between 0 and -1, so there is a Weak Negative Correlation between Total Working Years and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between Total Working Years and Attrition.

### Finding Correlation between Attrition and Training Times LastYear

H0 : There is no correlation between the Attrition and Training Times LastYear

H1 : There is Significant correlation between the Attrition and Training Times LastYear

In [26]:
# Correlation coefficient and p-value for Attrition vs. TrainingTimesLastYear.
stats, p = pearsonr(df['Attrition'], df['TrainingTimesLastYear'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.049430576244253066 and p value is 0.001024706191536548


- The value of r lies between 0 and -1, so there is a Weak Negative Correlation between Training Times Last Year and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between Training Times Last Year and Attrition.

### Finding Correlation between Attrition and Years At Company

H0 : There is No Correlation between the Attrition and Years At Company

H1 : There is Significant Correlation between the Attrition and Years At Company

In [27]:
# Correlation coefficient and p-value for Attrition vs. YearsAtCompany.
stats, p = pearsonr(df['Attrition'], df['YearsAtCompany'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.13439221398997386 and p value is 3.163883122493571e-19


- The value of r lies between 0 and -1, so there is a Weak Negative Correlation between Years At Company and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between Years At Company and Attrition.

### Finding Correlation between Attrition and Years Since Last Promotion

H0 : There is No Correlation between the Attrition and Years Since Last Promotion

H1 : There is Significant Correlation between the Attrition and Years Since Last Promotion

In [28]:
# Correlation coefficient and p-value for Attrition vs. YearsSinceLastPromotion.
stats, p = pearsonr(df['Attrition'], df['YearsSinceLastPromotion'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.03301877514258329 and p value is 0.02833033618939086


- The value of r lies between 0 and -1, so there is a Weak Negative Correlation between Years Since Last Promotion and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between Years Since Last Promotion and Attrition.

### Finding Correlation between Attrition and Years With Current Manager

H0 : There is No Correlation between the Attrition and Years With Current Manager

H1 : There is Significant Correlation between the Attrition and Years With Current Manager

In [29]:
# Correlation coefficient and p-value for Attrition vs. YearsWithCurrManager.
stats, p = pearsonr(df['Attrition'], df['YearsWithCurrManager'])
print(f'The r value is {stats} and p value is {p}')

The r value is -0.15619931590162422 and p value is 1.7339322652951965e-25


- The value of r lies between 0 and -1, so there is a Weak Negative Correlation between Years With Current Manager and Attrition.
- p value < 0.05.
- So, the Null Hypothesis is rejected and the Alternative Hypothesis is accepted, i.e.,
- There is a Significant Correlation between Years With Current Manager and Attrition.

### Correlation Matrix

In [30]:
# Correlation matrix over the numeric (and boolean) columns only.
# The frame still holds text columns (e.g. BusinessTravel, Department); since
# pandas 2.0 DataFrame.corr() no longer drops them silently and raises instead,
# so the numeric subset is selected explicitly. Constant columns
# (EmployeeCount, StandardHours) still produce NaN rows, as before.
df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
Age,1.0,0.006963,-0.035706,,0.008649,-0.002884,-0.044314,0.298869,-0.033137,,-0.031753,0.680419,-0.027308,0.311309,0.216513,0.202089,-0.159205
DistanceFromHome,0.006963,1.0,-0.008638,,-0.001097,-0.037329,-0.021607,-0.013514,0.038125,,0.011169,0.009361,-0.009001,0.031684,0.00229,0.021584,-0.00973
Education,-0.035706,-0.008638,1.0,,-0.00968,0.045746,0.00641,-0.016195,-0.040531,,0.001261,-0.010712,0.010472,0.00608,0.02249,0.005358,-0.015111
EmployeeCount,,,,,,,,,,,,,,,,,
EmployeeID,0.008649,-0.001097,-0.00968,,1.0,-0.003303,0.007338,-0.000789,-0.004456,,-0.014254,-0.001061,-0.010191,0.004086,0.000256,0.008579,-0.004729
JobLevel,-0.002884,-0.037329,0.045746,,-0.003303,1.0,0.047316,-0.010116,0.010973,,0.000993,-0.036934,-0.0325,-0.064219,-0.060811,-0.055251,-0.01029
MonthlyIncome,-0.044314,-0.021607,0.00641,,0.007338,0.047316,1.0,-0.020515,0.004325,,0.02693,-0.033758,0.050112,0.000995,0.065219,0.024304,-0.031176
NumCompaniesWorked,0.298869,-0.013514,-0.016195,,-0.000789,-0.010116,-0.020515,1.0,0.031062,,0.017282,0.238381,-0.032225,-0.11749,-0.036326,-0.109207,0.042064
PercentSalaryHike,-0.033137,0.038125,-0.040531,,-0.004456,0.010973,0.004325,0.031062,1.0,,0.012548,-0.018717,-0.037392,-0.029707,-0.029542,-0.040864,0.032533
StandardHours,,,,,,,,,,,,,,,,,
