In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mlt
import matplotlib.pyplot as plt
from tabulate import tabulate

## EDA for Employee Attrition

In [None]:
df=pd.read_csv("DS1_C6_S2_EmployeeAttrition_Data_Concept.csv")

In [None]:
df

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
print(df.columns)

### Displaying the dataset information

In [None]:
df.info()

In [None]:
df.isnull().sum()

###  Level-1 Analysis

In [None]:
def seperate_data_types(df, threshold=50):
    """Split the columns of ``df`` into categorical and continuous groups.

    The split is based purely on cardinality, not on dtype: a column with
    fewer than ``threshold`` distinct non-null values is treated as
    categorical, every other column as continuous.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    threshold : int, default 50
        Exclusive upper bound on the number of distinct values for a column
        to count as categorical.

    Returns
    -------
    tuple[list, list]
        ``(categorical, continuous)`` column labels, each in the order the
        columns appear in ``df``.
    """
    categorical=[]
    continuous=[]
    for column in df.columns:
        # nunique() does not count missing values by default.
        if df[column].nunique()<threshold:
            categorical.append(column)
        else:
            continuous.append(column)
    return categorical,continuous

In [None]:
# Classify every column of df, then print the two groups side by side.
categorical,continuous=seperate_data_types(df)

# `tabulate` is already imported in the notebook's first cell, so it is not
# re-imported here.
print(tabulate({"Categorical":categorical,
                "continuous": continuous}, headers = ["categorical", "continuous"]))

In [None]:
def info_of_cat(col, data=None):
    """Print a short summary of one categorical column.

    Prints the distinct values, the most frequent value (mode) and the
    number of missing entries of the column.

    Parameters
    ----------
    col : hashable
        Label of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, so existing ``info_of_cat('X')`` calls keep working.
    """
    frame = df if data is None else data
    series = frame[col]
    modes = series.mode()
    # mode() is empty when the column holds no non-null values; report None
    # instead of failing on modes[0].
    most_common = modes.iloc[0] if not modes.empty else None
    print(f"Unique values of {col} are:{series.unique()}")
    print(f"Mode of {col} is:{most_common}")
    print(f"Number of missing values in {col} is {series.isnull().sum()}")

### Analyzing 'Attrition' column

In [None]:
info_of_cat('Attrition')

In [None]:
# Left panel: count per Attrition value. Right panel: share of each value.
fig, ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('Attrition')
percentage=df['Attrition'].value_counts()
labels=list(percentage.index)

sns.countplot(x=df["Attrition"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(percentage, labels=labels, autopct="%0.2f%%")

plt.show()

#### Interpretation - Almost 84% of employees have continued working with the company, whereas 16% have left (attrited).

In [None]:
per_Attrn=df['Attrition'].value_counts()
per_Attrn

In [None]:
labels=list(per_Attrn.index)
labels

### JobInvolvement column analysis

In [None]:
info_of_cat('JobInvolvement')

In [None]:
# Left panel: count per JobInvolvement rating. Right panel: share of each rating.
fig, ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('JobInvolvement')
percentage=df['JobInvolvement'].value_counts()
labels=list(percentage.index)

sns.countplot(x=df["JobInvolvement"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(percentage, labels=labels, autopct="%0.2f%%")

plt.show()

##### Majority of the employees about 59% have been rated to 3 out of 4 for their job involvement

#### MaritalStatus Analysis

In [None]:
info_of_cat('MaritalStatus')

In [None]:
# Fill missing marital status with 'Married'. The result is assigned back to
# the column: calling fillna(..., inplace=True) on a column selected from df is
# a chained in-place update, which pandas 2.2 warns about and which no longer
# modifies df once Copy-on-Write is enabled.
df['MaritalStatus']=df['MaritalStatus'].fillna('Married')

In [None]:
info_of_cat('MaritalStatus')

In [None]:
# Left panel: count per MaritalStatus. Right panel: share of each status.
fig, ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('MaritalStatus')
percentage=df['MaritalStatus'].value_counts()
labels=list(percentage.index)

sns.countplot(x=df["MaritalStatus"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(percentage, labels=labels, autopct="%0.2f%%")

plt.show()

### About 48% of the company's employees are married, 30% are single, and the rest are divorced

## Analysis For Number of Companies Worked

In [None]:
info_of_cat('NumCompaniesWorked')

In [None]:
# Left panel: count per NumCompaniesWorked value. Right panel: share of each value.
fig, ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('NumCompaniesWorked')
percentage=df['NumCompaniesWorked'].value_counts()
labels=list(percentage.index)

sns.countplot(x=df["NumCompaniesWorked"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(percentage, labels=labels, autopct="%0.2f%%")

plt.show()

### The data shows that 35% of employees have worked for only one company

## WorkLife Balance column Analysis

In [None]:
info_of_cat('WorkLifeBalance')

In [None]:
# Left panel: count per WorkLifeBalance rating. Right panel: share of each rating.
fig, ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('WorkLifeBalance')
percentage=df['WorkLifeBalance'].value_counts()
labels=list(percentage.index)

sns.countplot(x=df["WorkLifeBalance"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(percentage, labels=labels, autopct="%0.2f%%")

plt.show()

### Interpretation - The majority of employees rated their work-life balance 3 (the ratings analysed later in this notebook run from 1 to 4)

In [None]:
def info_of_numerical(col, data=None):
    """Print summary statistics for one numerical column.

    Prints the mean, median, mode, standard deviation and number of missing
    values of the column.

    Parameters
    ----------
    col : hashable
        Label of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, so existing ``info_of_numerical('X')`` calls keep
        working.
    """
    frame = df if data is None else data
    series = frame[col]
    modes = series.mode()
    # mode() is empty when the column holds no non-null values; report None
    # instead of failing on modes[0].
    most_common = modes.iloc[0] if not modes.empty else None
    print(f"The mean of the {col} is {series.mean()}")
    print(f"The median of the {col} is {series.median()}")
    print(f"The mode of the {col} is {most_common}")
    print(f"The standard deviation of the {col} is {series.std()}")
    print(f"Number of missing values in the {col} is {series.isnull().sum()}")

## Employee Age analysis

In [None]:
info_of_numerical('Age')

In [None]:
df['Age'].median()

In [None]:
# Fill missing ages with the median age. The result is assigned back to the
# column: fillna(..., inplace=True) on a column selected from df is a chained
# in-place update that pandas 2.2 warns about and that no longer modifies df
# once Copy-on-Write is enabled.
df['Age']=df['Age'].fillna(df['Age'].median())

In [None]:
info_of_numerical('Age')

In [None]:
fig, ax=plt.subplots(1,2,figsize=(15,7))
sns.histplot(x=df["Age"], ax=ax[0],color='b')
sns.boxplot(x=df["Age"], ax=ax[1],color='green')

#### Interpretation - The majority of the employees working for the company are between 25 and 40 years of age

## Business Travel Analysis

In [None]:
info_of_cat('BusinessTravel')

In [None]:
# Fill missing BusinessTravel values with the most frequent category. Assigned
# back to the column because a chained fillna(..., inplace=True) is deprecated
# (pandas 2.2) and does not update df under Copy-on-Write.
df['BusinessTravel']=df['BusinessTravel'].fillna(df['BusinessTravel'].mode()[0])

In [None]:
info_of_cat('BusinessTravel')

In [None]:
# Left panel: count per BusinessTravel category. Right panel: share of each category.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('Business Travel Status')
percentage=df['BusinessTravel'].value_counts()
sns.countplot(x=df['BusinessTravel'],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=percentage.index,autopct="%0.02f%%")
plt.show()

### Interpretation - Out of total number of employees 18% travel frequently, 72% travel rarely and 10% never travel for the company.

## Departments Analysis

In [None]:
info_of_cat('Department')

In [None]:
# Fill missing Department values with the most frequent department. Assigned
# back to the column because a chained fillna(..., inplace=True) is deprecated
# (pandas 2.2) and does not update df under Copy-on-Write.
df['Department']=df['Department'].fillna(df['Department'].mode()[0])

In [None]:
info_of_cat('Department')

In [None]:
# Left panel: count per Department. Right panel: share of each department.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('Departments')
percentage=df['Department'].value_counts()
labels=percentage.index
sns.countplot(x=df["Department"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=labels,autopct="%0.02f%%")

plt.show()



### Interpretation - More than 65% of employees are engaged in Research & Development and more than 30% are engaged in Sales department.

## Analysis for Distance from home

In [None]:
info_of_cat("DistanceFromHome")

In [None]:
df["DistanceFromHome"].max()

In [None]:
df["DistanceFromHome"].min()

In [None]:
# Bucket DistanceFromHome into four ranges and count employees per range.
# Intervals are right-closed: [0, 5], (5, 10], (10, 20], (20, inf).
# The previous hand-written loop sent everything that failed the first three
# tests to the last bucket, so missing values and distances below 1 were
# counted as '21-30kms'. pd.cut leaves missing or negative values unbinned,
# and a distance of 0 now lands in '0-5kms', matching that label.
dist_range=['0-5kms','6-10kms','11-20kms','21-30kms']
dist_edges=[0,5,10,20,float('inf')]
dist_bucket=pd.cut(df['DistanceFromHome'],bins=dist_edges,labels=dist_range,include_lowest=True)
# reindex keeps the bucket order fixed and reports 0 for an empty bucket.
count=dist_bucket.value_counts().reindex(dist_range,fill_value=0).tolist()
# Keep the per-bucket counters available under their original names.
a,b,c,d=count
df1=pd.DataFrame(count,index=dist_range)

In [None]:
df1.columns=["Count_of_employees"]
df1

In [None]:
dist=df1.index.tolist()

In [None]:
countofemployees=df1.iloc[:,0].tolist()

In [None]:
fig,ax=plt.subplots(figsize=(5,5))
ax.set_title("Distance From Home and Count of Employees Respectively")
percentage=countofemployees
labels=dist
plt.pie(x=percentage,labels=labels,autopct="%0.02f%%")
plt.show()

In [None]:
plt.title("Distance From Home")
plt.bar(dist,countofemployees)
plt.show()

#### Interpretation - As above pie and bar chart indicates employees 42% reside in 0-5kms, 27% reside within 6-10kms,17% reside within 11-20kms, 14% reside within 21-30kms distance from company. So number of employees decrease as the distance between company and employees from home increases.

## Analysis for JobRole

In [None]:
info_of_cat('JobRole')

In [None]:
# Fill missing JobRole values with the most frequent role. Assigned back to
# the column because a chained fillna(..., inplace=True) is deprecated
# (pandas 2.2) and does not update df under Copy-on-Write.
df['JobRole']=df['JobRole'].fillna(df['JobRole'].mode()[0])

In [None]:
info_of_cat('JobRole')

In [2]:
# Left panel: count per JobRole. Right panel: share of each role.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('JobRole')
percentage=df['JobRole'].value_counts()
labels=percentage.index
g=sns.countplot(x=df["JobRole"],ax=ax[0])
# Only rotate the tick labels seaborn already placed. Overwriting them with
# `labels` (value_counts order, i.e. most frequent first) would attach the
# wrong role name to bars that are drawn in the order roles appear in the data.
g.tick_params(axis='x',labelrotation=45)
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=labels,autopct="%0.02f%%")
plt.show()

NameError: name 'plt' is not defined

#### Interpretation - Sales Executives, Research Scientists and Laboratory Technicians together make up 60% of all employees, of which Sales Executives account for almost 23%.

## Analysis for OverTime

In [None]:
info_of_cat('OverTime')

In [None]:
# Left panel: count per OverTime value. Right panel: share of each value.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('OverTime')
percentage=df['OverTime'].value_counts()
labels=percentage.index
sns.countplot(x=df["OverTime"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=labels,autopct="%0.02f%%")
plt.show()

#### Only 28.3% of employees are working over time as per the data.

## Analysis of Employees Performance Rating

In [None]:
info_of_cat('PerformanceRating')

In [None]:
# Left panel: count per PerformanceRating. Right panel: share of each rating.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('PerformanceRating')
percentage=df['PerformanceRating'].value_counts()
labels=percentage.index
sns.countplot(x=df["PerformanceRating"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=labels,autopct="%0.02f%%")
plt.show()

#### Interpretation - More than 84% of employees have been rated 3 out of 5 for their performance, and the others have been rated 4 out of 5

## Analysis considering Gender

In [None]:
info_of_cat('Gender')

In [None]:
# Fill missing Gender values with the most frequent value. Assigned back to
# the column because a chained fillna(..., inplace=True) is deprecated
# (pandas 2.2) and does not update df under Copy-on-Write.
df['Gender']=df['Gender'].fillna(df['Gender'].mode()[0])

In [None]:
# Left panel: count per Gender. Right panel: share of each gender.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('Gender')
percentage=df['Gender'].value_counts()
labels=percentage.index
sns.countplot(x=df["Gender"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=labels,autopct="%0.02f%%")
plt.show()

#### Interpretation - Company is having approximately 61% of Male and 39% of Female employees.

## TrainingTimesLastYear Analysis

In [None]:
info_of_cat('TrainingTimesLastYear')

In [None]:
# Left panel: count per TrainingTimesLastYear value. Right panel: share of each value.
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title('TrainingTimesLastYear')
percentage=df['TrainingTimesLastYear'].value_counts()
labels=percentage.index
sns.countplot(x=df["TrainingTimesLastYear"],ax=ax[0])
# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes".
ax[1].pie(x=percentage,labels=labels,autopct="%0.02f%%")
plt.show()

#### Interpretation - TrainingTimesLastYear counts how many trainings an employee attended last year. Around 70% of employees attended 2-3 trainings and most of the others attended more than 3, with the exception of about 8.5% of employees who attended one training or none

## Analysis for Numerical or Continuous Data columns

In [None]:
# NOTE: a function with this name is also defined earlier in the notebook;
# this later definition is the one in effect for every call below it.
def info_of_numerical(col, data=None):
    """Print summary statistics for one numerical column.

    Prints the mean, median, mode, standard deviation and number of missing
    values of the column.

    Parameters
    ----------
    col : hashable
        Label of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, so existing ``info_of_numerical('X')`` calls keep
        working.
    """
    frame = df if data is None else data
    series = frame[col]
    modes = series.mode()
    # mode() is empty when the column holds no non-null values; report None
    # instead of failing on modes[0].
    most_common = modes.iloc[0] if not modes.empty else None
    print(f"The mean of the {col} is {series.mean()}")
    print(f"The median of the {col} is {series.median()}")
    print(f"The mode of the {col} is {most_common}")
    print(f"The standard deviation of the {col} is {series.std()}")
    print(f"Number of missing values in the {col} is {series.isnull().sum()}")

## Analysis for HourlyRate

In [None]:
info_of_numerical('HourlyRate')

In [None]:
fig, ax=plt.subplots(1,2,figsize=(20,7))
sns.histplot(x=df["HourlyRate"], ax=ax[0],color='b')
sns.boxplot(x=df["HourlyRate"], ax=ax[1],color='green')

#### Hourly rates for the employees lie in the range of 30 to 100. The majority of employees lie in the range of 45 to 85. The data also shows no skewness with respect to its median.

## Analysis for MonthlyIncome

In [None]:
info_of_numerical("MonthlyIncome")

In [None]:
fig, ax = plt.subplots(1,2,figsize=(20,7))
sns.boxplot(x=df["MonthlyIncome"], ax=ax[0], color='green')
sns.histplot(x=df["MonthlyIncome"], ax=ax[1], color='b')
plt.show()

#### Interpretation - Most of the employees have salaries in the range 3000 to 8000. The box plot shows some skewness around the median salary, meaning that more employees have a salary lower than the average of all salaries.

### Level-2 Analysis

In [None]:
# Re-classify the columns (df has been modified by the fills above) and print
# the two groups side by side.
categorical,continuous=seperate_data_types(df)

# `tabulate` is already imported in the notebook's first cell, so it is not
# re-imported here.
print(tabulate({"Categorical":categorical,
                "continuous": continuous}, headers = ["categorical", "continuous"]))

## Studying Attrition with respect to DistanceFromHome

In [None]:
fig,ax = plt.subplots(figsize=(15,7))
sns.countplot(x = "DistanceFromHome", hue = "Attrition", data = df)
plt.show()

#### Interpretation : For the employees with their home within 10kms, the attrition count is very low as compared to the employees still working with the company.

### Studying Attrition with respect to Department

In [None]:
info_of_cat("Department")

In [None]:
df["Department"].mode()

In [None]:
# Fill missing Department values with "Research & Development" and write the
# result back to df. Previously `dept` was bound to None (in-place fillna
# returns nothing) and the chained in-place call is deprecated in pandas 2.2.
dept=df["Department"].fillna("Research & Development")
df["Department"]=dept

In [None]:
info_of_cat("Department")

In [None]:
fig,ax=plt.subplots(figsize=(15,7))
sns.countplot(x=df["Department"],hue=df["Attrition"])
plt.show()

#### Interpretation: The majority of the attrition has been happening in the Sales and R&D departments. The count of attrition in the R&D department is 2 times that in the Sales department.

### Studying Attrition with respect to Age

In [None]:
info_of_cat("Age")

In [None]:
# Fill any remaining missing ages with the most frequent age. Age is already
# median-filled earlier in the notebook, so on a top-to-bottom run this is a
# no-op. Assigned back because a chained fillna(..., inplace=True) is
# deprecated (pandas 2.2) and does not update df under Copy-on-Write.
df['Age']=df['Age'].fillna(df['Age'].mode()[0])

In [None]:
info_of_cat("Age")

In [None]:
fig,ax=plt.subplots(figsize=(25,7))
sns.countplot(x=df["Age"], hue=df["Attrition"])
plt.show()

#### Attrition of employees are more for age group 26 to 38 years. With increase in age attrition is reduced.

### Studying Attrition with respect to BusinessTravel

In [None]:
info_of_cat('BusinessTravel')

In [None]:
# Fill any remaining missing BusinessTravel values with the most frequent
# category. Assigned back because a chained fillna(..., inplace=True) is
# deprecated (pandas 2.2) and does not update df under Copy-on-Write.
df["BusinessTravel"]=df["BusinessTravel"].fillna(df["BusinessTravel"].mode()[0])

In [None]:
info_of_cat('BusinessTravel')

In [None]:
# Count employees for every (BusinessTravel, Attrition) combination.
# Naming: a/b/c = Travel_Rarely / Travel_Frequently / Non-Travel,
#         y/n   = Attrition 'Yes' / 'No'.
# crosstab replaces the row-by-row df.loc[i, ...] loop, which required a
# 0..n-1 index and whose catch-all `else` counted every unmatched row
# (e.g. a missing value) as Non-Travel / No.
travel_order=['Travel_Rarely','Travel_Frequently','Non-Travel']
travel_attrition=(
    pd.crosstab(df['BusinessTravel'],df['Attrition'])
    # Fixed row/column order; a combination that never occurs counts as 0.
    .reindex(index=travel_order,columns=['Yes','No'],fill_value=0)
)
(ay,an),(by,bn),(cy,cn)=travel_attrition.to_numpy().tolist()
print(ay)
print(an)
print(by)
print(bn)
print(cy)
print(cn)


In [None]:
Travel_Rarely=[ay,an]
Travel_Frequently=[by,bn]
Non_Travel=[cy,cn]

df2=pd.DataFrame([Travel_Rarely,Travel_Frequently,Non_Travel],columns=['Yes','No'],index=['Travel_Rarely','Travel_Frequently','Non-Travel'])
df2.plot(kind='bar',stacked=True,color=['blue','orange','green'])
plt.show()

#### The bar chart indicates that employees travelling rarely show only 15% attrition, employees travelling frequently show about 25% attrition, and employees that do not travel show only 7.5% attrition. So, we can say that travelling frequently leads to higher attrition

## Studying Attrition with respect to Education Field

In [None]:
info_of_cat('EducationField')

In [None]:
# Fill missing EducationField values with the most frequent field. Assigned
# back because a chained fillna(..., inplace=True) is deprecated (pandas 2.2)
# and does not update df under Copy-on-Write.
df['EducationField']=df['EducationField'].fillna(df['EducationField'].mode()[0])

In [None]:
info_of_cat('EducationField')

In [None]:
sns.countplot(x='EducationField',hue='Attrition',data=df)

#### Count of attrition have been very high for employees from Life Sciences and Medical Field.

## Gender wise analysis for Attrition

In [None]:
info_of_cat('Gender')

In [None]:
sns.countplot(x='Gender',hue='Attrition',data=df)

#### Female and Male employees show almost same level of attrition as compared to their count.

## Attrition analysis based on Job Role

In [None]:
info_of_cat('JobRole')

In [None]:
# Attrition counts per JobRole.
a=df['JobRole'].value_counts()
labels=a.index
g=sns.countplot(x='JobRole',hue='Attrition',data=df)
# Only rotate the tick labels seaborn already placed. Overwriting them with
# `labels` (value_counts order, i.e. most frequent first) would attach the
# wrong role name to bars that are drawn in the order roles appear in the data.
g.tick_params(axis='x',labelrotation=90)
plt.show()

#### Attrition for the Sales Representative job role is about 80%, and 15% to 30% for Sales Executive, Research Scientist and Laboratory Technician

## Attrition analysis based on Job Satisfaction

In [None]:
info_of_cat('JobSatisfaction')

In [None]:
sns.countplot(x='JobSatisfaction',hue='Attrition',data=df)

#### Employees who have rated their job satisfaction higher show very low attrition

## Attrition analysis based on Marital Status

In [None]:
info_of_cat('MaritalStatus')

In [None]:
sns.countplot(x='MaritalStatus',hue='Attrition',data=df)

#### Employees those have Marital Status as Single show higher level or percentage of attritions as compared to others.

## Attrition analysis based on NumCompaniesWorked

In [None]:
info_of_cat('NumCompaniesWorked')

In [None]:
sns.countplot(x='NumCompaniesWorked',hue='Attrition',data=df)

#### More than 600 employees have been working for 1 company and attritions have been around 120.

## Attrition analysis based on Over Time

In [None]:
info_of_cat('OverTime')

In [None]:
sns.countplot(x='OverTime',hue='Attrition',data=df)

#### Attrition ratio is very low for employees not doing Over Time as compared to others who are doing over time

## Analysis for Attrition based on hourly rate

In [None]:
info_of_numerical('HourlyRate')

In [None]:
sns.histplot(x=df["HourlyRate"], hue=df['Attrition'])

#### Hourly rate is having least effect on the attrition. 

## Analysis for Attrition based on Monthly Income

In [None]:
info_of_numerical('MonthlyIncome')

In [None]:
sns.histplot(x=df["MonthlyIncome"], hue=df['Attrition'],kde=True)

#### 1. The maximum number of attritions has been observed in employees having a monthly income of less than 5000.
#### 2.Attritions are almost nil or minimum for employees having monthly income more than 10000.
#### 3.Although for Employees having monthly income 3000 and 10000 show attrition ratio is 2:5 approx.

## LEVEL-3

### Study Attrition wrt to gender and monthly income

In [None]:
# Monthly incomes split by attrition and gender.
# Naming: y/n = Attrition 'Yes' / anything else, m/f = Gender 'Male' / anything
# else, mi = monthly income (same grouping as the loop this replaces).
# Boolean masks avoid the row-by-row df.loc[i, ...] lookups, which only work
# when df has a default 0..n-1 index.
attrited=df['Attrition']=='Yes'
is_male=df['Gender']=='Male'
income=df['MonthlyIncome']

ymmi=income[attrited & is_male].tolist()
yfmi=income[attrited & ~is_male].tolist()
nmmi=income[~attrited & is_male].tolist()
nfmi=income[~attrited & ~is_male].tolist()

In [None]:
# Monthly income of employees who stayed: males on the left, females on the right.
fig, ax=plt.subplots(1,2,figsize=(15,5))
# One title per panel; the single left-hand title previously claimed to show
# both genders although that panel only holds the male histogram.
ax[0].set_title('Monthly Income for Male Non-Attrition')
ax[1].set_title('Monthly Income for Female Non-Attrition')

sns.histplot(x=nmmi, ax=ax[0],color='b')
# Target the right-hand axes explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(x=nfmi, ax=ax[1],color='g')

plt.show()

In [None]:
# Monthly income of employees who left: males on the left, females on the right.
fig, ax=plt.subplots(1,2,figsize=(15,5))
ax[0].set_title('Monthly Income for Male Attrition')
# The female panel had no title.
ax[1].set_title('Monthly Income for Female Attrition')
sns.histplot(x=ymmi, ax=ax[0],color='b')
# Target the right-hand axes explicitly instead of relying on pyplot's
# implicit "current axes".
sns.histplot(x=yfmi, ax=ax[1],color='g')
plt.show()

#### Considering the Monthly Income charts for attrited males and females, we can say that male attrition is observed for the monthly income range 1000-15000, whereas for females the range is 1000-11000

In [None]:
# Monthly income split by Attrition: females on the left, males on the right.
mmi=df[(df.Gender=='Male')]
fmi=df[(df.Gender=='Female')]
fig,ax=plt.subplots(1,2,figsize=(15,5))
ax[0].set_title('Monthly Income for Non-attrited and Attrited Females')
ax[1].set_title('Monthly Income for Non-attrited and Attrited Males')
sns.histplot(x=fmi["MonthlyIncome"], hue=fmi['Attrition'],ax=ax[0])
# The male histogram is pinned to the panel that carries the male title
# instead of relying on pyplot's implicit "current axes".
sns.histplot(x=mmi["MonthlyIncome"], hue=mmi['Attrition'],ax=ax[1])
plt.show()

#### 1. Only male employees are observed to get attrited having monthly income more than 10000, for females it is almost nil.
#### 2. For monthly income less than 2500 female show better stability that is lesser attrition percentage.
#### 3. For monthly income around 20000 males are observed to be lesser percentage of attritions.

In [None]:
# Monthly income split by Gender: employees who stayed on the left, employees
# who left on the right.
ymi=df[(df.Attrition=='Yes')]
nmi=df[(df.Attrition=='No')]
fig,ax=plt.subplots(1,2,figsize=(15,5))
ax[0].set_title('Monthly Income for Non-attrited Males and Females')
ax[1].set_title('Monthly Income for Attrited Males and Females')
sns.histplot(x=nmi["MonthlyIncome"], hue=nmi['Gender'],ax=ax[0])
# The attrited histogram is pinned to the panel that carries its title
# instead of relying on pyplot's implicit "current axes".
sns.histplot(x=ymi["MonthlyIncome"], hue=ymi['Gender'],ax=ax[1])
# Render the figure explicitly, like the other plotting cells.
plt.show()

#### 1. Major attrition for both female and male is at monthly income upto 5000.

## Studying Attrition wrt to Business Travel and Monthly Income

In [None]:
df['BusinessTravel'].unique()

In [None]:
# One subset of df per BusinessTravel category, in panel order.
mi1,mi2,mi3=(
    df[df.BusinessTravel==category]
    for category in ('Travel_Rarely','Travel_Frequently','Non-Travel')
)
panel_titles=('Travel_Rarely Employees','Travel_Frequently Employees','Non-Travel Employees')

fig,ax=plt.subplots(1,3,figsize=(15,5))
plt.suptitle('Monthly Income for Non-attrited and Attrited Employees as pr Travel Category')
# One histogram (with KDE overlay) of MonthlyIncome per category, coloured by Attrition.
for panel,subset,panel_title in zip(ax,(mi1,mi2,mi3),panel_titles):
    panel.set_title(panel_title)
    sns.histplot(x=subset["MonthlyIncome"], hue=subset['Attrition'],kde=True,ax=panel)
plt.show()

#### 1. Employees travelling frequently and having monthly income less than 5000 show high level of attritions.
#### 2. For Non-Travel employees, attritions are very low for monthly income above 5000.

## Studying Attrition wrt to Distance from home and monthly income

In [None]:
upto5kms=df[(df.DistanceFromHome<=5)]
upto10kms=df[(df.DistanceFromHome>5) & (df.DistanceFromHome<=10)]
upto20kms=df[(df.DistanceFromHome>10) & (df.DistanceFromHome<=20)]
greaterthan20kms=df[(df.DistanceFromHome>20)]

fig,ax=plt.subplots(1,4,figsize=(20,5))
plt.suptitle('Monthly Income for Non-attrited and Attrited Employees as per Distance from Home')
ax[0].set_title('Less Than 5kms')
ax[1].set_title('6 to 10 kms')
ax[2].set_title('11 to 20kms')
ax[3].set_title('Greater Than 20kms')
sns.histplot(x=upto5kms["MonthlyIncome"], hue=upto5kms['Attrition'],ax=ax[0])
sns.histplot(x=upto10kms["MonthlyIncome"], hue=upto10kms['Attrition'],ax=ax[1])
sns.histplot(x=upto20kms["MonthlyIncome"], hue=upto20kms['Attrition'],ax=ax[2])
sns.histplot(x=greaterthan20kms["MonthlyIncome"], hue=greaterthan20kms['Attrition'],ax=ax[3])
plt.show()  

#### 1. For employees residing at the distance more than 20kms or between 11 to 20kms from company show higher attritions if monthly income is less than 10000


## Studying Attrition wrt to Department and monthly income

In [None]:
df['Department'].unique()

In [None]:
sales=df[df.Department=='Sales']
RandD=df[df.Department=='Research & Development']
hr=df[df.Department=='Human Resources']

fig,ax=plt.subplots(1,3,figsize=(15,5))
plt.suptitle('Monthly Income for Non-attrited and Attrited Employees as per Departments')
ax[0].set_title('Sales Dept')
ax[1].set_title('Research & Development Dept')
ax[2].set_title('Human Resources Dept')
sns.histplot(x=sales["MonthlyIncome"], hue=sales['Attrition'],ax=ax[0])
sns.histplot(x=RandD["MonthlyIncome"], hue=RandD['Attrition'],ax=ax[1])
sns.histplot(x=hr["MonthlyIncome"], hue=hr['Attrition'],ax=ax[2])
plt.show()  

#### 1. For Sales department attritions are reduced after monthly income of 10000 only.
#### 2. For R&D department attritions are reduced after monthly income of 5000-7000 or higher than 7000 is nil.
#### 3. For the HR department, stability can be observed in the employees with a monthly income of 5000 or higher.

## Studying Attrition wrt to Work life balance and Monthly Income

In [None]:
df['WorkLifeBalance'].unique()

In [None]:
one=df[df.WorkLifeBalance==1]
two=df[df.WorkLifeBalance==2]
three=df[df.WorkLifeBalance==3]
four=df[df.WorkLifeBalance==4]

fig,ax=plt.subplots(1,4,figsize=(20,5))
plt.suptitle('Monthly Income for Non-attrited and Attrited Employees as per Ratings for Work Life Balance')
ax[0].set_title('Rating 1')
ax[1].set_title('Rating 2')
ax[2].set_title('Rating 3')
ax[3].set_title('Rating 4')
sns.histplot(x=one["MonthlyIncome"], hue=one['Attrition'],ax=ax[0])
sns.histplot(x=two["MonthlyIncome"],hue=two['Attrition'],ax=ax[1])
sns.histplot(x=three["MonthlyIncome"], hue=three['Attrition'],ax=ax[2])
sns.histplot(x=four["MonthlyIncome"], hue=four['Attrition'],ax=ax[3])
plt.show()  

#### 1. Employees those rated '1' have higher attrition irrespective of monthly income as compared to others.
#### 2. With increase in rating attritions are observed to reduce.
#### 3. For rated '4' group of employees but monthly income less than 5000 attrition are still observed to be higher as compared to others.

##  Studying Attrition wrt to Years At Company and monthly income

In [None]:
df['YearsAtCompany'].max()

In [None]:
# Split employees into four tenure buckets (right-closed intervals):
# [.., 5], (5, 15], (15, 25], (25, 40].
lessthan5=df[(df.YearsAtCompany<=5)]
lessthan15=df[(df.YearsAtCompany<=15) & (df.YearsAtCompany>5)]
lessthan25=df[(df.YearsAtCompany<=25) & (df.YearsAtCompany>15)]
# Strictly above 25 so a tenure of exactly 25 is not counted in two buckets,
# and up to and including 40 so the bucket matches its
# "not greater than 40years" title instead of dropping tenure == 40.
lessthan40=df[(df.YearsAtCompany<=40) & (df.YearsAtCompany>25)]
fig,ax=plt.subplots(1,4,figsize=(20,5))
plt.suptitle('Monthly Income for Non-attrited and Attrited Employees as per Years at company')

ax[0].set_title('Less Than 5 Years')
ax[1].set_title('More than 5 but not greater than 15years')
ax[2].set_title('More than 15 but not greater than 25years')
ax[3].set_title('More than 25 but not greater than 40years')

sns.histplot(x=lessthan5["MonthlyIncome"], hue=lessthan5['Attrition'],ax=ax[0])
sns.histplot(x=lessthan15["MonthlyIncome"],hue=lessthan15['Attrition'],ax=ax[1])
sns.histplot(x=lessthan25["MonthlyIncome"], hue=lessthan25['Attrition'],ax=ax[2])
sns.histplot(x=lessthan40["MonthlyIncome"], hue=lessthan40['Attrition'],ax=ax[3])

plt.show()

#### 1. Employees with less than 5 years at company and monthly income less than 5000 have high attritions and no attritions above 10000.
#### 2. Employees with 5 to 15 years at the company show attritions upto monthly income of 12500.
#### 3. Employees with 15 to 25 years at the company show attrition from monthly income of 10000 to 15000.
#### 4. Employees with more than 25 years at the company have very rare attrition.

## Studying Attrition wrt to Gender and Work Life Balance

In [None]:
male=df[(df.Gender=='Male')]
female=df[(df.Gender=='Female')]
fig,ax=plt.subplots(1,2,figsize=(15,7))
plt.suptitle('Attrition for amles and female employees wrt Work Life Balance')
ax[0].set_title('Attritions in Males wrt to work life balance')
ax[1].set_title('Attritions in Females wrt to work life balance')
sns.countplot(x=male['WorkLifeBalance'],hue=male['Attrition'],ax=ax[0])
sns.countplot(x=female['WorkLifeBalance'],hue=female['Attrition'],ax=ax[1])
plt.show()

#### 1. Employees rating '3' for Work Life Balance have less attrition.
#### 2. Employees rating '1' for Work Life Balance have high attrition.

## Studying Attrition wrt to Gender and Distance from home

In [None]:
df['DistanceFromHome'].max()

In [None]:
lessthan7kms=df[(df.DistanceFromHome<=7)]
lessthan15kms=df[(df.DistanceFromHome>7) & (df.DistanceFromHome<=15)]
lessthan25kms=df[(df.DistanceFromHome>15) & (df.DistanceFromHome<=25)]
greaterthan25kms=df[(df.DistanceFromHome>25)]

fig,ax=plt.subplots(1,4,figsize=(20,5))
ax[0].set_title("Employees residing less than 7kms")
ax[1].set_title("Employees residing in 7kms to 15kms")
ax[2].set_title("Employees residing in 16kms to 25kms")
ax[3].set_title("Employees residing greater than 25kms")

sns.countplot(x=lessthan7kms['Gender'],hue=lessthan7kms['Attrition'],ax=ax[0])
sns.countplot(x=lessthan15kms['Gender'],hue=lessthan15kms['Attrition'],ax=ax[1])
sns.countplot(x=lessthan25kms['Gender'],hue=lessthan25kms['Attrition'],ax=ax[2])
sns.countplot(x=greaterthan25kms['Gender'],hue=greaterthan25kms['Attrition'],ax=ax[3])
plt.show()


#### 1. For Employees having distance from home between 7 to 25 kms have higher attrition to non attrition ratio as compared to others for males as well as females.

## Studying Attrition wrt to Gender and Marital Status

In [None]:
df['MaritalStatus'].unique()

In [None]:
male=df.loc[(df.Gender=='Male'),['Attrition','MaritalStatus']]
female=df.loc[(df.Gender=='Female'),['Attrition','MaritalStatus']]
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title("Attrition for Males")
ax[1].set_title("Attrition for Females")

sns.countplot(x=male['MaritalStatus'],hue=male['Attrition'],ax=ax[0])
sns.countplot(x=female['MaritalStatus'],hue=female['Attrition'],ax=ax[1])

plt.show()


#### 1. Singles, both male and female, have the highest attrition to non-attrition ratio.

## Studying Attrition wrt to Marital Status and Monthly Income

In [None]:
single=df[df.MaritalStatus=='Single']
married=df[df.MaritalStatus=='Married']
divorced=df[df.MaritalStatus=='Divorced']

fig,ax=plt.subplots(1,3,figsize=(15,5))
ax[0].set_title('Attritions for Single based on Monthly income')
ax[1].set_title('Attritions for Married based on Monthly income')
ax[2].set_title('Attritions for Divorced based on Monthly income')

sns.histplot(x=single['MonthlyIncome'],hue=single['Attrition'],ax=ax[0])
sns.histplot(x=married['MonthlyIncome'],hue=married['Attrition'],ax=ax[1])
sns.histplot(x=divorced['MonthlyIncome'],hue=divorced['Attrition'],ax=ax[2])

plt.show()

#### 1. Employees with marital status as 'Single' have high attrition to non-attrition ratio upto monthly income of 10000 range 0.5 to 0.2.
#### 2. Employees with marital status as 'Married' have low attrition to non-attrition ratio upto monthly income of 10000 range 0.3 to 0.1.
#### 3. Employees with marital status as 'Divorced' have very low attrition after 5000.

In [None]:
# Monthly income box plots per marital status, one box per Attrition value.
# Uses the `single`, `married` and `divorced` subsets built in the previous cell.
fig,ax=plt.subplots(1,3,figsize=(15,5))
ax[0].set_title('Attritions for Single based on Monthly income')
ax[1].set_title('Attritions for Married based on Monthly income')
# Third panel shows the divorced subset (its title previously repeated "Married").
ax[2].set_title('Attritions for Divorced based on Monthly income')

# Attrition is passed as the categorical y axis rather than as `hue`:
# seaborn releases before 0.13 reject `hue` when only one of x / y is given
# (NOTE(review): confirm against the seaborn version in use).
sns.boxplot(x=single['MonthlyIncome'],y=single['Attrition'],ax=ax[0])
sns.boxplot(x=married['MonthlyIncome'],y=married['Attrition'],ax=ax[1])
sns.boxplot(x=divorced['MonthlyIncome'],y=divorced['Attrition'],ax=ax[2])

plt.show()

## Studying Attrition wrt to Gender and Business Travel

In [None]:
male=df[(df.Gender=='Male')]
female=df[(df.Gender=='Female')]
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title("Attrition for Males")
ax[1].set_title("Attrition for Females")
sns.countplot(x=male['BusinessTravel'],hue=male['Attrition'],ax=ax[0])
sns.countplot(x=female['BusinessTravel'],hue=female['Attrition'],ax=ax[1])

plt.show()


#### 1. For Travel frequently category for males attrition to non attrition ratio is 0.3 whereas for females is 0.4.
#### 2. For Travel Rarely category for males attrition to non attrition ratio is 0.2 whereas for females is 0.14.

## Studying Attrition wrt to Marital Status and Business Travel

In [None]:
fig,ax=plt.subplots(1,3,figsize=(15,5))
ax[0].set_title("Attrition for Single")
ax[1].set_title("Attrition for Married")
ax[2].set_title("Attrition for Divorced")

sns.countplot(x=single['BusinessTravel'],hue=single['Attrition'],ax=ax[0])
sns.countplot(x=married['BusinessTravel'],hue=married['Attrition'],ax=ax[1])
sns.countplot(x=divorced['BusinessTravel'],hue=divorced['Attrition'],ax=ax[2])

plt.show()

#### 1. For employees with marital status as 'Single' have attrition to non-attrition ratio for Travel rarely as 0.3, for Travel Frequently as 0.6 and Non Travel as 0.25.
#### 2. For employees with marital status as 'Married' have attrition to non-attrition ratio for Travel rarely as 0.13, for Travel Frequently as 0.25 and Non Travel as 0.13.
#### 3. For employees with marital status as 'Divorced' have attrition to non-attrition ratio for Travel rarely as 0.1 and for Travel Frequently as 0.2.

## Studying Attrition wrt to Gender and Years Since Last Promotion

In [None]:
male=df[(df.Gender=='Male')]
female=df[(df.Gender=='Female')]
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title("Attrition for Males")
ax[1].set_title("Attrition for Females")
sns.histplot(x=male['YearsSinceLastPromotion'],hue=male['Attrition'],ax=ax[0])
sns.histplot(x=female['YearsSinceLastPromotion'],hue=female['Attrition'],ax=ax[1])

plt.show()


#### 1. Attrition pattern for both male and female employees seem to be similar and attrition to non attrition ratio is 0.1 to 0.3.

## Studying Attrition wrt to Gender and Years at the company

In [None]:
# YearsAtCompany histograms split by Attrition: males left, females right.
male=df[(df.Gender=='Male')]
female=df[(df.Gender=='Female')]
fig,ax=plt.subplots(1,2,figsize=(15,7))
ax[0].set_title("Attrition for Males")
# Was `set_tgitle`, which raises AttributeError and aborts the cell.
ax[1].set_title("Attrition for Females")
sns.histplot(x=male['YearsAtCompany'],hue=male['Attrition'],ax=ax[0])
sns.histplot(x=female['YearsAtCompany'],hue=female['Attrition'],ax=ax[1])

plt.show()

#### 1. Attritions are observed for males and females who have been at the company for 10 years or less.

## Studying Attrition wrt to Marital Status with Work life balance

In [None]:
fig,ax=plt.subplots(1,3,figsize=(15,5))
ax[0].set_title("Attrition for Single")
ax[1].set_title("Attrition for Married")
ax[2].set_title("Attrition for Divorced")

sns.countplot(x=single['WorkLifeBalance'],hue=single['Attrition'],ax=ax[0])
sns.countplot(x=married['WorkLifeBalance'],hue=married['Attrition'],ax=ax[1])
sns.countplot(x=divorced['WorkLifeBalance'],hue=divorced['Attrition'],ax=ax[2])

plt.show()

#### 1. As the work-life balance rating increases, the attrition to non-attrition ratio falls for all categories of marital status.

## Studying Attrition wrt to Total Working Years and monthly income

In [None]:
df['TotalWorkingYears'].max()

In [None]:
# Split employees into four experience buckets (right-closed intervals):
# [.., 5], (5, 15], (15, 25], (25, ..).
lessthan5=df[(df.TotalWorkingYears<=5)]
lessthan15=df[(df.TotalWorkingYears<=15) & (df.TotalWorkingYears>5)]
lessthan25=df[(df.TotalWorkingYears<=25) & (df.TotalWorkingYears>15)]
# Strictly above 25 so exactly 25 years is not counted in two buckets and the
# bucket matches its 'More than 25 years' title.
greaterthan25=df[(df.TotalWorkingYears>25)]

fig,ax=plt.subplots(1,4,figsize=(20,5))
plt.suptitle('Monthly Income for Non-attrited and Attrited Employees as per Total Working Years')

ax[0].set_title('Less Than 5 Years')
ax[1].set_title('More than 5 but not greater than 15years')
ax[2].set_title('More than 15 but not greater than 25years')
ax[3].set_title('More than 25 years')

sns.histplot(x=lessthan5["MonthlyIncome"], hue=lessthan5['Attrition'],ax=ax[0])
sns.histplot(x=lessthan15["MonthlyIncome"],hue=lessthan15['Attrition'],ax=ax[1])
sns.histplot(x=lessthan25["MonthlyIncome"], hue=lessthan25['Attrition'],ax=ax[2])
# Plot the TotalWorkingYears bucket built above; this panel previously
# re-used `lessthan40`, a leftover YearsAtCompany subset from an earlier cell.
sns.histplot(x=greaterthan25["MonthlyIncome"], hue=greaterthan25['Attrition'],ax=ax[3])

plt.show()

#### 1. For employees with total working years less than 5years have monthly income equal to or less than 5000 and show very high attrition ratio about 0.25 to 1.0.
#### 2. For employees with total working years 5-15years have attrition to non-attrition ratio 0.1 to 0.2.
#### 3. For employees with total working years 15-25years have random attrition but very less.
#### 4. Employees with total working years of more than 25 years show no attrition, except at monthly incomes of around 10000 and 18000-20000.

## Studying Attrition wrt to Total Working Years and Business Travel

In [None]:
tr=df[(df.BusinessTravel=='Travel_Rarely')]
tf=df[(df.BusinessTravel=='Travel_Frequently')]
nt=df[(df.BusinessTravel=='Non-Travel')]

fig,ax=plt.subplots(1,3,figsize=(15,5))
ax[0].set_title("Attrition for Travel Rarely")
ax[1].set_title("Attrition for Travel Frequently")
ax[2].set_title("Attrition for Non Travel")

sns.histplot(x=tr['TotalWorkingYears'],hue=tr['Attrition'],ax=ax[0])
sns.histplot(x=tf['TotalWorkingYears'],hue=tf['Attrition'],ax=ax[1])
sns.histplot(x=nt['TotalWorkingYears'],hue=nt['Attrition'],ax=ax[2])

plt.show()

#### 1. For Travel Rarely category of employees with Total working years of 4-20years attrition to non-attrition ratio is between 0.2 to 0.3.
#### 2. For Travel Frequently category of employees with Total working years of 0-8years attrition to non-attrition ratio is between 0.3 to 0.4.
#### 3. For Travel Rarely and Travel Frequently category of employees with Total working years less than 4years attrition to non-attrition ratio is between 1 to 1.5.
#### 4. For Non-Travel category of employees with Total working years upto 10years attrition to non-attrition ratio is between 0.1 to 0.15 and after that attrition are almost nil.

#  Thank You.