In [1]:
import pandas as pd
from scipy.stats import chi2_contingency, pearsonr

In [2]:
# Read the employee dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='general_data.csv')
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
# Discard every row that has at least one missing value (modifies `data` in place).
data.dropna(axis=0, how='any', inplace=True)

In [4]:
# Per-column count of remaining missing values.
data.isnull().sum(axis=0)

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [5]:
# Encode Attrition as a binary flag: 'Yes' -> 1, 'No' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: Series.map
# turns any value missing from the mapping into NaN, so without them a second
# execution would replace the already-encoded column with all-NaN values.
data['Attrition'] = data['Attrition'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [6]:
# Class balance of the encoded Attrition flag.
data['Attrition'].value_counts()

0    3677
1     705
Name: Attrition, dtype: int64

In [7]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and Age.
pearsonr(data['Attrition'], data['Age'])

(-0.1583986795409615, 5.126598219399243e-26)

Age is negatively correlated with the Attrition rate

In [8]:
# Chi-square test on the Attrition x BusinessTravel contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['BusinessTravel']))

(70.07594084831366,
 6.07019702736392e-16,
 2,
 array([[ 375.92332268,  692.26951164, 2608.80716568],
        [  72.07667732,  132.73048836,  500.19283432]]))

Attrition rate is dependent on Business Travel

In [9]:
# Chi-square test on the Attrition x Department contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['Department']))

(25.89432541916022,
 2.382970570769315e-06,
 2,
 array([[ 156.91442264, 2404.06321314, 1116.02236422],
        [  30.08557736,  460.93678686,  213.97763578]]))

Attrition rate is dependent on Department

In [10]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and DistanceFromHome.
pearsonr(data['Attrition'], data['DistanceFromHome'])

(-0.009448638515155983, 0.5317715668019634)

DistanceFromHome is not significantly correlated with the Attrition rate (r ≈ -0.01, p ≈ 0.53)

In [11]:
# Chi-square test on the Attrition x Education contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['Education']))

(6.2735361781270615,
 0.17963050984273224,
 4,
 array([[ 426.27019626,  706.53445915, 1427.33386581,  996.86809676,
          119.99338202],
        [  81.72980374,  135.46554085,  273.66613419,  191.13190324,
           23.00661798]]))

No significant association was found between Attrition rate and Education (p ≈ 0.18)

In [12]:
# Chi-square test on the Attrition x EducationField contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['EducationField']))

(43.132860148900384,
 3.472896220587872e-08,
 5,
 array([[  67.12916476, 1515.44089457,  398.57941579, 1162.17366499,
          204.74395253,  328.93290735],
        [  12.87083524,  290.55910543,   76.42058421,  222.82633501,
           39.25604747,   63.06709265]]))

Attrition rate is dependent on Education Field

In [13]:
# Chi-square test on the Attrition x Gender contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['Gender']))

(1.3825823839528295,
 0.23966176275638887,
 1,
 array([[1473.48516659, 2203.51483341],
        [ 282.51483341,  422.48516659]]))

No significant association was found between Attrition rate and Gender (p ≈ 0.24)

In [14]:
# Chi-square test on the Attrition x JobLevel contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['JobLevel']))

(7.036480574687178,
 0.13397290697032432,
 4,
 array([[1358.52647193, 1334.1921497 ,  546.26357827,  266.83842994,
          171.17937015],
        [ 260.47352807,  255.8078503 ,  104.73642173,   51.16157006,
           32.82062985]]))

No significant association was found between Attrition rate and JobLevel (p ≈ 0.13)

In [15]:
# Chi-square test on the Attrition x JobRole contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['JobRole']))

(21.893724958847,
 0.005116592717526599,
 8,
 array([[326.41556367, 130.90187129, 648.63555454, 255.92994067,
         359.98014605, 198.87015062, 731.70789594, 818.13669557,
         206.42218165],
        [ 62.58443633,  25.09812871, 124.36444546,  49.07005933,
          69.01985395,  38.12984938, 140.29210406, 156.86330443,
          39.57781835]]))

Attrition rate is dependent on Job Role

In [16]:
# Chi-square test on the Attrition x MaritalStatus contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['MaritalStatus']))

(133.85785802925156,
 8.573051828219379e-30,
 2,
 array([[ 813.94112277, 1684.10292104, 1178.95595618],
        [ 156.05887723,  322.89707896,  226.04404382]]))

Attrition rate is dependent on Marital Status

In [17]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and MonthlyIncome.
pearsonr(data['Attrition'], data['MonthlyIncome'])

(-0.030160293808459582, 0.04589086274474114)

Monthly income is weakly negatively correlated with the attrition rate (r ≈ -0.03, p ≈ 0.046)

In [18]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and NumCompaniesWorked.
pearsonr(data['Attrition'], data['NumCompaniesWorked'])

(0.04283056724471892, 0.004572057121624155)

Number of companies worked is weakly positively correlated with the attrition rate (r ≈ 0.04, p ≈ 0.005)

In [19]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and PercentSalaryHike.
pearsonr(data['Attrition'], data['PercentSalaryHike'])

(0.033153037135465226, 0.028192446935107012)

Percent salary hike is weakly positively correlated with the attrition rate (r ≈ 0.03, p ≈ 0.028)

In [20]:
# Chi-square test on the Attrition x StockOptionLevel contingency table.
chi2_contingency(pd.crosstab(index=data['Attrition'], columns=data['StockOptionLevel']))

(3.444801419724394,
 0.32799207189761653,
 3,
 array([[1577.53537198, 1490.26745778,  396.06207211,  213.13509813],
        [ 302.46462802,  285.73254222,   75.93792789,   40.86490187]]))

No significant association was found between Attrition rate and StockOptionLevel (p ≈ 0.33)

In [21]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and TotalWorkingYears.
pearsonr(data['Attrition'], data['TotalWorkingYears'])

(-0.16966991684723265, 1.1645434967153252e-29)

TotalWorkingYears is negatively correlated with Attrition rate

In [22]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and TrainingTimesLastYear.
pearsonr(data['Attrition'], data['TrainingTimesLastYear'])

(-0.04758573693081555, 0.0016276603635485554)

Training Time Last year is negatively correlated with the Attrition rate

In [23]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and YearsAtCompany.
pearsonr(data['Attrition'], data['YearsAtCompany'])

(-0.1330026184252109, 9.476118084864852e-19)

Years at company is negatively correlated with the Attrition rate

In [24]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and YearsSinceLastPromotion.
pearsonr(data['Attrition'], data['YearsSinceLastPromotion'])

(-0.0314231505633079, 0.03752293607393637)

Years since last promotion is weakly negatively correlated with the attrition rate (r ≈ -0.03, p ≈ 0.038)

In [25]:
# Pearson correlation (coefficient, p-value) between the Attrition flag and YearsWithCurrManager.
pearsonr(data['Attrition'], data['YearsWithCurrManager'])

(-0.15469153690286805, 7.105369646794048e-25)

Years with current manager is negatively correlated with Attrition rate