In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [None]:
# Read the HR analytics CSV from the Kaggle input directory and display the frame.
csv_path = "/kaggle/input/hr-analytics/HR_comma_sep.csv"
df = pd.read_csv(csv_path)
df

# Data exploration and visualization

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Rows for employees who left the company (left == 1), followed by (rows, columns).
left = df.loc[df['left'] == 1]
left.shape

In [None]:
# Rows for employees who stayed with the company (left == 0), followed by (rows, columns).
retained = df.loc[df['left'] == 0]
retained.shape

Comparing the per-group averages of employees who left with those who were retained (e.g. via `df.groupby('left').mean(numeric_only=True)`), we can draw the following conclusions:
<ol>
    <li><b>Satisfaction Level</b>: Satisfaction level is noticeably lower among employees leaving the firm (0.44) than among the retained ones (0.66)</li>
    <li><b>Average Monthly Hours</b>: Average monthly hours are higher among employees leaving the firm (207 vs 199)</li>
    <li><b>Promotion Last 5 Years</b>: Employees who are given a promotion are more likely to be retained at the firm</li>
</ol>

**Impact of salary on employee retention**

In [None]:
# Count employees for every (salary, left) combination and draw the counts as grouped bars.
salary_left_counts = pd.crosstab(index=df['salary'], columns=df['left'])
salary_left_counts.plot.bar()

In [None]:
# Fraction of employees who left within each salary band.
# Plotting the raw rows (plt.bar(df.salary, df.left)) draws one bar per employee
# with height 0 or 1, all stacked on the same x positions, so every band ends up
# looking like a bar of height 1. Aggregating first makes the bands comparable.
left_rate_by_salary = df.groupby('salary')['left'].mean()
plt.bar(left_rate_by_salary.index, left_rate_by_salary.values)
plt.xlabel('salary')
plt.ylabel('fraction of employees who left');

The bar charts above show that employees with high salaries are less likely to leave the company.

**Department wise employee retention rate**

In [None]:
# Count employees for every (Department, left) combination and draw the counts as grouped bars.
department_left_counts = pd.crosstab(index=df['Department'], columns=df['left'])
department_left_counts.plot.bar()

From the chart above, department seems to have some impact on employee retention, but it is not major; hence we will ignore department in our analysis.

<h3 style="color:purple">From the data analysis so far we can conclude that we will use the following variables as independent variables in our model</h3>
<ol>
    <li><b>Satisfaction Level</b></li>
    <li><b>Average Monthly Hours</b></li>
    <li><b>Promotion Last 5 Years</b></li>
    <li><b>Salary</b></li>
</ol>

In [None]:
# Keep only the columns selected as model inputs and preview the first rows.
feature_columns = [
    'satisfaction_level',
    'average_montly_hours',
    'promotion_last_5years',
    'salary',
]
subdf = df[feature_columns]
subdf.head()

# Tackle salary dummy variable

The salary column contains only text data. It needs to be converted to numbers, and we will use dummy variables for that.

In [None]:
# One indicator column per distinct salary value, each named with a "salary_" prefix.
salary_dummies = pd.get_dummies(subdf['salary'], prefix="salary")

In [None]:
# Append the salary indicator columns to the selected features, side by side.
frames_to_join = [subdf, salary_dummies]
df_with_dummies = pd.concat(frames_to_join, axis=1)

In [None]:
# Display the combined frame: the selected feature columns plus the salary dummy columns.
df_with_dummies

Now we need to remove salary column which is text data. It is already replaced by dummy variables so we can safely remove it

In [None]:
# Drop the original text column now that its dummy columns are present.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once
# 'salary' has already been removed.
df_with_dummies = df_with_dummies.drop(columns='salary', errors='ignore')
df_with_dummies.head()

# logistic regression

In [None]:
# Feature matrix for the model. X is another name for df_with_dummies (no copy is made).
X = df_with_dummies
X.head()

In [None]:
# Target vector: the `left` column of the full frame.
y = df['left']

In [None]:
# Split the data into a training and a test partition.
# random_state pins the shuffle so the split (and the accuracy reported further
# down) is reproducible on Restart & Run All; stratify=y keeps the proportion
# of leavers the same in both partitions.
# NOTE(review): train_size=.3 trains on only 30% of the rows and evaluates on
# the remaining 70% — confirm this is intended rather than test_size=.3.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=.3, random_state=42, stratify=y
)

In [None]:
# Logistic-regression classifier created with scikit-learn's default settings.
model=LogisticRegression()

In [None]:
# Fit the classifier on the training partition.
model.fit(x_train,y_train)

In [None]:
# Predicted class labels for the test partition (displayed only, not stored).
model.predict(x_test)

In [None]:
# Mean accuracy of the fitted model on the test partition.
model.score(x_test,y_test)