# final model code for use in AWS Application #

In [173]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# sklearn libraries
from sklearn.model_selection import train_test_split


# import scaler and tree
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier




In [174]:
# read the contents of a zip file
import zipfile
with zipfile.ZipFile("hrdata.zip","r") as zip_ref:
    zip_ref.extractall("datafiles")

In [175]:
# read the contents of a csv file
df = pd.read_csv(r"datafiles\HR.csv")
# show rows where left = 0


In [176]:
# drop the rows with null values
df_drop = df.dropna()
# df_drop.info()
df = df_drop
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 15001
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     15000 non-null  float64
 1   last_evaluation        15000 non-null  float64
 2   number_project         15000 non-null  int64  
 3   average_monthly_hours  15000 non-null  int64  
 4   time_spend_company     15000 non-null  int64  
 5   Work_accident          15000 non-null  int64  
 6   left                   15000 non-null  int64  
 7   promotion_last_5years  15000 non-null  int64  
 8   department             15000 non-null  object 
 9   salary                 15000 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.3+ MB


In [177]:
# drop the record where salary = nme
df_drop = df.drop(df[df.salary == 'nme'].index)
# df_drop.info()
df = df_drop
df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_monthly_hours  14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.3+ MB


#### As seen in the feature selection, department and promotion are not meaningful in predicting the if an employee may leave and will be removed from the modeling data. ####

In [178]:
df = df.drop(['department', 'promotion_last_5years'], axis=1)
# print 25 rows
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,salary
0,0.38,0.53,2,157,3,0,1,low
1,0.8,0.86,5,262,6,0,1,medium
2,0.11,0.88,7,272,4,0,1,medium
3,0.72,0.87,5,223,5,0,1,low
4,0.37,0.52,2,159,3,0,1,low


In [179]:
# use label encoder to coverert the salary column
le = LabelEncoder()
df['salary'] = le.fit_transform(df['salary'])
df.head()






Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,salary
0,0.38,0.53,2,157,3,0,1,1
1,0.8,0.86,5,262,6,0,1,2
2,0.11,0.88,7,272,4,0,1,2
3,0.72,0.87,5,223,5,0,1,1
4,0.37,0.52,2,159,3,0,1,1


In [180]:
# count the number of unique values in the salary column
df['salary'].value_counts()

1    7316
2    6446
0    1237
Name: salary, dtype: int64

#### Salary 1 = low, 2 = medium, 0 = high ####

In [181]:
# remove random 1000 rows from df_dummies and copy to a new dataframe called df_dummies_later
df_later = df.sample(n=1000)


# df_dummies_later = df_dummies.drop(df_dummies.index[1000:])
df_later.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 9274 to 14743
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     1000 non-null   float64
 1   last_evaluation        1000 non-null   float64
 2   number_project         1000 non-null   int64  
 3   average_monthly_hours  1000 non-null   int64  
 4   time_spend_company     1000 non-null   int64  
 5   Work_accident          1000 non-null   int64  
 6   left                   1000 non-null   int64  
 7   salary                 1000 non-null   int32  
dtypes: float64(2), int32(1), int64(5)
memory usage: 66.4 KB


In [182]:
# create a loop to delete each row in the df_dummies_later dataframe from the df_dummies dataframe
for index, row in df_later.iterrows():
    df = df.drop(index)


# for i in range(1000):
#     df_dummies = df_dummies.drop(df_dummies.index[0])
    

In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13999 entries, 1 to 14998
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     13999 non-null  float64
 1   last_evaluation        13999 non-null  float64
 2   number_project         13999 non-null  int64  
 3   average_monthly_hours  13999 non-null  int64  
 4   time_spend_company     13999 non-null  int64  
 5   Work_accident          13999 non-null  int64  
 6   left                   13999 non-null  int64  
 7   salary                 13999 non-null  int32  
dtypes: float64(2), int32(1), int64(5)
memory usage: 929.6 KB


In [184]:
# create and split the training and test sets
X = df.drop(['left'], axis=1)
y = df['left']
print("Shape: ", X.shape, y.shape)

Shape:  (13999, 7) (13999,)


### create the testing and training data ###

In [185]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Scaling the data ####

In [186]:
# scaling the X Data by using the standardScaler()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [187]:
rfc = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train_scaled, y_train)
# print the scores
print(f"Scaled Training Data Score: {rfc.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {rfc.score(X_test_scaled, y_test)}")

Scaled Training Data Score: 0.9998214126261273
Scaled Testing Data Score: 0.9910714285714286


In [188]:
# create test sets
X = df_later.drop(['left'], axis=1)
y = df_later['left']
print("Shape: ", X.shape, y.shape)

Shape:  (1000, 7) (1000,)


# the model code below will run for all future indidivdual or groups of employees #


In [189]:
# use the df_dummies_later data in the randomforestclassifier model
rfc = RandomForestClassifier(random_state=1, n_estimators=50).fit(X, y)
print(f"Testing Data Score: {rfc.score(X, y)}")

Testing Data Score: 1.0


In [190]:
# define test variables for the function to test
satisfaction_level = .87
last_evaluation = .72
number_project = 5
average_monthly_hours = 300
time_spend_company = 5
Work_accident = 1
left = 0
Salary = 0



In [191]:
# create a dataframe with the test variables
pred_df = pd.DataFrame({'satisfaction_level': [satisfaction_level],
                        'last_evaluation': [last_evaluation],
                        'number_project': [number_project],
                        'average_monthly_hours': [average_monthly_hours],
                        'time_spend_company': [time_spend_company],
                        'Work_accident': [Work_accident],
                        'left': [left],
                        'salary': [Salary]})

pred_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,salary
0,0.87,0.72,5,300,5,1,0,0


In [192]:
# scale the test data after dropping the left column
pred_df = pred_df.drop(['left'], axis=1)
pred_df_scaled = scaler.transform(pred_df)


In [193]:
# create the testing data for the model
X = pred_df_scaled

In [194]:
# # predict the probability of leaving the company
probability = rfc.predict_proba(X)
print(probability)

# what is the datatype for probability
type(probability)

[[0.88 0.12]]




numpy.ndarray

In [141]:
# save mode as a pickle file
with open('model.pkl', 'wb') as file:
    pickle.dump(rfc, file)
    
    