# final model code for use in AWS Application #

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn libraries
from sklearn.model_selection import train_test_split


# import scaler and tree
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier




In [2]:
# read the contents of a zip file
import zipfile
with zipfile.ZipFile("hrdata.zip","r") as zip_ref:
    zip_ref.extractall("datafiles")

In [3]:
# read the contents of a csv file
df = pd.read_csv(r"datafiles\HR.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15002 entries, 0 to 15001
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     15000 non-null  float64
 1   last_evaluation        15002 non-null  float64
 2   number_project         15002 non-null  int64  
 3   average_monthly_hours  15002 non-null  int64  
 4   time_spend_company     15002 non-null  int64  
 5   Work_accident          15002 non-null  int64  
 6   left                   15002 non-null  int64  
 7   promotion_last_5years  15002 non-null  int64  
 8   department             15002 non-null  object 
 9   salary                 15002 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [4]:
# review the data types of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15002 entries, 0 to 15001
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     15000 non-null  float64
 1   last_evaluation        15002 non-null  float64
 2   number_project         15002 non-null  int64  
 3   average_monthly_hours  15002 non-null  int64  
 4   time_spend_company     15002 non-null  int64  
 5   Work_accident          15002 non-null  int64  
 6   left                   15002 non-null  int64  
 7   promotion_last_5years  15002 non-null  int64  
 8   department             15002 non-null  object 
 9   salary                 15002 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
# drop the rows with null values
df_drop = df.dropna()
# df_drop.info()
df = df_drop
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 15001
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     15000 non-null  float64
 1   last_evaluation        15000 non-null  float64
 2   number_project         15000 non-null  int64  
 3   average_monthly_hours  15000 non-null  int64  
 4   time_spend_company     15000 non-null  int64  
 5   Work_accident          15000 non-null  int64  
 6   left                   15000 non-null  int64  
 7   promotion_last_5years  15000 non-null  int64  
 8   department             15000 non-null  object 
 9   salary                 15000 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.3+ MB


#### As seen in the feature selection, department and promotion are not meaningful in predicting the if an employee may leave and will be removed from the modeling data. ####

In [6]:
df = df.drop(['department', 'promotion_last_5years'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 15001
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     15000 non-null  float64
 1   last_evaluation        15000 non-null  float64
 2   number_project         15000 non-null  int64  
 3   average_monthly_hours  15000 non-null  int64  
 4   time_spend_company     15000 non-null  int64  
 5   Work_accident          15000 non-null  int64  
 6   left                   15000 non-null  int64  
 7   salary                 15000 non-null  object 
dtypes: float64(2), int64(5), object(1)
memory usage: 1.0+ MB


In [7]:
df_dummies = pd.get_dummies(df, columns=["salary"])
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 15001
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     15000 non-null  float64
 1   last_evaluation        15000 non-null  float64
 2   number_project         15000 non-null  int64  
 3   average_monthly_hours  15000 non-null  int64  
 4   time_spend_company     15000 non-null  int64  
 5   Work_accident          15000 non-null  int64  
 6   left                   15000 non-null  int64  
 7   salary_high            15000 non-null  uint8  
 8   salary_low             15000 non-null  uint8  
 9   salary_medium          15000 non-null  uint8  
 10  salary_nme             15000 non-null  uint8  
dtypes: float64(2), int64(5), uint8(4)
memory usage: 996.1 KB


In [8]:
# remove random 1000 rows from df_dummies and copy to a new dataframe called df_dummies_later
df_dummies_later = df_dummies.sample(n=1000)


# df_dummies_later = df_dummies.drop(df_dummies.index[1000:])
df_dummies_later.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 12297 to 14587
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     1000 non-null   float64
 1   last_evaluation        1000 non-null   float64
 2   number_project         1000 non-null   int64  
 3   average_monthly_hours  1000 non-null   int64  
 4   time_spend_company     1000 non-null   int64  
 5   Work_accident          1000 non-null   int64  
 6   left                   1000 non-null   int64  
 7   salary_high            1000 non-null   uint8  
 8   salary_low             1000 non-null   uint8  
 9   salary_medium          1000 non-null   uint8  
 10  salary_nme             1000 non-null   uint8  
dtypes: float64(2), int64(5), uint8(4)
memory usage: 66.4 KB


In [9]:
# create a loop to delete each row in the df_dummies_later dataframe from the df_dummies dataframe
for index, row in df_dummies_later.iterrows():
    df_dummies = df_dummies.drop(index)


# for i in range(1000):
#     df_dummies = df_dummies.drop(df_dummies.index[0])
    

In [10]:
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14000 entries, 0 to 15001
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14000 non-null  float64
 1   last_evaluation        14000 non-null  float64
 2   number_project         14000 non-null  int64  
 3   average_monthly_hours  14000 non-null  int64  
 4   time_spend_company     14000 non-null  int64  
 5   Work_accident          14000 non-null  int64  
 6   left                   14000 non-null  int64  
 7   salary_high            14000 non-null  uint8  
 8   salary_low             14000 non-null  uint8  
 9   salary_medium          14000 non-null  uint8  
 10  salary_nme             14000 non-null  uint8  
dtypes: float64(2), int64(5), uint8(4)
memory usage: 929.7 KB


In [11]:
# create and split the training and test sets
X = df_dummies.drop(['left'], axis=1)
y = df_dummies['left']
print("Shape: ", X.shape, y.shape)

Shape:  (14000, 10) (14000,)


### create the testing and training data ###

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear and Logicstic Regression on on Non Scaled Data ###

#### Scaling the data ####

In [13]:
# scaling the X Data by using the standardScaler()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
rfc = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train_scaled, y_train)
# print the scores
print(f"Scaled Training Data Score: {rfc.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {rfc.score(X_test_scaled, y_test)}")

Scaled Training Data Score: 0.9998214285714285
Scaled Testing Data Score: 0.9903571428571428


In [15]:
# create test sets
X = df_dummies_later.drop(['left'], axis=1)
y = df_dummies_later['left']
print("Shape: ", X.shape, y.shape)

Shape:  (1000, 10) (1000,)


# the model code below will run for all future indidivdual or groups of employees #


In [16]:
# use the df_dummies_later data in the randomforestclassifier model
rfc = RandomForestClassifier(random_state=1, n_estimators=50).fit(X, y)
print(f"Testing Data Score: {rfc.score(X, y)}")

Testing Data Score: 0.999
