In [2]:
# -----------------------
# Step 0: Import packages
# ----------------------- 

# For data manipulation
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)

# For data modeling
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance
# Use pip install xgboost if calls to xgboost return module not found error

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree

# For saving models
import pickle

# for displaying and modifying the working directory
import os as os


In [3]:
# -----------------------------------------------
# Step 1: Load dataset into dataframe and verify
# -----------------------------------------------

# Build the full path to the dataset instead of calling os.chdir(), which would change the
# working directory for every later cell as a side effect.
# Set the HR_DATA_DIR environment variable to the folder that holds the CSV to run this
# notebook on another machine; the original absolute location is kept as the fallback.
data_dir = os.environ.get('HR_DATA_DIR', r'D:\OneDrive\Documents\Advanced Capstone\CSV')
df0 = pd.read_csv(os.path.join(data_dir, "HR_comma_sep.csv"))

# Display the first 10 rows of the dataframe
df0.head(10)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\OneDrive\\Documents\\Advanced Capstone\\CSV'

In [None]:
# ------------------------------------------------------------
# Step 2: Initial Exploratory Data Analysis (EDA) and Cleaning
# ------------------------------------------------------------

# Display basic information about the data 
df0.info()

In [None]:
# Display descriptive statistics about the data 
df0.describe()

In [None]:
# Display column names 
df0.columns

In [None]:
# Rename columns that are misspelled, not easy to work with or that do not follow standard naming conventions
df0 = df0.rename(columns={'Work_accident': 'work_accident',
                          'average_montly_hours': 'average_monthly_hours',
                          'time_spend_company': 'tenure',
                          'Department': 'department'})

# Display updated column names
df0.columns

In [None]:
# Check for missing values
df0.isna().sum()

In [None]:
# Check for duplicates
df0.duplicated().sum()

In [None]:
# Inspect a few of the duplicated rows
df0[df0.duplicated()].head()

In [None]:
# Remove duplicates and save results to dataframe to df1
df1 = df0.drop_duplicates(keep='first')

# Display first 10 rows of dataframe df1
df1.head(10)

In [None]:
# Create a boxplot of the distribution of `tenure` to check for outliers
plt.figure(figsize=(6,6))
plt.title('Boxplot to Detect Outliers for Tenure', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sns.boxplot(x=df1['tenure'])
plt.show()

In [None]:
# Count the rows whose `tenure` falls outside the 1.5 * IQR fences

# First (25th percentile) and third (75th percentile) quartiles of `tenure`
percentile25, percentile75 = (df1['tenure'].quantile(q) for q in (0.25, 0.75))

# Interquartile range: distance between the two quartiles
iqr = percentile75 - percentile25

# Fences: values further than 1.5 * IQR from the quartiles are treated as outliers
lower_limit = percentile25 - 1.5 * iqr
upper_limit = percentile75 + 1.5 * iqr
print("Lower limit:", lower_limit)
print("Upper limit:", upper_limit)

# Keep only the rows whose `tenure` lies below the lower fence or above the upper fence
outliers = df1[df1['tenure'].lt(lower_limit) | df1['tenure'].gt(upper_limit)]

# Report how many rows were flagged
print("Number of rows with outliers for `tenure`:", outliers.shape[0])

In [None]:
# --------------------------------
# Step 3: Further Data Exploration
# --------------------------------

# Display number of people who stayed (0) and left (1)
print(df1['left'].value_counts())
print()

# Display percentage of people who stayed (0) and left (1)
print(df1['left'].value_counts(normalize=True))

In [None]:
# Display plots to help illustrate

# Set figure and axes
fig, ax = plt.subplots(1, 2, figsize = (22,8))

# Create boxplot showing `average_monthly_hours` distributions for `number_project`, comparing employees who stayed versus those who left
sns.boxplot(data=df1, x='average_monthly_hours', y='number_project', hue='left', orient="h", ax=ax[0])
ax[0].invert_yaxis()
ax[0].set_title('Monthly hours by Number of Projects', fontsize='14')

# Create histogram showing distribution of `number_project`, comparing employees who stayed versus those who left
tenure_stay = df1[df1['left']==0]['number_project']
tenure_left = df1[df1['left']==1]['number_project']
sns.histplot(data=df1, x='number_project', hue='left', multiple='dodge', shrink=2, ax=ax[1])
ax[1].set_title('Number of Projects Histogram', fontsize='14')

# Display the plots
plt.show()

In [None]:
# Display number of people who worked 7 projects stayed (0) and left (1)
df1[df1['number_project']==7]['left'].value_counts()

In [None]:
# Display a scatterplot of `average_monthly_hours` versus `satisfaction_level`, comparing employees who stayed vs employees who left
plt.figure(figsize=(16, 9))
scatter_ax = sns.scatterplot(data=df1, x='average_monthly_hours', y='satisfaction_level', hue='left', alpha=0.4)
scatter_ax.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--')

# Rename the legend entries by each handle's own label (0 -> Stayed, 1 -> Left) instead of
# passing a bare list of labels, which is matched to artists purely by drawing order and
# can attach the wrong name to a colour.
handles, labels = scatter_ax.get_legend_handles_labels()
legend_names = {'0': 'Stayed', '1': 'Left'}
scatter_ax.legend(handles, [legend_names.get(label, label) for label in labels])
plt.title('Monthly Hours by Satisfaction Level', fontsize='14');

In [None]:
# More plots, this time looking at satisfaction level and tenure

# Set figure and axes (two panels side by side)
fig, ax = plt.subplots(1, 2, figsize = (22,8))

# Display boxplot showing distributions of `satisfaction_level` by tenure, comparing employees who stayed versus those who left
sns.boxplot(data=df1, x='satisfaction_level', y='tenure', hue='left', orient="h", ax=ax[0])
ax[0].invert_yaxis()
ax[0].set_title('Satisfaction by Tenure', fontsize='14')

# Display histogram showing distribution of `tenure`, comparing employees who stayed versus those who left
# (the histogram is drawn straight from df1 with `hue='left'`, so no per-group subsets are needed)
sns.histplot(data=df1, x='tenure', hue='left', multiple='dodge', shrink=5, ax=ax[1])
ax[1].set_title('Tenure Histogram', fontsize='14')

# Display the plots
plt.show()

In [None]:
# Calculate mean and median satisfaction scores for employees who left and employees who stayed
# 0 = stayed, 1 = left
# The aggregations are given by name: passing the np.mean / np.median callables to `agg`
# raises a FutureWarning in pandas 2.x, while the string names yield the same
# 'mean' and 'median' columns.
df1.groupby(['left'])['satisfaction_level'].agg(['mean', 'median'])

In [None]:
# Plots of tenure vs salary

# Set figure and axes
fig, ax = plt.subplots(1, 2, figsize = (22,8))

# Define short-tenured employees
tenure_short = df1[df1['tenure'] < 7]

# Define long-tenured employees
tenure_long = df1[df1['tenure'] > 6]

# Plot short-tenured histogram
sns.histplot(data=tenure_short, x='tenure', hue='salary', discrete=1, 
             hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=.5, ax=ax[0])
ax[0].set_title('Salary Histogram by Tenure: 6 Years or Less', fontsize='14')

# Plot long-tenured histogram
sns.histplot(data=tenure_long, x='tenure', hue='salary', discrete=1, 
             hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=.4, ax=ax[1])
ax[1].set_title('Salary Histogram by Tenure: 7 Years or More', fontsize='14');

In [None]:
# Display a scatterplot of `average_monthly_hours` versus `last_evaluation`
plt.figure(figsize=(16, 9))
scatter_ax = sns.scatterplot(data=df1, x='average_monthly_hours', y='last_evaluation', hue='left', alpha=0.4)
scatter_ax.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--')

# Rename the legend entries by each handle's own label (0 -> Stayed, 1 -> Left) instead of
# passing a bare list of labels, which is matched to artists purely by drawing order and
# can attach the wrong name to a colour.
handles, labels = scatter_ax.get_legend_handles_labels()
legend_names = {'0': 'Stayed', '1': 'Left'}
scatter_ax.legend(handles, [legend_names.get(label, label) for label in labels])
plt.title('Monthly Hours by Last Evaluation Score', fontsize='14');

In [None]:
# Display plot of the relationship between `average_monthly_hours` and `promotion_last_5years`
plt.figure(figsize=(16, 3))
scatter_ax = sns.scatterplot(data=df1, x='average_monthly_hours', y='promotion_last_5years', hue='left', alpha=0.4)
# Label the reference line itself so the legend entry is tied to the line, not to drawing order
scatter_ax.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--')

# Rename the legend entries by each handle's own label (0 -> Stayed, 1 -> Left) instead of
# passing a bare list of labels, which is matched to artists purely by drawing order and
# can attach the wrong name to a colour.
handles, labels = scatter_ax.get_legend_handles_labels()
legend_names = {'0': 'Stayed', '1': 'Left'}
scatter_ax.legend(handles, [legend_names.get(label, label) for label in labels])
plt.title('Monthly Hours by Promotion in Last 5 Years', fontsize='14');

In [None]:
# Display counts of employees by department
df1["department"].value_counts()

In [None]:
# Display a histogram to show the distribution of employees who left and those who didn't by department
plt.figure(figsize=(11,8))
sns.histplot(data=df1, x='department', hue='left', discrete=1, 
             hue_order=[0, 1], multiple='dodge', shrink=.5)
plt.xticks(rotation=45)
plt.title('Counts of Stayed and Left by Department', fontsize=14);

In [None]:
# Display a heatmap of the correlation
corr = df1.corr(numeric_only = 1)
plt.figure(figsize=(16, 9))
heatmap = sns.heatmap(corr, vmin=None, vmax=None, annot=True, cmap=sns.color_palette("vlag", as_cmap=True))
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':14}, pad=12);

In [None]:
# --------------------------------------------------
# Step 4: Build and Test a Logistic Regression Model
# --------------------------------------------------

# Copy the dataframe
df_enc = df1.copy()

# Encode the `salary` column as an ordinal numeric category
df_enc['salary'] = (
    df_enc['salary'].astype('category')
    .cat.set_categories(['low', 'medium', 'high'])
    .cat.codes
)

# Dummy encode the `department` column
df_enc = pd.get_dummies(df_enc, drop_first=False)

# Display the new dataframe
df_enc.head()

In [None]:
# Display a heatmap to visualize how correlated variables are
plt.figure(figsize=(8, 6))
sns.heatmap(df_enc[['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 'tenure']]
            .corr(), annot=True, cmap="crest")
plt.title('Heatmap of the Dataset')
plt.show()

In [None]:
# Display a stacked bar chart of number of employees by department, comparing those who left with those who stayed
# 0 = stayed, 1 = left
pd.crosstab(df1['department'], df1['left']).plot(kind ='bar',stacked=True)
plt.title('Counts of Employees Who Left Vs. Stayed by Department')
plt.ylabel('Employee count')
plt.xlabel('Department')
plt.show()

In [None]:
# Select the rows without outliers in `tenure` and save resulting dataframe in a new variable
# Logistic regression is sensitive to outliers, so best not to use them
df_logreg = df_enc[(df_enc['tenure'] >= lower_limit) & (df_enc['tenure'] <= upper_limit)]

# Display first few rows of new dataframe
df_logreg.head()

In [None]:
# Specify the outcome (variable) to predict (y)
y = df_logreg['left']

# Display first few rows of y
y.head() 

In [None]:
# Select the features (X)
X = df_logreg.drop('left', axis=1)

# Display the first few rows of X 
X.head()

In [None]:
# Split the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [None]:
# Build logistic regression model and fit it to the training data
log_clf = LogisticRegression(random_state=42, max_iter=500).fit(X_train, y_train)

In [None]:
# Use the logistic regression model to get predictions from the test data
y_pred = log_clf.predict(X_test)

In [None]:
# Create a confusion matrix

# Calculate values for confusion matrix
log_cm = confusion_matrix(y_test, y_pred, labels=log_clf.classes_)

# Create display of confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm, 
                                  display_labels=log_clf.classes_)

# Plot confusion matrix
log_disp.plot(values_format='')

# Display plot
plt.show()

In [None]:
# Check the class balance of the target: share of employees who stayed (0) vs. left (1) in the logistic regression data
df_logreg['left'].value_counts(normalize=True)

In [None]:
# Create classification report for logistic regression model
target_names = ['Predicted would not leave', 'Predicted would leave']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# ---------------------------------------------------------------------------
# Step 5: Build and Test a Tree-Based Model (Decision Tree and Random Forest)
# ---------------------------------------------------------------------------

# Select the outcome (variable) to predict (y)
y = df_enc['left']

# Display the first few rows of y
y.head()

In [None]:
# Select the features (X)
X = df_enc.drop('left', axis=1)

# Display the first few rows of X
X.head()

In [None]:
# Split the data into a training set and a test set (validation is done later via cross-validation on the training set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

In [None]:
# Create decision tree - Round 1

# Instantiate model
tree = DecisionTreeClassifier(random_state=0)

# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth':[4, 6, 8, None],
             'min_samples_leaf': [2, 5, 1],
             'min_samples_split': [2, 4, 6]
             }

# Assign a list of scoring metrics to capture.
# A list is used rather than a set: GridSearchCV documents str, callable, list, tuple or dict
# for `scoring`, and recent scikit-learn versions validate the type and reject a set.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Instantiate GridSearch: 4-fold cross-validation, final model refit on the best ROC AUC
tree1 = GridSearchCV(tree, cv_params, scoring=scoring, cv=4, refit='roc_auc')

In [None]:
# Fit the model to the training data

tree1.fit(X_train, y_train)

In [None]:
# Identify the best parameters
tree1.best_params_

In [None]:
# Identify the best area under the curve (AUC) score on CV
tree1.best_score_

In [None]:
# Create a function that will provide all the scores from the grid search

def make_results(model_name:str, model_object, metric:str):
    '''
    Summarize the cross-validation scores of the best candidate of a grid search.

    Arguments:
        model_name (string): what you want the model to be called in the output table
        model_object: a fit GridSearchCV object; only its `cv_results_` attribute is read,
            and it must contain the mean_test_precision, mean_test_recall, mean_test_f1,
            mean_test_accuracy and mean_test_roc_auc entries
        metric (string): precision, recall, f1, accuracy, or auc

    Returns a one-row pandas df with the model name and the precision, recall, F1,
    accuracy, and auc scores for the candidate with the best mean 'metric' score
    across all validation folds.

    Raises KeyError if `metric` is not one of the names listed above.
    '''

    # Create dictionary that maps input metric to actual metric name in GridSearchCV
    metric_dict = {'auc': 'mean_test_roc_auc',
                   'precision': 'mean_test_precision',
                   'recall': 'mean_test_recall',
                   'f1': 'mean_test_f1',
                   'accuracy': 'mean_test_accuracy'
                  }

    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

    # Isolate the row of the dataframe with the max score.
    # idxmax() returns an index *label*, so the row is selected with .loc (label-based)
    # rather than .iloc (position-based); the two only coincide for a default RangeIndex.
    best_estimator_results = cv_results.loc[cv_results[metric_dict[metric]].idxmax()]

    # Create table of results from the mean CV scores stored in that row
    table = pd.DataFrame({'model': [model_name],
                          'precision': [best_estimator_results['mean_test_precision']],
                          'recall': [best_estimator_results['mean_test_recall']],
                          'F1': [best_estimator_results['mean_test_f1']],
                          'accuracy': [best_estimator_results['mean_test_accuracy']],
                          'auc': [best_estimator_results['mean_test_roc_auc']]
                        })

    return table

In [None]:
# Display the scores
tree1_cv_results = make_results('decision tree cv', tree1, 'auc')
tree1_cv_results

In [None]:
# Construct a random forest model

# Instantiate model
rf = RandomForestClassifier(random_state=0)

# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [3,5, None],
             'max_features': [1.0],
             'max_samples': [0.7, 1.0],
             'min_samples_leaf': [1,2,3],
             'min_samples_split': [2,3,4],
             'n_estimators': [300, 500],
             }

# Assign a list of scoring metrics to capture.
# A list is used rather than a set: GridSearchCV documents str, callable, list, tuple or dict
# for `scoring`, and recent scikit-learn versions validate the type and reject a set.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Instantiate GridSearch: 4-fold cross-validation, final model refit on the best ROC AUC
rf1 = GridSearchCV(rf, cv_params, scoring=scoring, cv=4, refit='roc_auc')

In [None]:
# fit the model to the training data

rf1.fit(X_train, y_train)

In [None]:
# Specify where to save the model
# Raw string: the backslashes are literal Windows path separators, not escape sequences
# (the non-raw form relies on invalid escapes such as '\O' and '\P', which newer Python
# versions warn about). The resulting value is unchanged.
# NOTE: there is no trailing separator, and write_pickle/read_pickle build the file name as
# path + name + '.pickle', so files are stored as 'Pickle<name>.pickle' inside the first
# 'Pickle' folder.
path = r'D:\OneDrive\Documents\Advanced Capstone\Pickle\Pickle'

In [None]:
# Set up a function to save the model

def write_pickle(path, model_object, save_as:str):
    '''
    Pickle a model to the file `<path><save_as>.pickle`.

    In:
        path:         prefix of the destination; it is joined to the file name by plain
                      string concatenation, so include a trailing separator if the file
                      should land inside that folder
        model_object: the object to serialize
        save_as:      file name without the '.pickle' extension

    Out: nothing is returned; the pickle file is written (an existing file is overwritten)
    '''
    destination = path + save_as + '.pickle'

    # Binary mode is required by pickle; the context manager closes the file on exit
    with open(destination, 'wb') as handle:
        pickle.dump(model_object, handle)

In [None]:
# Set up a function to load the model

def read_pickle(path, saved_model_name:str):
    '''
    Load a pickled model from the file `<path><saved_model_name>.pickle`.

    In:
        path:             prefix of the source location; it is joined to the file name by
                          plain string concatenation, so include a trailing separator if
                          the file lives inside that folder
        saved_model_name: file name of the pickled model without the '.pickle' extension

    Out:
        the unpickled object

    Note: unpickling can execute arbitrary code, so only load files you created yourself.
    '''
    source = path + saved_model_name + '.pickle'

    # Binary mode is required by pickle; the context manager closes the file on exit
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Save the model (write pickle)
write_pickle(path, rf1, 'hr_rf1')

In [None]:
# Load the model (read pickle)
rf1 = read_pickle(path, 'hr_rf1')

In [None]:
# Display best area under the curve (AUC) score
rf1.best_score_

In [None]:
# Display the best parameters
rf1.best_params_

In [None]:
# Display all CV scores
rf1_cv_results = make_results('random forest cv', rf1, 'auc')
print(tree1_cv_results)
print(rf1_cv_results)

In [None]:
# Define a function that will get the scores for the model's predictions

def get_scores(model_name:str, model, X_test_data, y_test_data):
    '''
    Generate a table of test scores.

    In:
        model_name (string):  How you want your model to be named in the output table
        model:                A fit search object exposing `best_estimator_`
                              (e.g. GridSearchCV fitted with refit enabled)
        X_test_data:          features of the test set, in the layout used for fitting
        y_test_data:          true binary labels of the test set

    Out: one-row pandas df with the columns model, precision, recall, f1, accuracy
         and AUC for your model
    '''

    best_model = model.best_estimator_

    # Hard class predictions feed the threshold-based metrics
    preds = best_model.predict(X_test_data)

    # ROC AUC is a ranking metric, so it is computed from continuous scores (probability of
    # the positive class, or the decision function) to be comparable with the 'roc_auc'
    # cross-validation score. Scoring it on 0/1 predictions collapses the curve to a single
    # threshold; that is only used as a last resort.
    if hasattr(best_model, 'predict_proba'):
        scores = best_model.predict_proba(X_test_data)[:, 1]
    elif hasattr(best_model, 'decision_function'):
        scores = best_model.decision_function(X_test_data)
    else:
        scores = preds

    auc = roc_auc_score(y_test_data, scores)
    accuracy = accuracy_score(y_test_data, preds)
    precision = precision_score(y_test_data, preds)
    recall = recall_score(y_test_data, preds)
    f1 = f1_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                          'precision': [precision],
                          'recall': [recall],
                          'f1': [f1],
                          'accuracy': [accuracy],
                          'AUC': [auc]
                         })

    return table

In [None]:
# Use the test data to make predictions

rf1_test_scores = get_scores('random forest1 test', rf1, X_test, y_test)
rf1_test_scores

In [None]:
# -----------------------------------------------------------------
# Step 6: Feature Engineering to Validate Random Forest Performance
# -----------------------------------------------------------------

# Drop `satisfaction_level` and save resulting dataframe in new variable
df2 = df_enc.drop('satisfaction_level', axis=1)

# Display first 10 rows of new dataframe
df2.head(10)

In [None]:
# Create `overworked` column. For the moment, it's the same as average monthly hours.
df2['overworked'] = df2['average_monthly_hours']

# Display min and max values for overworked
print('Min hours:', df2['overworked'].min())
print('Max hours:', df2['overworked'].max())

In [None]:
# Define `overworked` as working more than 175 hrs per month (avg is about 166.67)
# The flag is derived from `average_monthly_hours` rather than from `overworked` itself so
# that re-running this cell gives the same result (comparing an already-binarized 0/1
# column against 175 would silently turn every value into 0).
df2['overworked'] = (df2['average_monthly_hours'] > 175).astype(int)

# Display first few rows of new column
df2['overworked'].head()

In [None]:
# Drop the `average_monthly_hours` column
df2 = df2.drop('average_monthly_hours', axis=1)

# Display first few rows of resulting dataframe
df2.head()

In [None]:
# Select the outcome (variable) to predict (y)
y = df2['left']

# Select the features (X)
X = df2.drop('left', axis=1)

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

In [None]:
# Create decision tree - Round 2
tree = DecisionTreeClassifier(random_state=0)

# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth':[4, 6, 8, None],
             'min_samples_leaf': [2, 5, 1],
             'min_samples_split': [2, 4, 6]
             }

# Assign a list of scoring metrics to capture.
# A list is used rather than a set: GridSearchCV documents str, callable, list, tuple or dict
# for `scoring`, and recent scikit-learn versions validate the type and reject a set.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Instantiate GridSearch: 4-fold cross-validation, final model refit on the best ROC AUC
tree2 = GridSearchCV(tree, cv_params, scoring=scoring, cv=4, refit='roc_auc')

In [None]:
# Fit the model to the training data
tree2.fit(X_train, y_train)

In [None]:
# Display the best parameters
tree2.best_params_

In [None]:
# Display the best AUC score on CV
tree2.best_score_

In [None]:
# Get all CV scores
tree2_cv_results = make_results('decision tree2 cv', tree2, 'auc')
print(tree1_cv_results)
print(tree2_cv_results)

In [None]:
# Set up random forest model
rf = RandomForestClassifier(random_state=0)

# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [3,5, None],
             'max_features': [1.0],
             'max_samples': [0.7, 1.0],
             'min_samples_leaf': [1,2,3],
             'min_samples_split': [2,3,4],
             'n_estimators': [300, 500],
             }

# Assign a list of scoring metrics to capture.
# A list is used rather than a set: GridSearchCV documents str, callable, list, tuple or dict
# for `scoring`, and recent scikit-learn versions validate the type and reject a set.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Instantiate GridSearch: 4-fold cross-validation, final model refit on the best ROC AUC
rf2 = GridSearchCV(rf, cv_params, scoring=scoring, cv=4, refit='roc_auc')

In [None]:
# Fit the model to the training data
rf2.fit(X_train, y_train)

In [None]:
# Save model (write pickle)
write_pickle(path, rf2, 'hr_rf2')

In [None]:
# Load model (read pickle)
rf2 = read_pickle(path, 'hr_rf2')

In [None]:
# Display best parameters
rf2.best_params_

In [None]:
# Display best AUC score on CV
rf2.best_score_

In [None]:
# Display all CV scores
rf2_cv_results = make_results('random forest2 cv', rf2, 'auc')
print(tree2_cv_results)
print(rf2_cv_results)

In [None]:
# Get predictions on test data
rf2_test_scores = get_scores('random forest2 test', rf2, X_test, y_test)
rf2_test_scores

In [None]:
# Generate values for confusion matrix
preds = rf2.best_estimator_.predict(X_test)
cm = confusion_matrix(y_test, preds, labels=rf2.classes_)

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                             display_labels=rf2.classes_)
disp.plot(values_format='');

In [None]:
# Plot the tree (only the first 6 levels are drawn)
plt.figure(figsize=(85,20))
# `feature_names` and `class_names` are passed as plain lists of strings, the form
# documented for plot_tree (a pandas Index / a dict is rejected by some scikit-learn versions).
# `class_names` follow the ascending order of the class labels: 0 -> 'stayed', 1 -> 'left'.
plot_tree(tree2.best_estimator_, max_depth=6, fontsize=14, feature_names=list(X.columns),
          class_names=['stayed', 'left'], filled=True)
plt.show()

In [None]:
# Display the feature importance for the decision tree

tree2_importances = pd.DataFrame(tree2.best_estimator_.feature_importances_, 
                                 columns=['gini_importance'], 
                                 index=X.columns
                                )
tree2_importances = tree2_importances.sort_values(by='gini_importance', ascending=False)

# Only extract the features with importances > 0
tree2_importances = tree2_importances[tree2_importances['gini_importance'] != 0]
tree2_importances

In [None]:
# Visualize feature importance

sns.barplot(data=tree2_importances, x="gini_importance", y=tree2_importances.index, orient='h', color='Blue')
plt.title("Decision Tree: Feature Importance in Predicting Employees Leaving", fontsize=11)
plt.ylabel("Feature")
plt.xlabel("Importance")
plt.show()

In [None]:
# Visualize random forest feature importance

# Get feature importances of the refit best model (read once and reused below)
feat_impt = rf2.best_estimator_.feature_importances_

# Get indices of the (up to) 10 most important features, ordered from least to most important.
# argsort + slice works for any number of features, whereas np.argpartition(..., -10)
# raises a ValueError when fewer than 10 features are available.
ind = np.argsort(feat_impt)[-10:]

# Get column labels of the selected features
feat = X.columns[ind]

# Filter `feat_impt` to consist of the selected feature importances
feat_impt = feat_impt[ind]

# Assemble the plotting table; sorting ascending puts the most important feature at the top
# of the horizontal bar chart
y_df = pd.DataFrame({"Feature":feat,"Importance":feat_impt})
y_sort_df = y_df.sort_values("Importance")
fig, ax1 = plt.subplots()

y_sort_df.plot(kind='barh',ax=ax1,x="Feature",y="Importance", color='orange')

ax1.set_title("Random Forest: Feature Importance in Predicting Employees Leaving", fontsize=11)
ax1.set_ylabel("Feature")
ax1.set_xlabel("Importance")

plt.show()