# Predicting Probability of Quitting and Providing Prevention Hints
In this notebook, we'll use an XGBoost model to predict the likelihood of quitting (based on the 'left' column) and identify the factors contributing to quitting to provide actionable hints for prevention.

## 1. Import Necessary Libraries

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, roc_curve, auc
from xgboost import XGBClassifier
import numpy as np

## 2. Load the Dataset

In [28]:
# Load the dataset
file_path = 'D:\\Python_Projects\\attrition_predictor\\data\\HR_Dataset.csv'
df = pd.read_csv(file_path)

# Display the first few rows and check for missing data
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [29]:
# Check for missing data and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Departments            14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


## 3. Prepare the Data

In [30]:
# Define the feature columns and the target column
X = df.drop(columns=['left'])
y = df['left']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Separate ordinal and nominal columns
ordinal_cols = ['salary']
nominal_cols = [col for col in categorical_cols if col not in ordinal_cols]

# Define the ordinal encoder for the 'salary' column
salary_categories = ['low', 'medium', 'high']
ordinal_encoder = OrdinalEncoder(categories=[salary_categories])

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('ord', ordinal_encoder, ordinal_cols),
        ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_cols)
    ]
)

# Define the model pipeline with XGBoost
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

## 4. Apply Cross-Validation

In [31]:
# Apply cross-validation
cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='roc_auc')

print("Cross-Validation ROC AUC Scores:", cv_scores)
print("Mean ROC AUC Score:", np.mean(cv_scores))

Cross-Validation ROC AUC Scores: [0.9995287  0.98498901 0.98737597 0.99940755 0.99875635]
Mean ROC AUC Score: 0.9940115147921563


## 5. Split Data and Train the Model

In [32]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

## 6. View Transformed `X_train`

In [33]:
# Transform X_train using the preprocessor
X_train_transformed = model_pipeline.named_steps['preprocessor'].transform(X_train)

# Get the feature names
feature_names_num = numerical_cols.tolist()
feature_names_ord = ordinal_cols
feature_names_nom = model_pipeline.named_steps['preprocessor'].named_transformers_['nom'].get_feature_names_out(nominal_cols)

all_feature_names = np.concatenate([feature_names_num, feature_names_ord, feature_names_nom])

# Create a DataFrame with the transformed features
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=all_feature_names)
X_train_transformed_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,Departments_IT,Departments_RandD,Departments_accounting,Departments_hr,Departments_management,Departments_marketing,Departments_product_mng,Departments_sales,Departments_support,Departments_technical
0,-2.062054,0.721487,2.58706,0.983762,0.340625,-0.413022,-0.148657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.175758,-0.212345,0.159755,-0.940732,-0.343557,2.421176,-0.148657,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.898169,-1.379634,-1.458448,-0.940732,-0.343557,-0.413022,-0.148657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.188797,-0.212345,0.159755,-1.381761,0.340625,-0.413022,-0.148657,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.827591,1.480225,0.968857,1.24437,1.024807,-0.413022,-0.148657,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 7. Predict Probabilities and Evaluate the Model

In [34]:
# Predict probabilities
y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")
print(classification_report(y_test, model_pipeline.predict(X_test)))

# Example: Display the probabilities alongside predictions
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted_Probabilities': y_pred_proba,
    'Predicted_Label': model_pipeline.predict(X_test)
})
results.head()

ROC AUC Score: 0.9911021395356937
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3428
           1       0.99      0.95      0.97      1072

    accuracy                           0.99      4500
   macro avg       0.99      0.98      0.98      4500
weighted avg       0.99      0.99      0.99      4500



Unnamed: 0,Actual,Predicted_Probabilities,Predicted_Label
6723,0,0.000283,0
6473,0,0.000685,0
4679,0,0.000736,0
862,1,0.999067,1
7286,0,0.000767,0


## 8. Create ROC Curve Graph

In [35]:
# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

  plt.show()


## 9. Analyze Feature Importance

In [36]:
# Extract feature names from the preprocessing pipeline
feature_names_num = numerical_cols.tolist()
feature_names_ord = ordinal_cols
feature_names_nom = model_pipeline.named_steps['preprocessor'].named_transformers_['nom'].get_feature_names_out(nominal_cols)

all_feature_names = np.concatenate([feature_names_num, feature_names_ord, feature_names_nom])

# Extract feature importance from the XGBoost model
feature_importances = model_pipeline.named_steps['classifier'].feature_importances_

# Create a DataFrame for visualization
importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance for Quitting Prediction')
plt.show()

importance_df.head(10)

  plt.show()


Unnamed: 0,Feature,Importance
4,time_spend_company,0.254929
0,satisfaction_level,0.237916
2,number_project,0.156106
1,last_evaluation,0.078294
5,Work_accident,0.041487
3,average_montly_hours,0.039974
11,Departments_hr,0.025524
7,salary,0.023017
17,Departments_technical,0.021038
13,Departments_marketing,0.020067


## 10. Provide Recommendations Based on Feature Importance
Based on the most important features influencing the quitting decision, we can give hints to prevent quitting.

In [37]:
# Top features for recommendations
top_features = importance_df.head(5)['Feature'].values

recommendations = {
    'satisfaction_level': "Increase employee satisfaction by providing incentives, recognition, and improving work-life balance.",
    'time_spend_company': "Offer career growth opportunities and recognize achievements to retain long-term employees.",
    'number_project': "Distribute project workloads more evenly to prevent burnout.",
    'average_montly_hours': "Encourage employees to manage work hours effectively, emphasizing work-life balance.",
    'last_evaluation': "Provide constructive feedback and professional development based on performance evaluations.",
    'salary': "Offer competitive salaries to improve employee satisfaction and retention."
}

for feature in top_features:
    print(f"Feature: {feature}")
    print(f"Hint: {recommendations.get(feature, 'No specific recommendation available.')}")
    print('-' * 80)

Feature: time_spend_company
Hint: Offer career growth opportunities and recognize achievements to retain long-term employees.
--------------------------------------------------------------------------------
Feature: satisfaction_level
Hint: Increase employee satisfaction by providing incentives, recognition, and improving work-life balance.
--------------------------------------------------------------------------------
Feature: number_project
Hint: Distribute project workloads more evenly to prevent burnout.
--------------------------------------------------------------------------------
Feature: last_evaluation
Hint: Provide constructive feedback and professional development based on performance evaluations.
--------------------------------------------------------------------------------
Feature: Work_accident
Hint: No specific recommendation available.
--------------------------------------------------------------------------------
