# Predicting Probability of Quitting and Providing Prevention Hints

## 1. Import Necessary Libraries

In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, roc_curve, auc, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

## 2. Load the Dataset and Inspect Feature Correlations

In [39]:
# Load the dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file_path = 'D:\\Python_Projects\\attrition_predictor\\data\\HR_Dataset.csv'
df = pd.read_csv(file_path)

# Strip leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Check column names
print(df.columns)

# Correct column names
department_col = 'Departments'
salary_col = 'salary'

# Encode categorical features into numerical values for correlation analysis
df_encoded = df.copy()
# Departments has no natural order, so its integer codes are arbitrary labels;
# read its correlation coefficients with that in mind.
df_encoded[department_col] = df_encoded[department_col].astype('category').cat.codes
# Salary is ordinal. Plain category codes are assigned alphabetically
# (high=0, low=1, medium=2), which scrambles the order and makes the
# correlation meaningless, so encode it explicitly as low < medium < high.
# Values outside these levels are encoded as -1.
salary_levels = ['low', 'medium', 'high']
df_encoded[salary_col] = pd.Categorical(
    df_encoded[salary_col], categories=salary_levels, ordered=True
).codes

# Calculate the correlation matrix
corr_matrix = df_encoded.corr(numeric_only=True)

# Display the heatmap with annotations for all cells
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar=True, linewidths=0.5, linecolor='white')
plt.title('Correlation Matrix with Annotations in All Cells')
plt.show()


Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Departments', 'salary'],
      dtype='object')


## 3. Distribution and Relationships

In [40]:
# Distribution of target variable, drawn on an explicitly created axes
fig_target, ax_target = plt.subplots(figsize=(6, 4))
sns.countplot(x='left', data=df, ax=ax_target)
ax_target.set_title('Distribution of Employees Leaving')
ax_target.set_xlabel('Left Company')
ax_target.set_ylabel('Count')
plt.show()

In [41]:
# Visualize relationships between features and target:
# one boxplot per numeric feature, split by the 'left' flag.
features = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.flatten()

for ax, feature in zip(axs, features):
    sns.boxplot(x='left', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} vs Left')

# The 2x3 grid has more panels than there are features; hide the leftover
# axes so the figure does not show an empty frame.
for unused_ax in axs[len(features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [42]:
# Visualize categorical features: per-category counts split by the 'left' flag,
# one panel per categorical column.
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
panel_specs = [
    ('salary', 'Salary vs Left'),
    ('Departments', 'Departments vs Left'),
]
for panel, (column, panel_title) in zip(axs, panel_specs):
    sns.countplot(x=column, hue='left', data=df, ax=panel)
    panel.set_title(panel_title)
plt.tight_layout()
plt.show()

## 4. Prepare the Data

In [43]:
# Classification task: predict 'left' from every other column
y_class = df['left']
X_class = df.drop(columns=['left'])

# Regression task: predict 'satisfaction_level' without using 'left'
y_reg = df['satisfaction_level']
X_reg = df.drop(columns=['satisfaction_level', 'left'])

# Identify categorical and numerical columns
categorical_cols = X_class.select_dtypes(include=['object']).columns
numerical_cols = X_class.select_dtypes(include=['int64', 'float64']).columns

# 'salary' is treated as ordinal; every other categorical column is nominal
ordinal_cols = ['salary']
nominal_cols = [col for col in categorical_cols if col not in ordinal_cols]

# Ordinal encoder with an explicit low < medium < high ordering
salary_categories = ['low', 'medium', 'high']
ordinal_encoder = OrdinalEncoder(categories=[salary_categories])


def _build_preprocessor(numeric_columns):
    """Return a ColumnTransformer that scales `numeric_columns`,
    ordinal-encodes `ordinal_cols` and one-hot encodes `nominal_cols`."""
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_columns),
            ('ord', ordinal_encoder, ordinal_cols),
            ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_cols)
        ]
    )


# Preprocessing pipeline for classification
preprocessor_class = _build_preprocessor(numerical_cols)

# Preprocessing pipeline for regression (numeric columns taken from X_reg,
# which no longer contains the regression target)
preprocessor_reg = _build_preprocessor(
    X_reg.select_dtypes(include=['int64', 'float64']).columns
)

# 70/30 train/test splits with a fixed seed, one per task
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, test_size=0.3, random_state=42
)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

## 5. Resampling Techniques for Classification

In [44]:
# Function to resample data using different techniques
def resample_data(X, y, sampler):
    """Resample (X, y) with `sampler` and report the new class balance.

    Parameters:
        X: feature matrix passed to ``sampler.fit_resample``.
        y: target labels passed to ``sampler.fit_resample``.
        sampler: object exposing ``fit_resample(X, y)`` that returns a
            ``(features, labels)`` pair.

    Returns:
        The ``(features, labels)`` pair produced by the sampler.
    """
    features_out, labels_out = sampler.fit_resample(X, y)
    class_counts = Counter(labels_out)
    print(f"Resampled dataset shape: {class_counts}")
    return features_out, labels_out

# Apply resampling techniques: every sampler is seeded identically
sampler_classes = [
    ('Random OverSampler', RandomOverSampler),
    ('SMOTE', SMOTE),
    ('Borderline SMOTE', BorderlineSMOTE),
    ('ADASYN', ADASYN),
]
samplers = {label: sampler_cls(random_state=42) for label, sampler_cls in sampler_classes}

# Fit the classification preprocessor on the training split only
X_class_train_transformed = preprocessor_class.fit_transform(X_class_train)

# Map each technique name to its (X, y) resampled training pair
resampled_datasets = {}
for name in samplers:
    print(f'Applying {name}...')
    resampled_datasets[name] = resample_data(
        X_class_train_transformed, y_class_train, samplers[name]
    )
    print('-' * 40)

# Also include the original (not resampled) training data as a baseline
resampled_datasets['Original'] = (X_class_train_transformed, y_class_train)

Applying Random OverSampler...
Resampled dataset shape: Counter({1: 8000, 0: 8000})
----------------------------------------
Applying SMOTE...
Resampled dataset shape: Counter({1: 8000, 0: 8000})
----------------------------------------
Applying Borderline SMOTE...
Resampled dataset shape: Counter({1: 8000, 0: 8000})
----------------------------------------
Applying ADASYN...
Resampled dataset shape: Counter({0: 8000, 1: 7958})
----------------------------------------


## 6. Build Classification Models and Compare Performance

In [45]:
# Function to evaluate different classification models
def evaluate_classification_models(X_train, y_train, X_test, y_test):
    """Fit each candidate classifier and score it by ROC AUC on the test data.

    Parameters:
        X_train, y_train: data every candidate is fitted on.
        X_test, y_test: data used for scoring.

    Returns:
        dict mapping model name -> ROC AUC, computed from column 1 of
        ``predict_proba`` on ``X_test``. Each score is also printed.
    """
    candidates = [
        ('Logistic Regression', LogisticRegression()),
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')),
        ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
    ]

    scores = {}
    for label, estimator in candidates:
        fitted = Pipeline([('model', estimator)])
        fitted.fit(X_train, y_train)
        second_class_proba = fitted.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, second_class_proba)
        scores[label] = score
        print(f'{label}: ROC AUC Score = {score:.3f}')
    return scores

# Evaluate models for each resampling technique.
# The test split goes through the already-fitted preprocessor (transform only).
X_class_test_transformed = preprocessor_class.transform(X_class_test)
model_results = {}

for name, resampled_pair in resampled_datasets.items():
    X_resampled, y_resampled = resampled_pair
    print(f'### Evaluating models for {name} dataset ###')
    model_results[name] = evaluate_classification_models(
        X_resampled,
        y_resampled,
        X_class_test_transformed,
        y_class_test,
    )
    print('-' * 50)

### Evaluating models for Random OverSampler dataset ###
Logistic Regression: ROC AUC Score = 0.825
Random Forest: ROC AUC Score = 0.991
XGBoost: ROC AUC Score = 0.991
Gradient Boosting: ROC AUC Score = 0.989
K-Nearest Neighbors: ROC AUC Score = 0.966
--------------------------------------------------
### Evaluating models for SMOTE dataset ###
Logistic Regression: ROC AUC Score = 0.825
Random Forest: ROC AUC Score = 0.990
XGBoost: ROC AUC Score = 0.992
Gradient Boosting: ROC AUC Score = 0.987
K-Nearest Neighbors: ROC AUC Score = 0.968
--------------------------------------------------
### Evaluating models for Borderline SMOTE dataset ###
Logistic Regression: ROC AUC Score = 0.782
Random Forest: ROC AUC Score = 0.991
XGBoost: ROC AUC Score = 0.992
Gradient Boosting: ROC AUC Score = 0.986
K-Nearest Neighbors: ROC AUC Score = 0.966
--------------------------------------------------
### Evaluating models for ADASYN dataset ###
Logistic Regression: ROC AUC Score = 0.796
Random Forest: ROC

## 7. Identify Best Classification Model and Resampling Technique

In [46]:
# Find the best classification model and resampling technique:
# scan every (resampling technique, model) pair and keep the highest ROC AUC.
best_auc = 0
best_class_model = None
best_class_sampler = None
best_class_model_name = None

# Loop names are deliberately distinct from `auc` (imported from
# sklearn.metrics), `sampler` and `models`, so the scan does not overwrite
# those names in the notebook namespace.
for sampler_name, sampler_scores in model_results.items():
    for model_name, roc_auc_value in sampler_scores.items():
        # Strict '>' keeps the first pair encountered when scores tie
        if roc_auc_value > best_auc:
            best_auc = roc_auc_value
            best_class_sampler = sampler_name
            best_class_model_name = model_name
            # NOTE: model_results stores only scores, so this holds the
            # winning ROC AUC value, not a fitted estimator.
            best_class_model = sampler_scores[model_name]

print(f'Best Classification Model: {best_class_model_name} with {best_class_sampler} (ROC AUC Score = {best_auc:.3f})')

Best Classification Model: XGBoost with Borderline SMOTE (ROC AUC Score = 0.992)


## 8. Feature Importance Analysis for Classification

In [47]:
# Extract feature names from the preprocessing pipeline for classification.
# Order matches the ColumnTransformer: scaled numerics, ordinal salary, one-hot nominals.
feature_names_num = numerical_cols.tolist()
feature_names_ord = ordinal_cols
feature_names_nom = preprocessor_class.named_transformers_['nom'].get_feature_names_out(nominal_cols)

all_feature_names = np.concatenate([feature_names_num, feature_names_ord, feature_names_nom])

# Rebuild the winning model. Only estimators that expose `feature_importances_`
# can be analysed here, so fail early with a clear message for any other winner
# instead of hitting an AttributeError on a None estimator further down.
importance_capable_classifiers = {
    'Random Forest': lambda: RandomForestClassifier(random_state=42),
    'XGBoost': lambda: XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'Gradient Boosting': lambda: GradientBoostingClassifier(random_state=42),
}
if best_class_model_name not in importance_capable_classifiers:
    raise ValueError(
        f"Feature importance is not available for '{best_class_model_name}'; "
        f"supported models: {sorted(importance_capable_classifiers)}"
    )
best_class_model_instance = importance_capable_classifiers[best_class_model_name]()

pipeline_class = Pipeline([
    ('classifier', best_class_model_instance)
])

# Ensure resampled data is a DataFrame with readable column names
X_resampled, y_resampled = resampled_datasets[best_class_sampler]
if hasattr(X_resampled, 'toarray'):
    # A sparse matrix cannot be wrapped in a DataFrame directly; densify first
    X_resampled = X_resampled.toarray()
X_resampled_df = pd.DataFrame(X_resampled, columns=all_feature_names)

# Fit the pipeline
pipeline_class.fit(X_resampled_df, y_resampled)

# Extract feature importance
feature_importances = pipeline_class.named_steps['classifier'].feature_importances_

# Create a DataFrame for visualization, most important feature first
importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance Analysis (Classification)')
plt.show()

importance_df.head(10)

Unnamed: 0,Feature,Importance
4,time_spend_company,0.193511
14,Departments_product_mng,0.089148
11,Departments_hr,0.084923
0,satisfaction_level,0.082363
17,Departments_technical,0.071499
5,Work_accident,0.060376
6,promotion_last_5years,0.059872
7,salary,0.058104
2,number_project,0.054223
9,Departments_RandD,0.039811


## 9. Build Regression Models and Compare Performance

In [48]:
# Function to evaluate different regression models
def evaluate_regression_models(X_train, y_train, X_test, y_test):
    """Fit each candidate regressor and report its test-set MSE and R2.

    Parameters:
        X_train, y_train: data every candidate is fitted on.
        X_test, y_test: data used for scoring.

    Returns:
        dict mapping model name -> {'MSE': float, 'R2': float}.
        Each result is also printed.
    """
    candidates = [
        ('Linear Regression', LinearRegression()),
        ('Decision Tree', DecisionTreeRegressor(random_state=42)),
        ('Random Forest', RandomForestRegressor(random_state=42)),
        ('XGBoost', XGBRegressor(random_state=42)),
        ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
        ('K-Nearest Neighbors', KNeighborsRegressor()),
    ]

    scores = {}
    for label, estimator in candidates:
        fitted = Pipeline([('model', estimator)])
        fitted.fit(X_train, y_train)
        predictions = fitted.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        scores[label] = {'MSE': mse, 'R2': r2}
        print(f'{label}: MSE = {mse:.3f}, R2 = {r2:.3f}')
    return scores

# Evaluate regression models.
# The preprocessor is fitted on the training split and then applied to the test split.
X_reg_train_prepared = preprocessor_reg.fit_transform(X_reg_train)
X_reg_test_prepared = preprocessor_reg.transform(X_reg_test)

regression_results = evaluate_regression_models(
    X_reg_train_prepared, y_reg_train, X_reg_test_prepared, y_reg_test
)

Linear Regression: MSE = 0.057, R2 = 0.064
Decision Tree: MSE = 0.054, R2 = 0.117
Random Forest: MSE = 0.032, R2 = 0.483
XGBoost: MSE = 0.034, R2 = 0.438
Gradient Boosting: MSE = 0.035, R2 = 0.423
K-Nearest Neighbors: MSE = 0.042, R2 = 0.316


## 10. Identify Best Regression Model

In [49]:
# Find the best regression model: the entry of regression_results with the highest R2
best_reg_model_name = None
best_reg_model = None
best_r2 = -float('inf')

for candidate_name, candidate_scores in regression_results.items():
    candidate_r2 = candidate_scores['R2']
    # Skip anything that does not strictly beat the current best
    if not candidate_r2 > best_r2:
        continue
    best_r2 = candidate_r2
    # Both variables hold the model's name (a string), not an estimator
    best_reg_model_name = candidate_name
    best_reg_model = candidate_name

print(f'Best Regression Model: {best_reg_model_name} (R2 Score = {best_r2:.3f})')

Best Regression Model: Random Forest (R2 Score = 0.483)


## 11. Feature Importance Analysis for Regression

In [50]:
# Fit the regression preprocessor first so the one-hot feature names read below
# come from a transformer fitted in this cell, not one left over from an earlier cell.
X_reg_train_transformed = preprocessor_reg.fit_transform(X_reg_train)

# Extract feature names from the preprocessing pipeline for regression.
# Order matches the ColumnTransformer: scaled numerics, ordinal salary, one-hot nominals.
feature_names_num_reg = X_reg.select_dtypes(include=['int64', 'float64']).columns.tolist()
feature_names_ord = ordinal_cols
feature_names_nom = preprocessor_reg.named_transformers_['nom'].get_feature_names_out(nominal_cols)

all_feature_names_reg = np.concatenate([feature_names_num_reg, feature_names_ord, feature_names_nom])

# Rebuild the winning model. Only estimators that expose `feature_importances_`
# can be analysed here, so fail early with a clear message for any other winner
# instead of hitting an AttributeError on a None estimator further down.
importance_capable_regressors = {
    'Decision Tree': lambda: DecisionTreeRegressor(random_state=42),
    'Random Forest': lambda: RandomForestRegressor(random_state=42),
    'XGBoost': lambda: XGBRegressor(random_state=42),
    'Gradient Boosting': lambda: GradientBoostingRegressor(random_state=42),
}
if best_reg_model_name not in importance_capable_regressors:
    raise ValueError(
        f"Feature importance is not available for '{best_reg_model_name}'; "
        f"supported models: {sorted(importance_capable_regressors)}"
    )
best_reg_model_instance = importance_capable_regressors[best_reg_model_name]()

pipeline_reg = Pipeline([
    ('model', best_reg_model_instance)
])

# Fit the pipeline
pipeline_reg.fit(X_reg_train_transformed, y_reg_train)

# Extract feature importance
feature_importances_reg = pipeline_reg.named_steps['model'].feature_importances_

# Create a DataFrame for visualization, most important feature first
importance_df_reg = pd.DataFrame({'Feature': all_feature_names_reg, 'Importance': feature_importances_reg})
importance_df_reg = importance_df_reg.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df_reg)
plt.title('Feature Importance Analysis (Regression)')
plt.show()

importance_df_reg.head(10)

Unnamed: 0,Feature,Importance
1,number_project,0.364958
2,average_montly_hours,0.242742
0,last_evaluation,0.170182
3,time_spend_company,0.061357
6,salary,0.033152
4,Work_accident,0.017908
14,Departments_sales,0.017897
15,Departments_support,0.014888
16,Departments_technical,0.014834
7,Departments_IT,0.010857
