In [None]:
# --- All imports consolidated here ---
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import tensorflow as tf
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# ...existing code...

In [None]:
# Read the HR employee-attrition CSV (path is relative to the notebook's
# working directory) into the raw dataframe used by every later cell.
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Show the dataframe's structure: column names, non-null counts, dtypes and
# memory usage (this is not a statistical summary; see describe() for that).
df.info()

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the dataframe.
df.describe()

## Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA) is a crucial step in understanding the dataset before building any machine learning model. It helps to identify patterns, spot anomalies, and check assumptions with the help of summary statistics and graphical representations.

In [None]:
# Tally the nulls in every column and express each tally as a share of all rows.
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.concat(
    [missing_values.rename('Missing Values'), missing_percentage.rename('Percentage')],
    axis=1,
)
# Report only the columns that actually contain missing data.
has_missing = missing_df['Missing Values'] > 0
print(missing_df.loc[has_missing])

In [None]:
# Outlier detection with the 1.5 * IQR rule: a value is counted as an outlier
# when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

outlier_summary = {}
for col in numerical_cols:
    column_values = df[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    below_fence = column_values < lower
    above_fence = column_values > upper
    outliers = df.loc[below_fence | above_fence]
    outlier_summary[col] = len(outliers)

# One row per numeric column, most outlier-heavy columns first.
outlier_df = pd.Series(outlier_summary, name='Outlier Count', dtype='int64').to_frame()
print(outlier_df.sort_values(by='Outlier Count', ascending=False))

In [None]:
# Draw one compact horizontal boxplot per numeric column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(5, 1.5))
    ax.boxplot(df[col], vert=False)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()


In [None]:
# Bar chart of how many rows fall into each class of the target 'Attrition'.
attrition_counts = df['Attrition'].value_counts()
ax = attrition_counts.plot.bar(color=['skyblue', 'salmon'])
ax.set_title("Attrition Distribution")
ax.set_xlabel("Attrition")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Frequency bar chart for a hand-picked set of categorical columns.
categorical_cols = ['Department', 'JobRole', 'MaritalStatus']
for col in categorical_cols:
    ax = df[col].value_counts().plot.bar()
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()


In [None]:
# Histogram (60 bins) for a hand-picked set of numeric columns.
# Note: this temporarily narrows `numerical_cols` to three columns.
numerical_cols = ['Age', 'MonthlyIncome', 'DistanceFromHome']
for col in numerical_cols:
    fig, ax = plt.subplots()
    ax.hist(df[col], bins=60, color='lightblue', edgecolor='black')
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()


In [None]:
# Rebuild the column groups from dtypes. 'Attrition' is the target, so it is
# removed from the categorical (object-dtype) feature list.
object_columns = df.select_dtypes(include='object').columns
categorical_cols = object_columns.drop('Attrition')
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns


In [None]:
# Count plot of every categorical feature, split by the target class.
sns.set(style="whitegrid")
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, hue='Attrition', ax=ax)
    ax.set_title(f'{col} vs Attrition')
    # Tilt the category labels so long names do not overlap.
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()


From the plots above we can see that, in each of the following groups, the majority of employees did not leave: the OverTime category "No"; the Over18 group;
the MaritalStatus values "Single" and "Married"; the JobRole "Sales Executive"; the Department "Sales"; the 30–40 age group; a DistanceFromHome of 1–5 km; and a MonthlyIncome of 5000–10000.

In [None]:
# Overlay one filled KDE curve per target class for every numeric column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    for value in df['Attrition'].unique():
        class_values = df.loc[df['Attrition'] == value, col]
        sns.kdeplot(class_values, label=f"Attrition: {value}", fill=True, ax=ax)
    ax.set_title(f'Distribution of {col} by Attrition')
    ax.set_xlabel(col)
    ax.legend()
    fig.tight_layout()
    plt.show()


A KDE (Kernel Density Estimate) plot shows the probability distribution of a continuous variable. When used with a target such as Attrition, it helps compare how a feature's values are distributed across the classes, highlighting patterns or separations. Column by column, the classes are not well separated, so a combination of features may be needed for prediction.


In [None]:
# Pairwise correlation of the numeric columns of the raw dataframe, shown as
# a colour-coded matrix with the column names on both axes.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
plt.matshow(corr, fignum=1)  # fignum=1 draws into the figure created above
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)
plt.colorbar()
plt.title("Correlation Matrix", pad=100)
plt.show()

## Data Preprocessing

In [None]:
# Outlier replacement: a value is treated as an outlier only when BOTH the
# Z-score rule (|z| > 3) and the 1.5 * IQR rule flag it. Flagged values are
# overwritten with the column mean.
# Work on a copy so the raw dataframe `df` stays untouched.
df_cleaned = df.copy()

for col in numerical_cols:
    # Z-scores of the column (NaNs are ignored when computing mean/std).
    z_scores = zscore(df_cleaned[col], nan_policy='omit')

    # IQR fences.
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Outlier conditions for each method.
    z_outliers = (z_scores > 3) | (z_scores < -3)
    iqr_outliers = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)

    # Final condition: flagged by both methods.
    outlier_mask = z_outliers & iqr_outliers
    if not outlier_mask.any():
        # Nothing to replace; leave the column (and its dtype) unchanged.
        continue

    # The mean is computed over the whole column, outliers included.
    mean_value = df_cleaned[col].mean()

    # The mean is a float. Writing it into an integer column relies on silent
    # upcasting, which pandas has deprecated, so cast the column explicitly
    # before assigning.
    df_cleaned[col] = df_cleaned[col].astype('float64')
    df_cleaned.loc[outlier_mask, col] = mean_value

In [None]:
# Label encoding: most models cannot consume string categories, so every
# object-dtype column is mapped to integer codes. (Some boosting libraries
# can handle categorical data directly, but the other models used here cannot.)

# Work on a copy so `df_cleaned` keeps the original string categories.
df_encoded = df_cleaned.copy()

# All object-dtype columns are encoded. NOTE: this includes the target
# 'Attrition', since it is not dropped here.
categorical_cols = df_encoded.select_dtypes(include='object').columns

# Keep one fitted encoder per column so each mapping can later be inspected,
# inverted (inverse_transform) or applied to new data. Re-fitting a single
# shared encoder would retain only the last column's mapping.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le


In [None]:
# Check the encoded frame: after label encoding no object-dtype columns
# should remain.
df_encoded.info()

In [None]:

# Annotated heatmap of `corr`, the correlation matrix computed earlier from
# the numeric columns of the raw dataframe.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=ax)
ax.set_title("Correlation Matrix with Annotations")
ax.tick_params(axis='x', labelrotation=45)
ax.tick_params(axis='y', labelrotation=0)
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlation of every column of the encoded frame, which now also
# covers the label-encoded categorical columns and the target.
df_encoded.corr()

In [None]:
# Interactive correlation heatmap with plotly.
# DataFrame.corr() computes the pairwise correlation of the columns
# (excluding NA/null values) and returns a square dataframe of coefficients.
encoded_corr = df_encoded.corr()

heatmap_options = dict(
    text_auto=True,  # show values on cells
    color_continuous_scale='RdBu_r',
    title="Correlation Matrix (Hover for Values)",
    labels=dict(color="Correlation"),
)
fig = px.imshow(encoded_corr, **heatmap_options)

fig.update_layout(width=800, height=700)
fig.show()


As the correlation plot above shows, many features are correlated with one another, but no feature has a correlation of at least ±0.5 with the target, 'Attrition'. The features are therefore not strongly correlated with the target, which suggests that the model may need to rely on complex interactions between features to predict attrition effectively. Also, YearsAtCompany and YearsInCurrentRole have a correlation above 0.7, so they convey largely the same information; dropping either one would reduce dimensionality without losing much information.

## Selecting the top 20 features using RFE with a RandomForestClassifier, to limit overfitting and computational complexity

In [None]:

# Define independent variables (X) and target (y) on the full encoded frame.
X = df_encoded.drop('Attrition', axis=1)
y = df_encoded['Attrition']

# Feature selection must not see the rows that are later held out for testing,
# otherwise the test metrics are optimistically biased (data leakage). The
# split arguments mirror the train/test split used later in this notebook
# (test_size=0.2, random_state=42, stratify=y), so RFE is fitted on exactly
# the rows that end up in the training set.
X_fs_train, _, y_fs_train, _ = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize model (Random Forest used here, you can try LogisticRegression, etc.)
model = RandomForestClassifier(random_state=42)

# Recursive Feature Elimination: repeatedly fit the estimator and discard the
# least important feature until 20 remain.
rfe = RFE(estimator=model, n_features_to_select=20)
rfe.fit(X_fs_train, y_fs_train)

# rfe.support_ is a boolean mask over X's columns marking the kept features.
selected_features = X.columns[rfe.support_]
print("Top 20 Selected Features:")
print(selected_features)


In [None]:
# Keep only the RFE-selected feature columns (all rows) for modelling.
# Note: lowercase `x` is the reduced frame, uppercase `X` is the full one.
x = df_encoded.loc[:, selected_features]


In [None]:
# Both distance-based and tree-based algorithms are used for the classification task, so the data is scaled. Tree-based algorithms do not require scaling, but applying it to them does no harm.

In [None]:

# Outliers were already replaced above, so StandardScaler is used here.
# RobustScaler would be the alternative if outliers were still present.
scaler = StandardScaler()

# Learn the scaling statistics (mean / std) from the training rows only, so
# that the held-out test rows do not influence preprocessing (data leakage).
# The split arguments mirror the train/test split used later in this notebook
# (test_size=0.2, random_state=42, stratify=y), so the same rows are selected.
row_positions = np.arange(len(x))
train_positions, _ = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)
scaler.fit(x.iloc[train_positions])

# Transform ALL rows with the train-fitted scaler. `df_scaled` is a NumPy
# array with the same row order as `x`, ready for the later split.
df_scaled = scaler.transform(x)


In [None]:

# Split BEFORE resampling so that no synthetic sample is derived from, or
# cleaned against, a test row (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(df_scaled, y, test_size=0.2, random_state=42, stratify=y)

# SMOTETomek = SMOTE over-sampling followed by Tomek-link cleaning.
# sampling_strategy="auto" is the library default: it resamples the minority
# class up to the size of the majority class (it is NOT a 50% ratio; pass
# sampling_strategy=0.5 to make the minority half the size of the majority).
smt = SMOTETomek(random_state=42, sampling_strategy="auto")

# Fit and resample the training data only; the test split is left untouched.
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)

# Check the class distribution before and after balancing.
print("Original:", Counter(y_train))
print("Resampled:", Counter(y_resampled))


In [None]:
# Candidate classifiers and their hyper-parameter grids. Grid keys carry the
# 'classifier__' prefix because each estimator is the 'classifier' step of a
# pipeline. Stochastic estimators are seeded so that a fresh
# "Restart & Run All" reproduces the same results.
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000, class_weight='balanced'), {
        'classifier__C': [0.1, 1, 10]
    }),
    "Random Forest": (RandomForestClassifier(class_weight='balanced', random_state=42), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 10]
    }),
    "SVM": (SVC(probability=True, random_state=42), {
        'classifier__C': [0.1, 1],
        'classifier__kernel': ['linear', 'rbf']
    }),
    "Naive Bayes": (GaussianNB(), {
        # No hyperparameters to tune for basic GaussianNB
    }),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), {
        'classifier__max_depth': [None, 10, 20]
    }),
    "KNN": (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7]
    }),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 6]
    })
}

best_model_overall = None
best_model_name = None
best_model_accuracy = 0.0  # Start with 0 accuracy

results = {}

for name, (model, param_grid) in models.items():
    # The models were previously fitted on the raw, imbalanced X_train, so the
    # SMOTETomek resampling was never applied. Resampling is now a pipeline
    # step: imblearn's Pipeline runs the sampler during fit only (never during
    # predict), and inside GridSearchCV it is re-fitted on each training fold,
    # so validation folds contain no synthetic samples.
    pipe = ImbPipeline([
        ('sampler', SMOTETomek(random_state=42, sampling_strategy="auto")),
        ('classifier', model)
    ])

    if param_grid:
        # 5-fold grid search; the best configuration is refitted on all of X_train.
        grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        # An empty grid means there is nothing to tune: fit the defaults directly.
        pipe.fit(X_train, y_train)
        best_model = pipe
        best_params = "Default"

    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Save result
    results[name] = {
        'accuracy': acc,
        'best_params': best_params
    }

    print(f"{name}: Accuracy = {acc:.4f}, Best Params = {best_params}")

    # Track the best model.
    # NOTE(review): the winner is chosen by TEST accuracy, so the reported
    # accuracy of the saved model is an optimistic estimate.
    if acc > best_model_accuracy:
        best_model_accuracy = acc
        best_model_overall = best_model
        best_model_name = name

# === Save the best model ===
print(f"\nSaving best model: {best_model_name} with accuracy {best_model_accuracy:.4f}")
joblib.dump(best_model_overall, 'best_model.pkl')

In [None]:
# One row per model (dict keys become the index), best accuracy first.
results_df = (
    pd.DataFrame(results)
    .T
    .sort_values(by='accuracy', ascending=False)
)
results_df

In [None]:

# --- To load the model later ---
# Reads back the estimator that was written to 'best_model.pkl' with
# joblib.dump. joblib files are pickle-based: only load files you trust.
loaded_model = joblib.load('best_model.pkl')

In [None]:

# Score the reloaded model on the held-out test split.
y_pred = loaded_model.predict(X_test)

# --- Per-class precision / recall / F1 ---
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

# --- Confusion matrix, labelled with the model's own class order ---
cm = confusion_matrix(y_test, y_pred)
class_labels = loaded_model.classes_
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.grid(False)  # drop the seaborn whitegrid lines behind the matrix
plt.show()

In [None]:
# For imbalanced data, accuracy alone is misleading; precision, recall and F1-score are more informative.
# These metrics give a better picture of how the model performs on the minority class.
# The best model is therefore selected and saved again below, this time using the recall of class 1 (Attrition = Yes) as the metric.

In [None]:
# Re-run model selection, this time optimising and ranking by recall of the
# positive class (Attrition = Yes, encoded as 1).
# Start below any possible recall so that a model is always selected, even if
# every candidate scores 0.0 (otherwise `None` would be saved and the reload
# at the end of this cell would fail on predict).
best_recall = -1.0
best_model_name = None
best_model_obj = None
results = {}

for name, (model, param_grid) in models.items():
    # The models were previously fitted on the raw, imbalanced X_train, so the
    # SMOTETomek resampling was never applied. As a pipeline step the sampler
    # runs during fit only (never during predict) and is re-fitted on each CV
    # training fold, keeping the validation folds free of synthetic samples.
    pipe = ImbPipeline([
        ('sampler', SMOTETomek(random_state=42, sampling_strategy="auto")),
        ('classifier', model)
    ])

    if param_grid:
        # 5-fold grid search scored by recall; refit=True retrains the best
        # configuration on all of X_train.
        grid = GridSearchCV(pipe, param_grid, cv=5, scoring='recall', refit=True)
        grid.fit(X_train, y_train)
        best_estimator = grid.best_estimator_
        best_params = grid.best_params_
    else:
        # An empty grid means there is nothing to tune: fit the defaults directly.
        pipe.fit(X_train, y_train)
        best_estimator = pipe
        best_params = "Default"

    y_pred = best_estimator.predict(X_test)

    # Get recall for class 1
    recall_class1 = recall_score(y_test, y_pred, pos_label=1)  # or pos_label='Yes' if labels are strings
    acc = accuracy_score(y_test, y_pred)

    results[name] = {
        'accuracy': acc,
        'recall_class1': recall_class1,
        'best_params': best_params
    }

    print(f"\n{name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Recall (Attrition = Yes): {recall_class1:.4f}")
    print(f"Best Params: {best_params}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"{name} Confusion Matrix")
    plt.show()

    # Keep the model with the highest recall on class 1.
    # NOTE(review): the winner is chosen on the TEST split, so its reported
    # recall is an optimistic estimate.
    if recall_class1 > best_recall:
        best_recall = recall_class1
        best_model_name = name
        best_model_obj = best_estimator

# === Save the best model ===
# This overwrites the 'best_model.pkl' written by the accuracy-based selection.
joblib.dump(best_model_obj, 'best_model.pkl')
print(f"\nBest Model Saved: {best_model_name} with Recall (Attrition = Yes) = {best_recall:.4f}")

# Round-trip check: reload the file and confirm it predicts on the test split.
loaded_model = joblib.load('best_model.pkl')
y_pred_loaded = loaded_model.predict(X_test)
print("Loaded Model Classification Report:")
print(classification_report(y_test, y_pred_loaded))


# Conclusion: Attrition Prediction Using Imbalanced Classification Techniques

In [None]:

# # In this project, we tackled the challenge of predicting attrition using imbalanced classification models. Whether it’s employee attrition or customer churn, the cost of missing true positives (i.e., not identifying someone who is likely to leave) is high. This makes recall a far more important metric than accuracy alone.

# # 🔍 Summary of Work:
# # We trained several models including Logistic Regression, Random Forest, and others.

# # Applied SMOTETomek to handle severe class imbalance.

# # Evaluated models using Confusion Matrix, Classification Report, and business-critical metrics like Recall, Precision, and F1-score.

# # Best-performing model was saved using joblib for future use.

# # 📉 Why Accuracy Was Misleading:
# # Although some models showed up to 85% accuracy, the true positives were very low, indicating that many churned individuals were misclassified as staying. This reinforced that:

# # High accuracy ≠ Good model in imbalanced classification.

# # ✅ Best Model Observations:
# # Logistic Regression gave the best recall score for the attrition class.

# # However, it still had many false negatives, which can be costly in real-world business scenarios.

# # Simply applying class balancing is not always sufficient—especially when the data is not easily separable.

# # 📌 What to Focus on in Real-World Attrition Use Cases:
# # Business Context Matters:

# # In attrition prediction, recall for the positive class (Attrition = Yes) is often more important than precision.

# # It's better to mistakenly target a non-churner than to miss a real churn risk.

# # Advanced Techniques to Improve Results:

# # Feature Engineering: Derive new features that better capture behavior leading to attrition.

# # Cost-sensitive learning: Penalize false negatives more heavily during training.

# # Model Ensemble & Threshold Tuning: Tune the decision threshold based on cost-benefit analysis or try ensemble models with different voting schemes.

# # More Data: Sometimes data quality or quantity is the limiting factor—request more samples if possible.

# # Start With EDA, Always:

# # Understand feature distributions, patterns, and imbalances.

# # Identify potential data leaks, outliers, and missing values before modeling.

# # 📦 Final Note
# # Attrition prediction is more than just a machine learning task—it’s a strategic business problem. A well-tuned model can help proactively retain employees or customers, saving significant costs and improving long-term relationships.

# # Always align model performance with business KPIs, not just technical metrics.

# If this were customer data, I would have requested more data and used more advanced techniques such as feature engineering, cost-sensitive learning, and ensemble methods to improve the model's performance.

In [None]:
# # 🙌 All the Best!
# Wishing you success in your data science journey.
# Stay curious, keep building, and enjoy the ride!

# # Happy Coding & May Your Models Always Converge! 🚀🤖



In [None]:
pip freeze > requirements.txt