In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): csv_path in the loading cell points under /content/, not under
# the mounted Drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Location of the raw attrition CSV inside the Colab runtime. Edit this if the
# file lives somewhere else (for example under the mounted Drive).
csv_path = '/content/Employee-Attrition - Employee-Attrition.csv'

# Load the dataset. On failure a readable hint is printed and the exception is
# re-raised, so the notebook stops at the real cause instead of failing in a
# later cell with a NameError on `df`.
try:
  df = pd.read_csv(csv_path)
except FileNotFoundError:
  print(f"Error: The file was not found at {csv_path}. Please check the path and try again.")
  raise
except Exception as e:
  print(f"An error occurred while loading the CSV: {e}")
  raise
else:
  # Only reached when the read succeeded.
  print("CSV loaded successfully:")
  display(df.head())

In [None]:
# Report the (rows, columns) shape of the loaded frame, then preview its first rows.
print("Initial shape:", df.shape)
df.head()


In [None]:
# Count missing entries per column (isna is the same check as isnull).
print("missing values:\n", df.isna().sum())

In [None]:
# Count rows that are exact duplicates of an earlier row.
print("\n duplicate values:", df.duplicated().sum())

In [None]:
# Trim leading/trailing whitespace from every object-dtype (string) column,
# one column at a time.
df_obj = df.select_dtypes(include='object')
for _col in df_obj.columns:
    df[_col] = df_obj[_col].str.strip()

In [None]:
# List the column names as they appear in the raw CSV (before renaming).
print(df.columns)

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Encode the Attrition target as 1 ('Yes') / 0 ('No').
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: with a plain
# {'Yes': 1, 'No': 0} mapping, running the cell a second time would turn the
# already-encoded column into all-NaN.
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Series.map yields NaN for any value outside the mapping; fail loudly here
# rather than carrying missing labels into modelling.
assert df['Attrition'].notna().all(), "Unexpected values found in 'Attrition'"

In [None]:
# Encode OverTime as 1 ('Yes') / 0 ('No').
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: with a plain
# {'Yes': 1, 'No': 0} mapping, running the cell a second time would turn the
# already-encoded column into all-NaN.
df['OverTime'] = df['OverTime'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Series.map yields NaN for any value outside the mapping; fail loudly here
# rather than silently creating missing values.
assert df['OverTime'].notna().all(), "Unexpected values found in 'OverTime'"


In [None]:

# Drop columns that are not used for modelling.
# At this point the columns still carry the raw CSV casing (lower-casing happens
# in a later cell), so a lowercase entry such as 'monthlyrate' would never match
# and errors='ignore' would hide the miss. The names are therefore matched
# case-insensitively, which also keeps the cell safe to re-run after the rename.
drop_cols = ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber',
             'MonthlyRate', 'HourlyRate', 'PerformanceRating']
_drop_lower = {name.lower() for name in drop_cols}
df = df.drop(columns=[col for col in df.columns if col.strip().lower() in _drop_lower])

In [None]:

# Normalise column names: trim whitespace, lower-case, and turn inner spaces
# into underscores.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]

In [None]:
# Confirm the normalised (lower-case) column names.
print(df.columns)

In [None]:

# Persist the cleaned dataset to the current working directory.

# index=False keeps the DataFrame index out of the file, so no extra column
# is written.
df.to_csv('cleaned_employee_attrition.csv', index=False)

In [None]:

# Hand the cleaned CSV written above to the Colab file-download helper.
from google.colab import files
files.download('cleaned_employee_attrition.csv')

In [None]:

# Absolute count of each encoded attrition class.
print(df['attrition'].value_counts())

In [None]:

# (rows, columns) after cleaning.
df.shape


Exploratory Data Analysis (EDA)

In [None]:
# Data structure: column names, non-null counts and dtypes.
df.info()


In [None]:
# Summary statistics (describe() covers the numeric columns by default).
df.describe()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the two attrition classes, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x='attrition', data=df, ax=ax)
ax.set_title("Attrition count (0= stay, 1=left)")
plt.show()

# Class balance expressed as fractions of all rows.
print(df['attrition'].value_counts(normalize=True))

In [None]:
# Pairwise correlation of the numeric columns, rendered as an annotated heatmap.
fig, ax = plt.subplots(figsize=(14, 10))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:

# Attrition counts split by job role; x labels are rotated for readability.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='jobrole', hue='attrition', data=df, ax=ax)
ax.set_title("Attrition by Job Role")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:

# Distribution of job level within each attrition class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='attrition', y='joblevel', data=df, ax=ax)
ax.set_title("joblevel vs Attrition")
plt.show()


In [None]:
#Create tenure category
def tenure_band(x):
    """Bucket a tenure in years into a labelled band.

    Parameters
    ----------
    x : number
        Years at the company.

    Returns
    -------
    str or float
        '0-2 yrs' for x < 3, '3-5 yrs' for x < 6, '6-9 yrs' for x < 10 and
        '10+ yrs' otherwise. A missing value (NaN/None) is returned as NaN.
    """
    # Every `<` comparison with NaN is False, so without this guard a missing
    # tenure would fall through to the final branch and be labelled '10+ yrs'.
    if pd.isna(x):
        return float('nan')
    if x < 3:
        return '0-2 yrs'
    elif x < 6:
        return '3-5 yrs'
    elif x < 10:
        return '6-9 yrs'
    else:
        return '10+ yrs'

# Label every employee's years-at-company with its tenure band (element-wise).
df['tenurecategory'] = df['yearsatcompany'].map(tenure_band)


#Engagement Score
# Unweighted average of the five survey-style columns. skipna=False keeps a
# missing component propagating to the score, as plain `+` would.
engagement_cols = [
    'jobsatisfaction',
    'environmentsatisfaction',
    'relationshipsatisfaction',
    'worklifebalance',
    'jobinvolvement',
]
df['engagementscore'] = df[engagement_cols].sum(axis=1, skipna=False) / 5


#Overtime Stress Feature
import numpy as np

# 'overtime' was encoded to 1/0 earlier in the notebook, so comparing it with
# the string 'Yes' never matches and the feature would be 0 for every row.
# Accepting both spellings keeps the cell correct whichever encoding is present.
works_overtime = df['overtime'].isin([1, 'Yes'])

# Overtime workers get (5 - job satisfaction); everyone else gets 0.
df['overtimestress'] = np.where(works_overtime, 5 - df['jobsatisfaction'], 0)


#Income per Year / Experience Ratio
import numpy as np

# Zero working years become NaN so the division cannot produce inf.
years_worked = df['totalworkingyears'].replace(0, np.nan)
annual_income = df['monthlyincome'] * 12
income_ratio = annual_income / years_worked

# Rows without a valid ratio fall back to the median of the valid ones.
# NOTE(review): the median is taken over the whole frame, before any train/test
# split — confirm this is acceptable.
df['incomeperyearworked'] = income_ratio
df['incomeperyearworked'] = df['incomeperyearworked'].fillna(income_ratio.median())


# Promotion Gap Risk

# Years since the last promotion relative to tenure; the +1 in the denominator
# avoids division by zero for employees with 0 years at the company.
tenure_plus_one = df['yearsatcompany'].add(1)
df['promotiongap'] = df['yearssincelastpromotion'].div(tenure_plus_one)

Categorical Encoding

In [None]:
# Column names after feature engineering.
print(df.columns)


In [None]:
# Dtypes after feature engineering; object columns still need encoding.
print(df.dtypes)

One-Hot Encoding (for remaining categorical columns)



In [None]:
# One-hot encode the remaining categorical columns. drop_first=True omits the
# first level of each column.
categorical_cols = [
    'department',
    'jobrole',
    'businesstravel',
    'maritalstatus',
    'educationfield',
    'tenurecategory',
    'gender',
]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:

# Shape and first rows after one-hot encoding (head() defaults to 5 rows).
print("After encoding:", df.shape)
print(df.head())

In [None]:

# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:

# Cast every boolean column to int.
bool_columns = df.select_dtypes('bool').columns
df = df.astype(dict.fromkeys(bool_columns, int))


# Re-check the dtype tally after the cast.
df.dtypes.value_counts()

In [None]:
import os, joblib, pickle, json, pandas as pd, numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve

# Make sure the export directory exists (no-op if it already does).
export_dir = "export"
os.makedirs(export_dir, exist_ok=True)

# Save the final cleaned dataset before splitting
final_csv_path = os.path.join(export_dir, "cleaned_employee_attrition_final.csv")
df.to_csv(final_csv_path, index=False)

# Report the real output location, including the export directory, so the
# message matches where the file was written.
print(f"✅ Cleaned dataset saved as '{final_csv_path}'")
print("Shape:", df.shape)

Train/Test Split and Model Training

In [None]:
# ---- Train / Test Split ----
from sklearn.model_selection import train_test_split

# Target is the encoded attrition flag; every other column is a feature.
y = df['attrition']
X = df.drop(columns='attrition')

# 80/20 split with a fixed seed, stratified on y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# ---- Random Forest Model Training ----
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Hyper-parameters gathered in one place; class 1 is weighted 5x relative to
# class 0.
rf_params = {
    'n_estimators': 500,
    'max_depth': None,
    'min_samples_split': 2,
    'class_weight': {0: 1, 1: 5},
    'random_state': 42,
    'n_jobs': -1,
}

# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# -------------------------------
# Model Evaluation
# -------------------------------
# Second column of predict_proba is the score for class 1.
y_prob = rf.predict_proba(X_test)[:, 1]

# Labels come from a manual cut-off on the class-1 probability. No separate
# rf.predict() call is made: its result would be overwritten immediately and
# only cost an extra inference pass.
# NOTE(review): the cut-off is applied and scored on the same test split —
# confirm how 0.35 was chosen.
threshold = 0.35
y_pred = (y_prob >= threshold).astype(int)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("✅ ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, auc

# Confusion Matrix Heatmap (rows = actual class, columns = predicted class)
fig, ax = plt.subplots(figsize=(5, 4))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# ROC Curve, with the diagonal drawn as the chance-level reference.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC-AUC = {roc_auc_score(y_test, y_prob):.3f}")
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

# Feature Importance
import pandas as pd

# Fifteen largest importances of the fitted forest, labelled by feature name.
all_importances = pd.Series(rf.feature_importances_, index=X.columns)
fi = all_importances.sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=fi, y=fi.index, palette="viridis", ax=ax)
ax.set_title("Top 15 Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()


In [None]:
import os, joblib, pickle, json, pandas as pd, numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve

# Make sure the export directory exists (no-op if it already does).
os.makedirs("export", exist_ok=True)

# --- Report the F1-optimal threshold (informational only) ---
# precision_recall_curve returns one more precision/recall point than it does
# thresholds, so the last F1 value is excluded before taking the argmax.
prec, rec, th = precision_recall_curve(y_test, y_prob)
f1s = 2*prec*rec/(prec+rec+1e-9)  # small epsilon avoids 0/0
best_idx = np.argmax(f1s[:-1])
f1_optimal_threshold = float(th[best_idx])
print(f"🔹 Best F1 threshold = {f1_optimal_threshold:.3f}")

# The exported threshold is the manual cut-off that produced y_pred in the
# evaluation cell. Reusing `threshold` instead of repeating the literal keeps
# the saved metadata in step with the reported metrics and confusion matrix.
best_threshold = threshold

# --- Save Model ---
joblib.dump(rf, "export/random_forest_model.joblib")

# --- Save Feature Columns ---
# Column order of the feature frame, stored as a plain list.
feature_names = list(X.columns)
with open("export/feature_columns.pkl", "wb") as columns_file:
    pickle.dump(feature_names, columns_file)

# --- Save Metadata ---
meta = {
    "threshold": best_threshold,
    "auc": roc_auc_score(y_test, y_prob),
    "n_features": len(feature_names),
}
with open("export/metadata.json", "w") as meta_file:
    meta_file.write(json.dumps(meta, indent=2))

# --- Optional: Save Feature Importance Plot ---
# `fi` keeps the full ranking; only the top 15 are drawn.
fi = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
top15 = fi.head(15)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top15, y=top15.index, palette="viridis", ax=ax)
ax.set_title("Top 15 Feature Importances")
fig.tight_layout()
fig.savefig("export/feature_importance.png", dpi=160)
plt.close(fig)  # written to disk only; nothing is shown inline

# --- Optional: Save Confusion Matrix ---
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
fig.savefig("export/confusion_matrix.png", dpi=160)
plt.close(fig)  # written to disk only; nothing is shown inline

# Summarise the artifacts written by this cell. The directory is printed as the
# relative 'export/' (what os.makedirs created), not a root-level '/export/'.
print("\n✅ Model and artifacts saved successfully in 'export/' folder:")
for artifact_name in (
    "random_forest_model.joblib",
    "feature_columns.pkl",
    "metadata.json",
    "feature_importance.png",
    "confusion_matrix.png",
):
    print(f" - {artifact_name}")


In [None]:
import pandas as pd
import numpy as np

# Rank the features of the trained forest `rf` by importance, highest first.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Show top features
top_features = importances.iloc[:15]
print(top_features)


In [None]:
important_features = top_features.index.tolist()
X = df[important_features + ['attrition']]  # keep target for EDA


In [None]:

X.to_csv("cleaned_employee_attrition_top_features.csv", index=False)