<a href="https://colab.research.google.com/github/tanishiagr/CODING-NINJAS/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils import compute_class_weight
from joblib import dump, load

In [None]:
import xgboost as xgb
import shap

In [None]:
from google.colab import files
uploaded = files.upload()

Saving Employee_Performance_Retention.csv to Employee_Performance_Retention.csv


In [None]:
target_task = 'classification'    # default: classification on Attrition

In [None]:
df = pd.read_csv(io.BytesIO(uploaded['Employee_Performance_Retention.csv']))
X = df.drop(["Employee_ID", "Attrition", "Performance_Rating"], axis=1)
y_class = df["Attrition"].map({"Yes": 1, "No": 0})
y_reg = df["Performance_Rating"]

In [None]:
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

In [None]:
rf_clf = Pipeline(steps=[("pre", preprocessor),
                         ("clf", RandomForestClassifier(n_estimators=100, random_state=42))])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42, stratify=y_class)


In [None]:
t0 = time.time()
rf_clf.fit(X_train, y_train)
train_time = time.time() - t0

y_pred = rf_clf.predict(X_test)

In [None]:
print("\n--- Random Forest (Classification: Attrition) ---")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("Train time (s):", round(train_time, 3))

joblib.dump(rf_clf, "rf_clf.joblib")


--- Random Forest (Classification: Attrition) ---
Accuracy : 0.8022222222222222
Precision: 0.3076923076923077
Recall   : 0.011396011396011397
F1 Score : 0.02197802197802198
Train time (s): 1.268


['rf_clf.joblib']

In [None]:
kernels = ["linear", "poly", "rbf"]
for k in kernels:
    svm_clf = Pipeline(steps=[("pre", preprocessor),
                              ("clf", SVC(kernel=k, probability=True, random_state=42))])
    t0 = time.time()
    svm_clf.fit(X_train, y_train)
    train_time = time.time() - t0
    y_pred = svm_clf.predict(X_test)
    print(f"\n--- SVM ({k} kernel) ---")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    print("F1 Score :", f1_score(y_test, y_pred))
    print("Train time (s):", round(train_time, 3))
    joblib.dump(svm_clf, f"svm_{k}.joblib")


--- SVM (linear kernel) ---
Accuracy : 0.805
Precision: 0.0
Recall   : 0.0
F1 Score : 0.0
Train time (s): 5.059


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



--- SVM (poly kernel) ---
Accuracy : 0.805
Precision: 0.0
Recall   : 0.0
F1 Score : 0.0
Train time (s): 7.314


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



--- SVM (rbf kernel) ---
Accuracy : 0.805
Precision: 0.0
Recall   : 0.0
F1 Score : 0.0
Train time (s): 34.386


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
xgb_clf = Pipeline(steps=[("pre", preprocessor),
                          ("clf", xgb.XGBClassifier(
                              use_label_encoder=False,
                              eval_metric="logloss",
                              random_state=42
                          ))])

t0 = time.time()
xgb_clf.fit(X_train, y_train)
train_time = time.time() - t0
y_pred = xgb_clf.predict(X_test)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
print("\n--- XGBoost (Classification: Attrition) ---")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("Train time (s):", round(train_time, 3))

joblib.dump(xgb_clf, "xgb_clf.joblib")


--- XGBoost (Classification: Attrition) ---
Accuracy : 0.785
Precision: 0.2
Recall   : 0.03418803418803419
F1 Score : 0.058394160583941604
Train time (s): 0.169


['xgb_clf.joblib']

In [None]:
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

rf_reg = Pipeline(steps=[("pre", preprocessor),
                         ("reg", RandomForestRegressor(n_estimators=100, random_state=42))])
rf_reg.fit(X_train_r, y_train_r)
y_pred_r = rf_reg.predict(X_test_r)

print("\n--- Random Forest (Regression: Performance Rating) ---")
print("MSE :", mean_squared_error(y_test_r, y_pred_r))
print("RMSE:", np.sqrt(mean_squared_error(y_test_r, y_pred_r)))
print("R²  :", r2_score(y_test_r, y_pred_r))
joblib.dump(rf_reg, "rf_reg.joblib")

xgb_reg = Pipeline(steps=[("pre", preprocessor),
                          ("reg", xgb.XGBRegressor(random_state=42))])
xgb_reg.fit(X_train_r, y_train_r)
y_pred_r = xgb_reg.predict(X_test_r)


--- Random Forest (Regression: Performance Rating) ---
MSE : 1.342205388888889
RMSE: 1.158535881571602
R²  : -0.05771936131317634
