In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load datasets
train_df= pd.read_csv('./data/train.csv')
test_df= pd.read_csv('./data/test.csv')

# Display all columns
pd.set_option('display.max_columns', None)

# One-hot encoding or label encoding
train_encoded = pd.get_dummies(train_df, drop_first=True)
test_encoded = pd.get_dummies(test_df, drop_first=True)

# create target column
train_encoded['Attrition_Yes'] = 1 - train_encoded['Attrition_Stayed']
test_encoded['Attrition_Yes'] = 1 - test_encoded['Attrition_Stayed']
train_encoded = train_encoded.drop('Attrition_Stayed', axis=1)
test_encoded = test_encoded.drop('Attrition_Stayed', axis=1)

X_train = train_encoded.drop('Attrition_Yes', axis=1)
y_train = train_encoded['Attrition_Yes']

X_test = test_encoded.drop('Attrition_Yes', axis=1)
y_test = test_encoded['Attrition_Yes']

X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))
print(confusion_matrix(y_test, y_pred_log))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      7868
           1       0.73      0.76      0.74      7032

    accuracy                           0.75     14900
   macro avg       0.75      0.75      0.75     14900
weighted avg       0.75      0.75      0.75     14900

[[5902 1966]
 [1716 5316]]


In [3]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)  # No scaling needed for trees
y_pred_rf = rf_model.predict(X_test)

print("\nRandom Forest Results:")
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Results:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      7868
           1       0.74      0.72      0.73      7032

    accuracy                           0.75     14900
   macro avg       0.75      0.75      0.75     14900
weighted avg       0.75      0.75      0.75     14900

[[6101 1767]
 [1974 5058]]


In [5]:
from xgboost import XGBClassifier

# Train XGBoost
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predict
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Evaluate
print("XGBoost Results:")
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

XGBoost Results:
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      7868
           1       0.73      0.74      0.74      7032

    accuracy                           0.75     14900
   macro avg       0.75      0.75      0.75     14900
weighted avg       0.75      0.75      0.75     14900

[[5978 1890]
 [1843 5189]]
