In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Read the train and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# Remove the column cap so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)

# One-hot encode the categorical columns. The dummy layout -- including which
# level of each feature drop_first removes as the baseline -- is learned from
# the training set only.
train_encoded = pd.get_dummies(train_df, drop_first=True)

# Encode the test set with every level kept, then project it onto the training
# layout. Running get_dummies(..., drop_first=True) on the test set on its own
# picks the baseline from whichever levels happen to occur in test, so a level
# could silently end up encoded as the training baseline.
# After the reindex: columns for the training baseline and for test-only levels
# are dropped (those rows get all-zero dummies), and columns for levels that
# never occur in test are filled with 0.
test_dummies = pd.get_dummies(test_df)
if not any(str(col).startswith('Attrition_') for col in test_dummies.columns):
    # Without this check the reindex below would fabricate an all-zero
    # 'Attrition_Stayed' column and every test row would get target 1.
    raise KeyError("test set has no encoded 'Attrition' column")
test_encoded = test_dummies.reindex(columns=train_encoded.columns, fill_value=0)

# Build the binary target (1 = did not stay, 0 = stayed) from the 'Stayed'
# indicator; pop() removes the indicator so it cannot leak into the features.
train_encoded['Attrition_Yes'] = 1 - train_encoded.pop('Attrition_Stayed')
test_encoded['Attrition_Yes'] = 1 - test_encoded.pop('Attrition_Stayed')

# Split each frame into feature matrix and target vector.
X_train = train_encoded.drop(columns='Attrition_Yes')
y_train = train_encoded['Attrition_Yes']

X_test = test_encoded.drop(columns='Attrition_Yes')
y_test = test_encoded['Attrition_Yes']

# Both frames went through the same column operations, so they must line up.
assert X_test.columns.equals(X_train.columns), "train/test feature columns differ"

# Standardise the features. The scaler is fitted on the training matrix only
# and the same fitted scaler is then applied to both matrices.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression with balanced class weights on the scaled
# training features (fit() returns the fitted estimator itself).
log_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)

# Class predictions for the scaled test features.
y_pred = log_model.predict(X_test_scaled)

# Print a header, the per-class precision/recall/F1 report, and the confusion
# matrix (rows: true class, columns: predicted class), one item per line.
print(
    "Logistic Regression Results:",
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      7868
           1       0.73      0.76      0.74      7032

    accuracy                           0.75     14900
   macro avg       0.75      0.75      0.75     14900
weighted avg       0.75      0.75      0.75     14900

[[5902 1966]
 [1716 5316]]
