In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('processed_dataset.csv')
# Drop 'Customer Feedback' and 'Recommendation' as they are not used for prediction
df = df.drop(columns=['Customer Feedback', 'Recommendation'])

# Separate features and target variable
X = df.drop(columns=['Churn'])
y = df['Churn']

# Clean numerical columns by replacing non-numeric values with NaN and then filling with mean
numerical_cols = ['tenure in months', 'Monthly Average Balance (USD)', 'Yearly Average Balance (USD)']
X[numerical_cols] = X[numerical_cols].apply(pd.to_numeric, errors='coerce')
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())

# Standardize numerical columns
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

X = X.fillna(0)
y = y.fillna(0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predictions
y_pred_log_reg = log_reg.predict(X_test)
y_prob_log_reg = log_reg.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
precision_log_reg = precision_score(y_test, y_pred_log_reg)
recall_log_reg = recall_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg)
roc_auc_log_reg = roc_auc_score(y_test, y_prob_log_reg)

print("Logistic Regression Metrics:")
print(f"Accuracy: {accuracy_log_reg}")
print(f"Precision: {precision_log_reg}")
print(f"Recall: {recall_log_reg}")
print(f"F1-Score: {f1_log_reg}")
print(f"ROC-AUC: {roc_auc_log_reg}")


Logistic Regression Metrics:
Accuracy: 0.75
Precision: 0.5588235294117647
Recall: 0.35185185185185186
F1-Score: 0.4318181818181818
ROC-AUC: 0.8205225773718925


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Train random forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_prob_rf)

print("Random Forest Metrics:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1-Score: {f1_rf}")
print(f"ROC-AUC: {roc_auc_rf}")


Random Forest Metrics:
Accuracy: 0.79
Precision: 0.6666666666666666
Recall: 0.4444444444444444
F1-Score: 0.5333333333333333
ROC-AUC: 0.837138508371385


In [6]:
importances = rf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df.head(10))


                          Feature  Importance
16  Monthly Average Balance (USD)    0.176767
17   Yearly Average Balance (USD)    0.166616
4                tenure in months    0.161167
18                       Category    0.100362
14             Interest Deposited    0.096229
7                    Loan Account    0.032600
11            TechSupport Availed    0.028145
0                          Gender    0.025429
15              Paperless Banking    0.023801
6                    Credit Cards    0.021768
