In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import load_iris
from sklearn.svm import SVC

# Load the dataset
data = pd.read_csv('/content/BankChurners.csv')

# Drop irrelevant columns like customer IDs
data.drop(columns=["CLIENTNUM"], inplace=True)
data = data[data.columns[:-2]]

# Label encode the target variable
label_encoder = LabelEncoder()
data['Attrition_Flag'] = label_encoder.fit_transform(data['Attrition_Flag'])

# One-hot encode the other categorical variables
data_encoded = pd.get_dummies(data.drop(columns=['Attrition_Flag']), drop_first=True)
data_encoded['Attrition_Flag'] = data['Attrition_Flag']

# Split features and target variable
X = data_encoded.drop(columns=['Attrition_Flag'])
y = data_encoded['Attrition_Flag']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Upsample only the training data using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Select top significant features using SelectKBest and chi-squared test
k_best = SelectKBest(score_func=chi2, k=5)  # Choose the number of top features you want to select
X_train_resampled_new = k_best.fit_transform(X_train_resampled, y_train_resampled)

# Get the selected feature indices
selected_feature_indices = k_best.get_support(indices=True)

# Get the names of the selected features
selected_features = X.columns[selected_feature_indices]

print("Top significant features:")
print(selected_features)

# Convert the upsampled data back to DataFrame
X_train_resampled_new = pd.DataFrame(X_train_resampled_new, columns=selected_features)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# Create a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled_new, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Perform cross-validation
cv_accuracy = cross_val_score(best_model, X_train_resampled_new, y_train_resampled, cv=5, scoring='accuracy')
cv_f1 = cross_val_score(best_model, X_train_resampled_new, y_train_resampled, cv=5, scoring='f1')

print("Cross-Validation Accuracy:", cv_accuracy.mean())
print("Cross-Validation F1 Score:", cv_f1.mean())

# Transform the test data using the selected features
X_test_new = X_test[selected_features]

# Make predictions on the test set
clf_prediction = best_model.predict(X_test_new)
print(clf_prediction)
print(y_test)

# Evaluate the model
accuracy = accuracy_score(y_test, clf_prediction)
print("Accuracy:", accuracy)
f1 = f1_score(y_test, clf_prediction)
print("F1 Score:", f1)


Top significant features:
Index(['Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy',
       'Total_Trans_Amt', 'Total_Trans_Ct'],
      dtype='object')
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}
Cross-Validation Accuracy: 0.9455968826339797
Cross-Validation F1 Score: 0.9442913426073059
[1 1 1 ... 1 1 1]
3781    1
2922    1
5070    1
7246    1
623     1
       ..
1937    1
4705    0
8548    1
1346    1
8802    1
Name: Attrition_Flag, Length: 2026, dtype: int64
Accuracy: 0.914116485686081
F1 Score: 0.9479665071770335


In [None]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

# Compute confusion matrix
rf_confusion_matrix = confusion_matrix(y_test, clf_prediction)
print("Confusion Matrix:")
print(rf_confusion_matrix)

# Create annotated heatmap
fig = ff.create_annotated_heatmap(rf_confusion_matrix, x=['Not Churn', 'Churn'], y=['Predicted Not Churn', 'Predicted Churn'], colorscale='Fall', xgap=3, ygap=3)
fig['data'][0]['showscale'] = True
fig.update_layout(title='Random Forest Model Confusion Matrix', xaxis_title='Actual', yaxis_title='Predicted')
fig.show()



Confusion Matrix:
[[ 267   60]
 [ 114 1585]]


In [None]:
# SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Performance metrics
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions)

print("SVM Accuracy:", svm_accuracy)
print("SVM F1 Score:", svm_f1)

In [None]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
svm_confusion_matrix = confusion_matrix(y_test, svm_predictions)
print("Confusion Matrix:")
print(svm_confusion_matrix)
fig = ff.create_annotated_heatmap(svm_confusion_matrix , x=['Not Churn','Churn'], y=['Predicted Not Churn','Predicted Churn'], colorscale='Fall',xgap=3,ygap=3)
fig['data'][0]['showscale'] = True
fig.update_layout(title='Prediction On Original Data With svm_predictions Confusion Matrix')
fig.show()

Confusion Matrix:
[[1494  230]
 [ 173 1503]]
