# Binary Classification with a Bank Churn Dataset 

## Goal: For this Episode of the Series, your task is to predict whether a customer continues with their account or closes it (i.e., churns). Good luck!

### Evaluation: Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.
- Utilize ROC/AUC
- Target = Exited

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report

In [None]:
# Load the competition training and test sets from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/playground-series-s4e1/train.csv")
test_df = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

In [None]:
df

In [None]:
test_df

In [None]:
# Split the label column off the feature frame; `df` keeps only the features.
target = df.pop('Exited')

## Exploratory Data Analysis

In [None]:
column_names = list(df.columns)
column_names

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
# 165034 Rows, 14 Columns, 12 Numerical Columns, 2 Non-Numerical Columns
df.info()

In [None]:
numeric_columns = df.select_dtypes(include='number').columns.tolist()
numeric_columns

In [None]:
non_numeric_columns = df.select_dtypes(exclude='number').columns.tolist()
non_numeric_columns

In [None]:
# There are no missing Data
df.isnull().sum()

In [None]:
# Compare the value ranges of CreditScore and Balance. Columns on very
# different scales are the reason the features are standardized later
# (StandardScaler in pre_process).
print(f"Min Credit Score: {df['CreditScore'].min()}, Max Credit Score: {df['CreditScore'].max()}")
print(f"Min Balance: {df['Balance'].min()}, Max Balance: {df['Balance'].max()}")

In [None]:
# Observation from the plot: Balance is heavily concentrated at its lower
# bound; values above the 75th percentile lie roughly between 130k and 250k.
# Median: 0

plt.boxplot(df['Balance'])
plt.show()

In [None]:
# balance_bins = [0, 50000, 100000, 150000, 200000, 300000]
# balance_labels = ['50000', '100000', '150000','200000', '250000+']
# df['Balance_category'] = pd.cut(df['Balance'], bins=balance_bins, labels=balance_labels, include_lowest=True)

In [None]:
# df['Balance_category'].unique()

In [None]:
# Histogram of the raw Balance values; the mass sits at the left (low) end.
# NOTE(review): the binning cells above are commented out, so this plot is of
# the unbinned column — confirm that is what the observation refers to.
sns.histplot(df['Balance'])

In [None]:
# Count zero vs. positive balances by summing boolean masks.
print(f"Balance of 0: {(df['Balance'] == 0).sum()}")
print(f"Balance greater than 0: {(df['Balance'] > 0).sum()}")

### Creating two categories for Balance: 0 - accounts without a balance, 1 - accounts with a balance
Reasoning: Accounts with a balance of 0 skew the distribution: 89648 accounts have a zero balance, while 75386 have a positive balance.

In [None]:
# Binarize Balance (positive balance -> 1, everything else left as-is) on
# BOTH frames. Previously only the training frame was converted, so the models
# were trained on a 0/1 flag but scored raw balances for the submission.
for frame in (df, test_df):
    frame['Balance'] = frame['Balance'].mask(frame['Balance'] > 0, 1)

In [None]:
df['Balance'].unique()

In [None]:
df['Balance']

In [None]:
# # rows of sub plots 
# num_rows = len(numeric_columns) // 2 + len(numeric_columns) % 2

# plt.figure(figsize=(15, 5 * num_rows))
# for i, column in enumerate(numeric_columns, 1):
#     plt.subplot(num_rows, 2, i)
#     sns.histplot(df[column], kde=True, bins=30)
#     plt.title(f'Distribution of {column}')

# plt.tight_layout()
# plt.show()

In [None]:
# Skewness and kurtosis per numeric column, one row per column.
df[numeric_columns].agg(['skew', 'kurtosis']).T

In [None]:
# Preview the log-transformed Age WITHOUT writing it back to `df`.
# pre_process() applies np.log to Age itself, so assigning the result here
# left the training frame log-transformed twice while the test frame was
# transformed only once.
age_log = np.log(df['Age'])
sns.histplot(age_log)
age_log

In [None]:
sns.histplot(df['Age'])

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
def pre_process(dataframe):
    le = LabelEncoder()
    scaler = StandardScaler()
    dataframe.drop(['id', 'Surname', 'CustomerId'], axis=1, inplace=True)
    dataframe['Age'] = np.log(dataframe['Age'])
    dataframe['Gender'] = le.fit_transform(dataframe['Gender'])
    dataframe['Geography'] = le.fit_transform(dataframe['Geography'])
    new_df = scaler.fit_transform(dataframe)
    return new_df

In [None]:
df = pre_process(df)

In [None]:
test_df = pre_process(test_df)

In [None]:
df

In [None]:
test_df

# Frequently used Functions

In [None]:
# Learning Curve and Plot
from sklearn.model_selection import learning_curve
def model_learning_curve(model, X_train, y_train, title, cv=3, scoring='accuracy'):
    """Plot training and validation score against training-set size.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator passed to ``learning_curve``.
    X_train, y_train : array-like
        Features and labels used for the cross-validated curve.
    title : str
        Prefix for the plot title.
    cv : int, default 3
        Number of cross-validation folds.
    scoring : str or callable, default 'accuracy'
        Metric passed to ``learning_curve`` (e.g. 'roc_auc', the metric this
        competition is evaluated on).

    Draws on the current matplotlib axes; returns nothing.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
    )

    # Keep the original 'Accuracy' wording for the default metric.
    if scoring == 'accuracy':
        metric = 'Accuracy'
    elif isinstance(scoring, str):
        metric = scoring
    else:
        metric = 'Score'

    # Mean line plus a +/- one standard deviation band across the folds.
    for prefix, scores in (('Training', train_scores), ('Validation', test_scores)):
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(train_sizes, mean, label=f'{prefix} {metric}')
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.2)

    plt.grid(True)
    plt.title(f'{title} Learning Curve')
    plt.xlabel('Training Size')
    plt.ylabel(metric)
    plt.legend()

In [None]:
# Validation Curve and Plot
from sklearn.model_selection import validation_curve
def model_validation_curve(model, X_train, y_train, param, param_range, label, cv=3, scoring='accuracy'):
    """Plot training and validation score against one hyperparameter.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator passed to ``validation_curve``.
    X_train, y_train : array-like
        Features and labels used for the cross-validated curve.
    param : str
        Name of the hyperparameter to vary.
    param_range : array-like
        Values of ``param`` to evaluate; also used as the x axis.
    label : str
        x-axis label.
    cv : int, default 3
        Number of cross-validation folds.
    scoring : str or callable, default 'accuracy'
        Metric passed to ``validation_curve`` (e.g. 'roc_auc', the metric
        this competition is evaluated on).

    Draws on the current matplotlib axes; returns nothing.
    """
    train_scores, test_scores = validation_curve(
        model,
        X_train,
        y_train,
        param_name=param,
        param_range=param_range,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )

    # Keep the original 'Accuracy' wording for the default metric.
    if scoring == 'accuracy':
        metric = 'Accuracy'
    elif isinstance(scoring, str):
        metric = scoring
    else:
        metric = 'Score'

    # Mean line plus a +/- one standard deviation band across the folds.
    for prefix, scores in (('Training', train_scores), ('Validation', test_scores)):
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(param_range, mean, label=f'{prefix} {metric}')
        plt.fill_between(param_range, mean - std, mean + std, alpha=0.2)

    plt.grid(True)
    plt.title('Validation Curve')
    plt.xlabel(label)
    plt.ylabel(metric)
    plt.legend()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation, stratified on the target so both
# splits keep the same class ratio; the seed makes the split reproducible.
split_options = {
    'test_size': 0.2,
    'shuffle': True,
    'stratify': target,
    'random_state': 42,
}
X_train, X_test, y_train, y_test = train_test_split(df, target, **split_options)

In [None]:
# Report the shape of each split.
for caption, split in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, split.shape)


In [None]:
# # Get a subset of the training data for manual tuning
# subset_size = 50000
# subset_indices = np.random.choice(X_train.shape[0], size=subset_size, replace=False)
# X_train_subset = X_train[subset_indices]
# y_train_subset = y_train[subset_indices]

In [None]:
# print(f'X_train_subset: {X_train_subset.shape}')
# print(f'y_train_subset: {y_train_subset.shape}')

# Machine Learning Models
Models used:
- Random Forest
- Adaboost
- XGBoost
- CATBoost

## Random Forest Manual Hyperparameter Tuning

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier 

In [None]:
# rf = RandomForestClassifier(n_jobs=-1,random_state=42)

In [None]:
# model_learning_curve(rf, X_train_subset, y_train_subset, "RandomForestClassifier Baseline")

In [None]:
# rf = RandomForestClassifier(n_jobs=-1,random_state=42)

In [None]:
# model_validation_curve(rf, X_train_subset, y_train_subset, param="max_depth", param_range=np.arange(1,20), label="MAX_DEPTH")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_jobs=-1,random_state=42)
# model_learning_curve(rf, X_train_subset, y_train_subset, "RandomForestClassifier Baseline")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_jobs=-1,random_state=42)
# model_validation_curve(rf, X_train_subset, y_train_subset, param="n_estimators", param_range=np.arange(1, 40), label="n_estimators")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14, n_jobs=-1,random_state=42)
# model_learning_curve(rf, X_train_subset, y_train_subset, "RandomForestClassifier Baseline")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14, n_jobs=-1,random_state=42)
# model_validation_curve(rf, X_train_subset, y_train_subset, param="max_leaf_nodes", param_range=np.arange(50, 100), label="max_leaf_nodes")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14,max_leaf_nodes=1000, n_jobs=-1,random_state=42)
# model_learning_curve(rf, X_train_subset, y_train_subset, "RandomForestClassifier Baseline")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14, n_jobs=-1,random_state=42)
# model_validation_curve(rf, X_train_subset, y_train_subset, param="min_samples_split", param_range=np.arange(1, 50), label="min_samples_split")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14,max_leaf_nodes=1000, min_samples_split=7, n_jobs=-1,random_state=42)
# model_learning_curve(rf, X_train_subset, y_train_subset, "RandomForestClassifier Baseline")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14,max_leaf_nodes=1000, min_samples_split=7, n_jobs=-1,random_state=42)
# model_validation_curve(rf, X_train_subset, y_train_subset, param="min_samples_leaf", param_range=np.arange(1, 10), label="min_samples_leaf")

In [None]:
# rf = RandomForestClassifier(max_depth=8, n_estimators=14,max_leaf_nodes=1000, min_samples_split=7,min_samples_leaf=3, n_jobs=-1,random_state=42)
# model_learning_curve(rf, X_train_subset, y_train_subset, "RandomForestClassifier Baseline")

In [None]:
# # Cross Validation on entire Dataset (X_train, y_train)
# rf = RandomForestClassifier(max_depth=8, n_estimators=14,max_leaf_nodes=1000, min_samples_split=7,min_samples_leaf=3, n_jobs=-1,random_state=42)
# print(cross_val_score(rf, X_train, y_train, cv=3))

# Validation

In [None]:
# Random forest with the manually tuned hyperparameters, fitted on the
# training split.
manual_rf_params = {
    'max_depth': 8,
    'n_estimators': 14,
    'max_leaf_nodes': 1000,
    'min_samples_split': 7,
    'min_samples_leaf': 3,
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestClassifier(**manual_rf_params)
rf.fit(X_train, y_train)

In [None]:
# roc_auc_score expects (y_true, y_score). The arguments were swapped and
# hard 0/1 predictions were scored; use the positive-class probability so the
# AUC measures ranking quality, as the competition does.
print("ROC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

# GridSearchCV

In [None]:
# from sklearn.model_selection import GridSearchCV
# import time

# # Initialize RandomForest
# rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# # Parameter grid
# param_grid = {
#     'n_estimators': [300, 400],
#     'max_depth': [10, 12],
#     'max_leaf_nodes': [200, 300],
#     'min_samples_split': [15, 17],
#     'min_samples_leaf': [3, 5]
# }
# # Start the timer
# start_time = time.time()

# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
# grid_search.fit(X_train, y_train)

# # Stop the timer
# end_time = time.time()

# # Elapsed time
# elapsed_time = end_time - start_time
# print(f"Elapsed time: {elapsed_time} seconds")

# # Get the best parameters
# best_params = grid_search.best_params_
# print(best_params)

In [None]:
best_params = {'max_depth': 12, 'max_leaf_nodes': 300, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 400}

In [None]:
# Random forest with the grid-searched hyperparameters. n_jobs=-1 matches the
# other random forests in this notebook and parallelizes the 400 trees; the
# fixed random_state keeps the results reproducible.
rf = RandomForestClassifier(**best_params, n_jobs=-1, random_state=42)

In [None]:
# Cross-validate on ROC AUC, the competition metric. cross_val_score defaults
# to the estimator's score(), i.e. accuracy, for classifiers.
print(cross_val_score(rf, X_train, y_train, cv=3, scoring='roc_auc'))

# Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# from sklearn.model_selection import GridSearchCV
# import time
# # Grid Search for Ada
# ada = AdaBoostClassifier(random_state=42)
# # Define the parameter grid
# param_grid = {
#     'n_estimators': [150, 200, 250],
#     'learning_rate': [0.8, 0.9, 0.95]
# }
# # Start the timer
# start_time = time.time()

# grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=3)
# grid_search.fit(X_train, y_train)

# # Stop the timer
# end_time = time.time()

# # Calculate the elapsed time
# elapsed_time = end_time - start_time

# # Print the elapsed time
# print(f"Elapsed time: {elapsed_time} seconds")

# # Get the best parameters
# best_params = grid_search.best_params_
# print(best_params)

In [None]:
# AdaBoost with the grid-searched settings, cross-validated on ROC AUC (the
# competition metric) instead of the default accuracy.
ada = AdaBoostClassifier(learning_rate=0.95, n_estimators=200, random_state=42)
print(cross_val_score(ada, X_train, y_train, cv=3, scoring='roc_auc'))

In [None]:
# Fit AdaBoost on the training split and compute the hold-out ROC AUC.
ada.fit(X_train, y_train)
# Despite the name, y_pred holds positive-class probabilities (column 1).
y_pred = ada.predict_proba(X_test)[:,1]
fpr, tpr,thresholds = roc_curve(y_test, y_pred)
auc(fpr, tpr)

In [None]:
# Plot the ROC curve from the fpr/tpr computed in the previous cell; the
# legend shows the AUC rounded to two decimals.
roc_auc = roc_auc_score(y_test, y_pred)


plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Data')
plt.legend()

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
rf_best_params = {'max_depth': 12, 'max_leaf_nodes': 300, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 400}

In [None]:
# Soft-voting ensemble of the tuned random forest and AdaBoost: class
# probabilities of the members are averaged.
ensemble_members = [
    ('rf', RandomForestClassifier(**rf_best_params, random_state=42)),
    ('ada', AdaBoostClassifier(learning_rate=0.95, n_estimators=200, random_state=42)),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting="soft")
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

In [None]:
# Hold-out ROC AUC of the ensemble, from its positive-class probabilities.
y_pred = voting_clf.predict_proba(X_test)[:,1]
fpr, tpr,thresholds = roc_curve(y_test, y_pred)
auc(fpr, tpr)

In [None]:
roc_auc = roc_auc_score(y_test, y_pred)


plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Data')
plt.legend()

In [None]:
# Score each fitted member of the ensemble on the hold-out split.
# score() on a classifier reports mean accuracy, not ROC AUC.
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test,y_test))

# Prediction

In [None]:
# NOTE(review): voting_clf was already fitted on this same split in the
# "Voting Classifier" section; this refit looks redundant — confirm before
# removing it.
voting_clf.fit(X_train, y_train)

In [None]:
y_pred = voting_clf.predict_proba(X_test)[:,1]

In [None]:
fpr, tpr,thresholds = roc_curve(y_test, y_pred)
auc(fpr, tpr)

In [None]:
roc_auc = roc_auc_score(y_test, y_pred)


plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Data')
plt.legend()

## Classification Report

In [None]:
# Per-class precision / recall / F1 on the hold-out split, computed from the
# ensemble's hard class predictions (predict), not its probabilities.
y_pred = voting_clf.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# Frame used to inspect the class balance of the label. `target` already
# holds the 'Exited' column from the training CSV, so wrap it in a frame
# instead of reading the whole file again.
df_null = target.to_frame()

In [None]:
df_null['Exited'].value_counts()

# Submission

In [None]:
# Positive-class probabilities for the competition test set. At this point
# test_df is the array returned by pre_process(test_df), not the raw frame.
y_pred_sub = voting_clf.predict_proba(test_df)[:,1]

In [None]:
submission = pd.read_csv('/kaggle/input/playground-series-s4e1/sample_submission.csv')

In [None]:
submission['Exited'] = y_pred_sub

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
submission