<a href="https://www.kaggle.com/code/vanpatangan/bank-churn-lightgbm?scriptVersionId=177817374" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# <center> LIBRARIES </center>

In [1]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns

import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load datasets
# The competition input directory is kept in a single constant so the notebook
# can be pointed at another location by editing one line.
DATA_DIR = "/kaggle/input/playground-series-s4e1"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# <center> STATISTICAL SUMMARY </center>

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the first rows of the test data
test_df.head()

In [None]:
# Preview the first rows of the sample submission file
sample_submission_df.head()

In [None]:

def check(df):
    """
    Generates a concise summary of DataFrame columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields:

        - ``column``: column name
        - ``dtype``: column dtype
        - ``instances``: number of non-null values
        - ``unique``: number of distinct non-null values
        - ``sum_null``: number of null values
        - ``duplicates``: number of fully duplicated rows in ``df``. This is a
          frame-level figure, so the same value is repeated on every row.
    """
    # The duplicated-row count does not depend on the column, so compute it
    # once instead of rescanning the whole frame for every column.
    duplicate_rows = df.duplicated().sum()

    # One summary record per column
    summary = [
        [col, df[col].dtype, df[col].count(), df[col].nunique(), df[col].isnull().sum(), duplicate_rows]
        for col in df.columns
    ]

    # Create a DataFrame from the list of records
    df_check = pd.DataFrame(summary, columns=["column", "dtype", "instances", "unique", "sum_null", "duplicates"])

    return df_check


In [None]:
# Per-column dtype, non-null count, unique count, null count and duplicated-row count for the training data
check(train_df)

In [None]:
# Same per-column summary for the test data
check(test_df)

# <center> EDA </center>

***Visualizing distribution***

In [None]:
# One histogram per numerical column of the training data
hist_options = dict(
    bins=50,
    color='#7adae6',
    edgecolor='black',
    figsize=(20, 15),
    legend=True,
)
train_df.hist(**hist_options)
plt.show()


***Investigating churn***

In [None]:
# Set the style for seaborn
sns.set(style="whitegrid")

# Columns to plot against the churn flag
columns_to_plot = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
                   'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Two panels per row. Ceiling division ensures an odd number of columns still
# gets enough axes (floor division would leave the last column without a panel
# and raise an IndexError in the loop below).
n_panel_cols = 2
n_panel_rows = -(-len(columns_to_plot) // n_panel_cols)

fig, axes = plt.subplots(nrows=n_panel_rows, ncols=n_panel_cols, figsize=(12, 18), squeeze=False)
fig.subplots_adjust(hspace=0.8)

# Flatten the axes grid for easy iteration
axes = axes.flatten()

# One stacked histogram per column, split by the 'Exited' flag
for ax, column in zip(axes, columns_to_plot):
    sns.histplot(data=train_df, x=column, hue='Exited', multiple='stack', ax=ax, kde=False)
    ax.set_title(f'Churning by {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')

# Hide any panel left over when the number of columns is odd
for ax in axes[len(columns_to_plot):]:
    ax.set_visible(False)

# Show the plot
plt.show()


***Looking for any correlation***

In [None]:
# Numeric columns (plus the target) to include in the correlation analysis
train_feature = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited',
]

# Pairwise correlations between the selected columns
correlation_matrix = train_df[train_feature].corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='viridis',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Selected Features')
plt.show()

# <center> DATA PREPROCESSING </center>

In [None]:
# Label encoding categorical values
CATEGORICAL_COLUMNS = ['Gender', 'Geography', 'Surname']


def encoder(df, encoders=None):
    """
    Integer-encode the categorical columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``CATEGORICAL_COLUMNS``.
        The columns are overwritten with their integer codes.
    encoders : dict, optional
        Already-fitted ``LabelEncoder`` objects keyed by column name. Pass the
        same dict for every frame that must share one encoding (for example
        train and test). When omitted, a fresh encoder is fitted on ``df``
        itself for each column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the categorical columns encoded.
    """
    for col in CATEGORICAL_COLUMNS:
        if encoders is None:
            fitted = LabelEncoder().fit(df[col])
        else:
            fitted = encoders[col]
        df[col] = fitted.transform(df[col])
    return df


# Fit one encoder per column on train and test together. Fitting separately on
# each frame would assign different integer codes to the same category whenever
# the two frames do not contain exactly the same set of values (very likely for
# 'Surname'), so the model would see inconsistent features at prediction time.
label_encoders = {
    col: LabelEncoder().fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    for col in CATEGORICAL_COLUMNS
}

train_df = encoder(train_df, label_encoders)
test_df = encoder(test_df, label_encoders)

In [None]:
# Separating features and target variable in the training data
X = train_df.drop(['Exited'], axis = 1)
y = train_df['Exited']

# Split the dataset into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape


In [None]:
# Standardize the data
# The scaler learns its statistics from the training split only; the
# validation split is transformed with those same statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

# Apply the training statistics to the test set as well. Refitting the scaler
# on the test data would standardise it with a different mean/variance than the
# one the model was trained on.
# NOTE(review): assumes test_df has the same feature columns, in the same
# order, as X_train — confirm against the input files.
test_df = scaler.transform(test_df)

# <center> LIGHT🤖GBM </center>

In [None]:
# Wrap the splits in LightGBM datasets. The validation set names the training
# set as its reference, as the LightGBM documentation recommends for
# validation data.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

In [None]:
# LightGBM training parameters.
# Each setting appears exactly once under its canonical name. The previous
# dict also set 'subsample' (documented alias of 'bagging_fraction') and
# 'colsample_bytree' (documented alias of 'feature_fraction') to different
# values, and listed 'min_child_samples' twice, which made the effective
# configuration ambiguous. The canonical entries and their values are kept.
lgbParams = {
    # Task definition
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'is_unbalance': True,   # boolean flag, passed as a bool rather than the string 'true'

    # Tree shape and learning rate
    'learning_rate': 0.01,
    'max_depth': 10,
    'num_leaves': 63,
    'min_child_samples': 15,

    # Row / column subsampling
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 2,

    'verbose': -1,
}

In [None]:
# Train with early stopping on the validation AUC. 5000 is only an upper bound
# on the number of boosting rounds: training stops once the validation metric
# has not improved for 200 consecutive rounds, instead of always running every
# round regardless of how the validation score evolves.
model_lgbm = lgb.train(
    lgbParams,
    train_data,
    num_boost_round=5000,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=200)],
)


In [None]:
# Score both splits with the trained booster
y_train_pred = model_lgbm.predict(X_train)
y_valid_pred = model_lgbm.predict(X_valid)

# Report ROC AUC on the training and validation splits
auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_valid, y_valid_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))


In [None]:
# ROC Curve and AUC
def plot_roc_curve(y_true, y_pred):
    """
    Draw the ROC curve of ``y_pred`` against ``y_true`` and show it.

    The area under the curve is computed from the curve's points and shown in
    the legend; a dashed diagonal is drawn from (0, 0) to (1, 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed to ``roc_curve``.
    y_pred : array-like
        Predicted scores, passed to ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred)
    area = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, color='#25a9b8', lw=2,
            label='ROC curve (area = %0.2f)' % area)
    # Dashed diagonal for visual reference
    ax.plot([0, 1], [0, 1], color='#000637', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend(loc="lower right")
    plt.show()

# ROC curve of the validation-split predictions
plot_roc_curve(y_valid, y_valid_pred)

# <center> SUBMISSION </center>

In [None]:
# Model output for every row of the (preprocessed) test set
test_pred = model_lgbm.predict(test_df)

# Store the predictions in the submission frame and preview the result
sample_submission_df['Exited'] = test_pred
sample_submission_df.head()

In [None]:
# Write the submission file to the working directory, without the index column
sample_submission_df.to_csv('submission.csv', index=False)