# Importing Libraries

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Making necessary imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.preprocessing import StandardScaler

# Reading and Describing Data

In [None]:
# Load the churn dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Summary statistics of the dataset (pandas `describe`).
df.describe()

In [None]:
# Print the concise DataFrame summary (pandas `info`).
df.info()

# Data Cleaning

In [None]:
# Count the missing values in each column.
df.isnull().sum()

There are no null values in the dataset.

In [None]:
# Count the rows flagged as duplicates of an earlier row.
df.duplicated().sum()

There are no duplicate rows in the dataset.

In [None]:
# List the column names before deciding which columns to drop.
df.columns

In [None]:
# Drop the identifier columns; they are not used as features below.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
drop_cols = ['CustomerId','Surname','RowNumber']
df = df.drop(columns=drop_cols, errors='ignore')
df.head()

In [None]:
# Show the distinct values of the Geography column ...
print(df['Geography'].unique())
# ... and how many rows carry each value.
df['Geography'].value_counts()

# Exploratory Data Analysis (EDA)

In [None]:
# Row counts per Geography value, split by the Exited flag.
sns.countplot(data=df, x='Geography', hue='Exited')
plt.show()

In [None]:
# Row counts per Gender value, split by the Exited flag.
sns.countplot(data=df, x='Gender', hue='Exited')
plt.show()

In [None]:
# Bar chart of how many rows fall into each value of the target column.
exit_counts = df['Exited'].value_counts()
ax = exit_counts.plot.bar()
ax.set_xlabel('Exited')
ax.set_ylabel('Count')
plt.show()

# Encoding Categorical Features

In [None]:
# One-hot encode the two categorical columns, then arrange the columns so that
# the features come first and the target ('Exited') is the last column.
df = pd.get_dummies(df, columns=['Gender', 'Geography'])
order = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Gender_Female', 'Gender_Male',
    'Geography_France', 'Geography_Germany', 'Geography_Spain',
    'Exited',
]
df = df.loc[:, order]
df.head()

In [None]:
# Separate the target column from the feature columns.
y = df['Exited']
X = df.drop('Exited', axis=1)

In [None]:
# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage) and make the reported test metrics optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: full data shapes, followed by the train/test shapes.
print(X.shape, y.shape)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Candidate models to compare: a logistic regression with default settings and
# two tree ensembles with 50 estimators each and a fixed random_state.
lg = LogisticRegression()
rf = RandomForestClassifier(n_estimators=50, random_state=2)
gb = GradientBoostingClassifier(n_estimators=50,random_state=2)

In [None]:
# Map a short label to each model; the labels are used in tables and plots.
clfs = dict(lg=lg, rf=rf, gb=gb)

In [None]:
def train_clfs_and_predict(clfs,X_train,X_test,y_train,y_test):
    """Fit every classifier on the training data and score it on the test data.

    Parameters
    ----------
    clfs : dict
        Mapping of label -> estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the labels to score against.

    Returns
    -------
    tuple of (list, list, list)
        Accuracy scores, precision scores and confusion matrices, each in the
        iteration order of ``clfs``.
    """
    accuracies, precisions, matrices = [], [], []

    for estimator in clfs.values():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        accuracies.append(accuracy_score(y_test, predictions))
        precisions.append(precision_score(y_test, predictions))
        matrices.append(confusion_matrix(y_test, predictions))

    return accuracies, precisions, matrices

In [None]:
# Train all classifiers and collect their test metrics (one entry per classifier).
accuracy, precision, conf_mat = train_clfs_and_predict(
    clfs=clfs,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Collect the metrics column-wise; every list has one entry per classifier,
# in the same order as the keys of `clfs`.
performance = dict(
    classifiers=list(clfs),
    accuracy=accuracy,
    precision=precision,
    confusion_matrix=conf_mat,
)

In [None]:
# Tabulate the metrics and rank the classifiers from most to least accurate.
perf_df = pd.DataFrame(performance)
perf_df = perf_df.sort_values('accuracy', ascending=False)
perf_df.head()

In [None]:
# Plotting confusion matrices of classifiers, one heatmap per classifier.
num_classifiers = len(conf_mat)

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# per-classifier indexing below also works when only one classifier is present
# (otherwise a single bare Axes object is returned and cannot be indexed).
fig, axes = plt.subplots(1, num_classifiers, figsize=(20, 5), squeeze=False)  # Adjust figsize as needed
axes = axes.ravel()

# The font scale does not depend on the loop variable, so set it once.
sns.set(font_scale=1)  # Adjust the font size as needed

# `conf_mat` is in the iteration order of `clfs`, so zipping pairs each
# matrix with the label of the classifier that produced it.
for ax, matrix, classifier in zip(axes, conf_mat, clfs):
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Predicted Negative", "Predicted Positive"],
                yticklabels=["Actual Negative", "Actual Positive"],
                ax=ax)
    ax.set_title(f"Confusion Matrix for {classifier}")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")

# Keep the tick labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Compare the accuracy and precision of the classifiers on one set of axes.
sns.set(style="whitegrid")
for metric, legend_label in (('accuracy', 'Accuracy'), ('precision', 'Precision')):
    sns.lineplot(data=perf_df, x='classifiers', y=metric, marker='o', label=legend_label)

plt.title("Accuracy and Precision by Classifiers")
plt.xlabel("Classifiers")
plt.ylabel("Value")
plt.legend()
plt.show()

**RESULT**

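The plot above compares the accuracy and precision of the three classifiers trained in this notebook (`lg`, `rf` and `gb`). **Note:** this summary originally reported that `svc` had the best performance with ```accuracy = 86.5%``` and ```precision = 78.68%```, but no SVC model is trained above; check `perf_df` for the best-performing classifier and its scores in the current run.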

---

