### Churn Prediction



In [None]:
# Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE


In [None]:
# Loading the dataset
# Reads 'Churn_Modelling.csv' via a relative path, so the file must sit in the
# notebook's working directory. Later cells expect the columns RowNumber,
# CustomerId, Surname, Geography, Gender and Exited to be present.
data = pd.read_csv('Churn_Modelling.csv')

In [None]:
# Preprocessing the Dataset
# Remove every row that contains at least one missing value.
data = data.dropna()

# Integer-encode the categorical columns. One encoder object is refit per
# column, so after the loop it only remembers the classes of the last column.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; for a nominal
# column such as Geography this implies an ordering — confirm this is intended.
label_encoder = LabelEncoder()
for categorical_column in ('Geography', 'Gender'):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# Feature matrix and target.
# NOTE(review): Gender is encoded above but excluded from the model features,
# although the correlation output further down reports about -0.11 between
# Gender and Exited — confirm the exclusion is deliberate.
excluded_columns = ['RowNumber', 'CustomerId', 'Surname', 'Exited', 'Gender']
X = data.drop(columns=excluded_columns)
y = data['Exited']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training split only, then
# apply the same transformation to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the scaled training split with SMOTE. The resampled arrays are an
# alternative training set; the model cells below fit on X_train / y_train.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
# Correlations between the features
# Rebind X to the feature frame used for the correlation study: identifier
# columns and the target are removed, Gender is kept.
X = data.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])

# Pairwise correlation of every column in X with every other column.
correlation_matrix = X.corr()

print("Correlation Matrix:\n", correlation_matrix)

# Also list each ordered pair of distinct features on its own line
# (both "A and B" and "B and A" are printed).
feature_names = correlation_matrix.columns
for row_feature in feature_names:
    for col_feature in feature_names:
        if row_feature == col_feature:
            continue  # skip the diagonal
        pair_correlation = correlation_matrix.at[row_feature, col_feature]
        print(f"Correlation between {row_feature} and {col_feature}: {pair_correlation}")


Correlation Matrix:
                  CreditScore  Geography    Gender       Age    Tenure  \
CreditScore         1.000000   0.007888 -0.002857 -0.003965  0.000842   
Geography           0.007888   1.000000  0.004719  0.022812  0.003739   
Gender             -0.002857   0.004719  1.000000 -0.027544  0.014733   
Age                -0.003965   0.022812 -0.027544  1.000000 -0.009997   
Tenure              0.000842   0.003739  0.014733 -0.009997  1.000000   
Balance             0.006268   0.069408  0.012087  0.028308 -0.012254   
NumOfProducts       0.012238   0.003972 -0.021859 -0.030680  0.013444   
HasCrCard          -0.005458  -0.008523  0.005766 -0.011721  0.022583   
IsActiveMember      0.025651   0.006724  0.022544  0.085472 -0.028362   
EstimatedSalary    -0.001384  -0.001369 -0.008112 -0.007201  0.007784   

                  Balance  NumOfProducts  HasCrCard  IsActiveMember  \
CreditScore      0.006268       0.012238  -0.005458        0.025651   
Geography        0.069408       0

In [None]:
# Append the target to X (as bound by the preceding cells) so that the
# correlation matrix gains an 'Exited' row and column.
X = X.assign(Exited=y)

# Correlations over the features plus the target.
correlation_matrix = X.corr()
print("Correlation Matrix:\n", correlation_matrix)

# Keep only the 'Exited' column and remove its self-correlation entry,
# leaving one value per feature.
exited_column = correlation_matrix['Exited']
target_correlations = exited_column.drop(index='Exited')
print("\nCorrelations with target variable (Exited):\n", target_correlations)


Correlation Matrix:
                  CreditScore  Geography    Gender       Age    Tenure  \
CreditScore         1.000000   0.007888 -0.002857 -0.003965  0.000842   
Geography           0.007888   1.000000  0.004719  0.022812  0.003739   
Gender             -0.002857   0.004719  1.000000 -0.027544  0.014733   
Age                -0.003965   0.022812 -0.027544  1.000000 -0.009997   
Tenure              0.000842   0.003739  0.014733 -0.009997  1.000000   
Balance             0.006268   0.069408  0.012087  0.028308 -0.012254   
NumOfProducts       0.012238   0.003972 -0.021859 -0.030680  0.013444   
HasCrCard          -0.005458  -0.008523  0.005766 -0.011721  0.022583   
IsActiveMember      0.025651   0.006724  0.022544  0.085472 -0.028362   
EstimatedSalary    -0.001384  -0.001369 -0.008112 -0.007201  0.007784   
Exited             -0.027094   0.035943 -0.106512  0.285323 -0.014001   

                  Balance  NumOfProducts  HasCrCard  IsActiveMember  \
CreditScore      0.006268      

In [None]:
# Various Models

In [None]:
# Logistic Regression

# Train on the scaled training split.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
# Alternative: train on the SMOTE-balanced split instead.
#lr_model.fit(X_train_res, y_train_res)

# Hard class predictions drive the precision/recall report.
y_pred_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_pred_lr))

# ROC AUC needs a continuous score so the curve can sweep over thresholds.
# Feeding it 0/1 predictions reduces the curve to a single point and simply
# returns the mean of the two class recalls. Use the predicted probability of
# class 1, as the SVM/KNN/NB/MLP/XGB/AdaBoost cells do. Re-run the cell to
# refresh the recorded score.
y_score_lr = lr_model.predict_proba(X_test)[:, 1]
print(f'ROC AUC: {roc_auc_score(y_test, y_score_lr)}')


              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1607
           1       0.54      0.15      0.23       393

    accuracy                           0.81      2000
   macro avg       0.68      0.56      0.56      2000
weighted avg       0.77      0.81      0.76      2000

ROC AUC: 0.5591955360691377


In [None]:
# Random Forest
# Train on the scaled training split (seeded for repeatable trees).
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Alternative: train on the SMOTE-balanced split instead.
#rf_model.fit(X_train_res, y_train_res)

# Hard class predictions drive the precision/recall report.
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

# ROC AUC is a ranking metric: score it with the predicted probability of
# class 1 rather than with thresholded 0/1 labels, which would only yield the
# mean of the two class recalls. Re-run the cell to refresh the recorded score.
y_score_rf = rf_model.predict_proba(X_test)[:, 1]
print(f'ROC AUC: {roc_auc_score(y_test, y_score_rf)}')


              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.46      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000

ROC AUC: 0.7125726980085535


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random Forest hyper-parameter search.
# Candidate values: 3 * 4 * 3 = 36 combinations, each evaluated with 5-fold
# cross-validation on the training split and ranked by ROC AUC.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Base estimator handed to the search (rebinds rf_model to an unfitted forest).
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the refitted best model for the evaluation cell and show its settings.
best_rf_model = grid_search.best_estimator_
print(grid_search.best_params_)





{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}


In [None]:
# Predicting and evaluating the tuned Random Forest model
# Hard class predictions drive the precision/recall report.
y_pred_rf = best_rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

# The grid search selected this model by ROC AUC, so evaluate it the same way:
# on the predicted probability of class 1, not on thresholded 0/1 labels
# (which would only give the mean of the two class recalls).
# Re-run the cell to refresh the recorded score.
y_score_rf = best_rf_model.predict_proba(X_test)[:, 1]
print(f'ROC AUC: {roc_auc_score(y_test, y_score_rf)}')

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.77      0.45      0.57       393

    accuracy                           0.87      2000
   macro avg       0.83      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000

ROC AUC: 0.7102838883953948


In [None]:
# Gradient Boosting
# Train on the scaled training split. The seed matches the other seeded models
# in this notebook so repeated runs produce the same trees.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Hard class predictions drive the precision/recall report.
y_pred_gb = gb_model.predict(X_test)
print(classification_report(y_test, y_pred_gb))

# ROC AUC is computed from the predicted probability of class 1; passing the
# thresholded 0/1 labels would only return the mean of the two class recalls.
# Re-run the cell to refresh the recorded score.
y_score_gb = gb_model.predict_proba(X_test)[:, 1]
print(f'ROC AUC: {roc_auc_score(y_test, y_score_gb)}')


              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.45      0.57       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000

ROC AUC: 0.7087281945559424


In [None]:
# SVM
# Train on the scaled training split. probability=True makes predict_proba
# available; scikit-learn fits those probabilities with an internal shuffled
# cross-validation, so a seed is set to keep the reported ROC AUC repeatable.
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Hard class predictions drive the precision/recall report.
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

# ROC AUC on the predicted probability of class 1.
y_score_svm = svm_model.predict_proba(X_test)[:, 1]
print(f'ROC AUC: {roc_auc_score(y_test, y_score_svm)}')


              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1607
           1       0.80      0.38      0.51       393

    accuracy                           0.86      2000
   macro avg       0.83      0.68      0.72      2000
weighted avg       0.85      0.86      0.84      2000

ROC AUC: 0.8226572359160227


In [None]:
# KNN: sweep n_neighbors over 1..29 and report each setting on the test split.
# (The earlier heading mentioned 13 neighbours, presumably the value of
# interest — TODO confirm.)

for k in range(1, 30):
    # Header line identifying the current neighbour count.
    print(f"{k} Neighbours")

    # Fit a fresh classifier for this k on the scaled training split.
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)

    # Hard predictions for the report, class-1 probabilities for ROC AUC.
    y_pred_knn = knn_model.predict(X_test)
    knn_scores = knn_model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, y_pred_knn))
    print(f'ROC AUC: {roc_auc_score(y_test, knn_scores)}')


1 Neighbours
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      1607
           1       0.49      0.52      0.50       393

    accuracy                           0.80      2000
   macro avg       0.68      0.69      0.69      2000
weighted avg       0.80      0.80      0.80      2000

ROC AUC: 0.6916860237732186
2 Neighbours
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      1607
           1       0.70      0.35      0.47       393

    accuracy                           0.84      2000
   macro avg       0.78      0.66      0.69      2000
weighted avg       0.83      0.84      0.82      2000

ROC AUC: 0.7323676156003235
3 Neighbours
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      1607
           1       0.60      0.47      0.53       393

    accuracy                           0.83      2000
   macro avg       0.74      0.70

In [None]:
# Naive Bayes
# Fit a Gaussian Naive Bayes classifier on the scaled training split.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Hard predictions feed the report; class-1 probabilities feed ROC AUC.
y_pred_nb = nb_model.predict(X_test)
nb_scores = nb_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_nb))
print(f'ROC AUC: {roc_auc_score(y_test, nb_scores)}')


              precision    recall  f1-score   support

           0       0.84      0.98      0.90      1607
           1       0.71      0.23      0.35       393

    accuracy                           0.83      2000
   macro avg       0.78      0.60      0.62      2000
weighted avg       0.81      0.83      0.79      2000

ROC AUC: 0.8066252765018187


In [None]:
# ANN (multi-layer perceptron)
# One hidden layer of 100 units, at most 300 training iterations, fixed seed.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
)
mlp_model.fit(X_train, y_train)

# Hard predictions feed the report; class-1 probabilities feed ROC AUC.
y_pred_mlp = mlp_model.predict(X_test)
mlp_scores = mlp_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_mlp))
print(f'ROC AUC: {roc_auc_score(y_test, mlp_scores)}')


              precision    recall  f1-score   support

           0       0.88      0.95      0.92      1607
           1       0.71      0.49      0.58       393

    accuracy                           0.86      2000
   macro avg       0.80      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000

ROC AUC: 0.8499234424456614


In [None]:
# XGBoost

# Fit a seeded XGBoost classifier on the scaled training split.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Hard predictions feed the report; class-1 probabilities feed ROC AUC.
y_pred_xgb = xgb_model.predict(X_test)
xgb_scores = xgb_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_xgb))
print(f'ROC AUC: {roc_auc_score(y_test, xgb_scores)}')


              precision    recall  f1-score   support

           0       0.88      0.95      0.91      1607
           1       0.69      0.49      0.57       393

    accuracy                           0.86      2000
   macro avg       0.79      0.72      0.74      2000
weighted avg       0.84      0.86      0.85      2000

ROC AUC: 0.8399590848561714


In [None]:
# AdaBoost

# Fit a seeded AdaBoost classifier on the scaled training split.
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train, y_train)

# Hard predictions feed the report; class-1 probabilities feed ROC AUC.
y_pred_ada = ada_model.predict(X_test)
ada_scores = ada_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_ada))
print(f'ROC AUC: {roc_auc_score(y_test, ada_scores)}')


              precision    recall  f1-score   support

           0       0.88      0.95      0.92      1607
           1       0.71      0.48      0.57       393

    accuracy                           0.86      2000
   macro avg       0.79      0.72      0.74      2000
weighted avg       0.85      0.86      0.85      2000

ROC AUC: 0.8425907013051993
