In [56]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report

In [57]:
# Load your dataset
data = pd.read_csv('BankChurners.csv')
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [58]:
# Handle missing values by removing rows with missing data for numerical columns
numerical_features = ['Customer_Age', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
                      'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal']
data = data.dropna(subset=numerical_features)

In [59]:
# Feature selection
selected_features = ['Customer_Age', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category',
                     'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
                     'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal']

In [60]:
# Check if the selected categorical features exist in the dataset
categorical_features = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category']
missing_columns = [col for col in categorical_features if col not in data.columns]

if missing_columns:
    print(f"Warning: The following columns are missing in the dataset: {missing_columns}")
else:
    # Label encoding for categorical variables
    label_encoder = LabelEncoder()
    for col in categorical_features:
        data[col] = label_encoder.fit_transform(data[col])


In [61]:
# Step 3: Data Splitting
X = data[selected_features]
y = data['Attrition_Flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [63]:
# Multinomial Naive Bayes for categorical features
categorical_model = MultinomialNB()
categorical_model.fit(X_train[categorical_features], y_train)

# Gaussian Naive Bayes for numerical features
numerical_model = GaussianNB()
numerical_model.fit(X_train.drop(columns=categorical_features), y_train)

In [74]:
# Step 6: Model Evaluation

# Predict on the test data
categorical_pred = categorical_model.predict(X_test[categorical_features])
numerical_pred = numerical_model.predict(X_test.drop(columns=categorical_features))


In [75]:
# Convert predictions to boolean arrays
categorical_pred = categorical_pred == 'Attrited Customer'
numerical_pred = numerical_pred == 'Attrited Customer'

# Combine predictions using logical AND
combined_pred = np.logical_and(categorical_pred, numerical_pred)

In [79]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Assuming you have combined_pred as the combined predictions
# Assuming y_test as the true labels for the test set

# Convert 'Attrited Customer' to 1 and 'Existing Customer' to 0 in both y_true and combined_pred
y_true = y_test.map({'Existing Customer': 0, 'Attrited Customer': 1})
combined_pred = combined_pred.astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_true, combined_pred)

# Calculate precision
precision = precision_score(y_true, combined_pred)

# Calculate recall
recall = recall_score(y_true, combined_pred)

# Calculate F1-score
f1 = f1_score(y_true, combined_pred)

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_true, combined_pred)

# Print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')
print('Confusion Matrix:')
print(conf_matrix)








Accuracy: 0.84
Precision: 0.00
Recall: 0.00
F1-score: 0.00
Confusion Matrix:
[[1699    0]
 [ 327    0]]


  _warn_prf(average, modifier, msg_start, len(result))
