In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Load the dataset.
# The path is relative to the notebook's working directory (the same location the
# other pd.read_csv("chm.csv") calls in this notebook read from), so the notebook
# does not depend on one user's home-directory layout.
df = pd.read_csv("chm.csv")
df.head()

In [None]:
df.info() # Structure and nulls: column dtypes and non-null counts


In [None]:
# Show every row that is an exact repeat of an earlier row (empty result = no duplicates)
df.loc[df.duplicated()]

In [None]:
# Reload the data from disk before encoding
df = pd.read_csv("chm.csv")

# Encode Gender as integer class codes (fit the encoder, then apply it)
label_encoder = LabelEncoder()
label_encoder.fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])

# One-hot encode Geography; drop_first=True drops the first category as the baseline
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)


In [None]:
# Preview the first rows of the current frame
df.head()

In [None]:

# Feature columns used as model inputs.
# 'Exited' is the prediction target and must NOT be listed here: including it
# leaks the label into X, so every model trivially "predicts" it and all
# downstream metrics become meaningless.
Features = [
    'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Geography_Germany', 'Geography_Spain',
]
# Features (X) and target (y)
X = df[Features]
y = df['Exited']
# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardize the features: learn the scaling from the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Display the first five scaled rows of each split
(X_train[:5], X_test[:5])

In [None]:
# Random forest classifier: 100 trees, fixed seed; fit() returns the estimator itself
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
model

In [None]:
# Predict on the test split and compute the evaluation metrics
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Print the confusion matrix, classification report, and accuracy of the model.
# sep="\n" starts each item on its own line; with the default space separator the
# report's first line is appended to the last row of the matrix.
print(conf_matrix, class_report, accuracy, sep="\n")

In [None]:
# Feature importance of the fitted model, sorted by descending importance
# (bar 0 is the most important feature).
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
names = [Features[i] for i in indices]
plt.figure(figsize=(10, 6))
# plt.title must be CALLED. Assigning to it (plt.title = "...") replaces the
# function with a string: the figure gets no title and any later plt.title(...)
# call in the same kernel fails.
plt.title("Feature Importance")
# Size the axis from the importances themselves so ticks and bars always agree
plt.barh(range(len(importances)), importances[indices])
plt.yticks(range(len(importances)), names)
plt.xlabel("Importance")
plt.show()




In [None]:
# Build and train the Logistic Regression model on the current training split
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Make predictions on the test split
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
class_report_log_reg = classification_report(y_test, y_pred_log_reg)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)

# Print the confusion matrix, classification report, and accuracy of the
# logistic regression model; sep="\n" starts each item on its own line.
print(conf_matrix_log_reg, class_report_log_reg, accuracy_log_reg, sep="\n")





In [None]:
# Build and train the SVM model (linear kernel) on the current training split
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the test split
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Print the confusion matrix, classification report, and accuracy of the SVM
# model; sep="\n" starts each item on its own line.
print(conf_matrix_svm, class_report_svm, accuracy_svm, sep="\n")


In [None]:
# Build and train the KNN model (k = 5) on the current training split
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Make predictions on the test split
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
class_report_knn = classification_report(y_test, y_pred_knn)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Print the confusion matrix, classification report, and accuracy of the KNN
# model; sep="\n" starts each item on its own line.
print(conf_matrix_knn, class_report_knn, accuracy_knn, sep="\n")




In [None]:
# Build and train the Gradient Boosting model (100 estimators, fixed seed)
gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm_model.fit(X_train, y_train)

# Make predictions on the test split
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the model
conf_matrix_gbm = confusion_matrix(y_test, y_pred_gbm)
class_report_gbm = classification_report(y_test, y_pred_gbm)
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)

# Print the confusion matrix, classification report, and accuracy of the GBM
# model; sep="\n" starts each item on its own line.
print(conf_matrix_gbm, class_report_gbm, accuracy_gbm, sep="\n")

In [None]:
# Start the feature-engineering pass from a fresh read of the data file
df = pd.read_csv("chm.csv")

# Binary feature: 1 when Balance is exactly zero, else 0
df['BalanceZero'] = (df['Balance'] == 0).astype(int)

# Age groups.
# pd.cut bins are right-closed and, by default, exclude the lowest edge, so
# Age == 18 would fall outside '18-25' and become NaN. include_lowest=True
# closes the first bin on the left. Ages outside 18-95 still map to NaN.
df['AgeGroup'] = pd.cut(df['Age'],
                        bins=[18, 25, 35, 45, 55, 65, 75, 85, 95],
                        labels=['18-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76-85', '86-95'],
                        include_lowest=True)

# Balance to Salary Ratio
# NOTE(review): an EstimatedSalary of 0 would produce inf/NaN here - confirm the data has none.
df['BalanceToSalaryRatio'] = df['Balance'] / df['EstimatedSalary']

# Interaction feature between NumOfProducts and IsActiveMember
df['ProductUsage'] = df['NumOfProducts'] * df['IsActiveMember']

# Tenure grouping.
# Bin edges are chosen so each interval matches its label:
#   [0, 2] -> '0-2', (2, 5] -> '3-5', (5, 7] -> '6-7', (7, 10] -> '8-10', (10, inf) -> '10+'
# include_lowest=True keeps Tenure == 0 in '0-2' instead of turning it into NaN.
df['TenureGroup'] = pd.cut(df['Tenure'],
                           bins=[0, 2, 5, 7, 10, np.inf],
                           labels=['0-2', '3-5', '6-7', '8-10', '10+'],
                           include_lowest=True)


In [None]:
# Gender -> integer class codes (fit the encoder, then apply it)
label_encoder = LabelEncoder()
label_encoder.fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])

# Geography -> one-hot columns, with the first category dropped as the baseline
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)

# Gender x Geography interaction features.
# NOTE(review): the "Male_" names assume the encoder maps Male to 1 - confirm
# against label_encoder.classes_.
df['Male_Germany'] = df['Gender'].mul(df['Geography_Germany'])
df['Male_Spain'] = df['Gender'].mul(df['Geography_Spain'])


In [None]:
# One-hot encode the binned AgeGroup and TenureGroup columns; drop_first=True
# drops the first category of each, which then acts as the all-zeros baseline.
df = pd.get_dummies(df, columns=['AgeGroup', 'TenureGroup'], drop_first=True)


In [None]:
# Manually listed features plus all dummy variables starting with AgeGroup_ or TenureGroup_.
# 'Gender' is included as a main effect: the Male_Germany / Male_Spain interactions
# are built from it, and the baseline feature set earlier in the notebook uses it,
# so leaving it out would confound the comparison between the two models.
features = [
    'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Geography_Germany', 'Geography_Spain',
    'BalanceZero', 'BalanceToSalaryRatio', 'ProductUsage',
    'Male_Germany', 'Male_Spain'
] + [col for col in df.columns if col.startswith(('AgeGroup_', 'TenureGroup_'))]

# Features (X) and target (y); 'Exited' is the target and is deliberately not in `features`
X = df[features]
y = df['Exited']


In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Learn the scaling parameters from the training split only...
scaler = StandardScaler()
scaler.fit(X_train)
# ...then apply that same transform to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Fit a random forest (100 trees, fixed seed) on the scaled training data;
# fit() returns the estimator, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
# Predict labels for the test set
y_pred = model.predict(X_test)


In [None]:
# Overall accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
# Text report of the per-class metrics
class_report = classification_report(y_test, y_pred)
# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Print the confusion matrix, classification report, and the overall accuracy of the model.
# sep="\n" starts each item on its own line; with the default space separator the
# report's first line is appended to the last row of the matrix.
print(conf_matrix, class_report, accuracy, sep="\n")

