# In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Loading dataset
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
data = pd.read_csv(r"C:\Users\user\Downloads\Churn_Modelling (1).csv")


# Removing unnecessary columns
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Separating the target before any encoding so the labels are never routed
# through the feature transformer (which would cast them to float).
y = data['Exited'].to_numpy()
features = data.drop(columns=['Exited'])

# Encoding categorical variables
# Gender is replaced in place by integer codes.
le = LabelEncoder()
features['Gender'] = le.fit_transform(features['Gender'])

# Geography is one-hot encoded, selected by column name instead of position.
# drop='first' removes the first dummy column inside the encoder (avoiding the
# dummy variable trap) rather than relying on the transformer's output column
# order. sparse_threshold=0 makes the transformer always return a dense array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), ['Geography'])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Feature matrix: remaining Geography dummies first, then the passthrough
# columns in their original order.
X = ct.fit_transform(features)

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Feature scaling: the scaler's statistics are learned from the training rows
# only and then applied unchanged to both partitions.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Defining base models
rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0)

# Defining ensemble model
estimators = [('rf', rfc), ('gb', gbc)]

# Soft voting averages the two models' predicted class probabilities.
# Hard (majority) voting with exactly two estimators ties whenever they
# disagree, and scikit-learn resolves ties in favour of the lowest class
# label, so the ensemble would only predict the positive class when both
# models agree.
ensemble = VotingClassifier(estimators, voting='soft')

# Training and evaluating ensemble model
ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


# First, we need to import the necessary libraries and load the dataset.
# Now, we will perform some data preprocessing steps.
# Firstly, we will remove the unnecessary columns such as RowNumber, CustomerId, and Surname.
# Then, we will convert the string-valued columns Gender and Geography into numeric form using LabelEncoder and OneHotEncoder respectively.
# Next, we will split the dataset into training and testing sets, and perform feature scaling using StandardScaler.
# Now, we will define the base models that we will use in our ensemble. We will use RandomForestClassifier and GradientBoostingClassifier as our base models.
# We will now define our ensemble model. We will use the VotingClassifier from sklearn.ensemble to combine our base models.
# Finally, we will train our ensemble model on the training set and evaluate its performance on the testing set.

# In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

# Imported here so this section brings its own dependencies into scope.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Loading dataset
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
data = pd.read_csv(r"C:\Users\user\Downloads\Churn_Modelling (1).csv")

# Preprocessing the data
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
# dtype=int keeps the dummy columns numeric so they stack cleanly with the
# other passthrough columns inside the ColumnTransformer below.
data = pd.get_dummies(data, columns=['Geography'], dtype=int)

# Columns to standardise. The scaler is NOT fit here on the full dataset:
# doing so before the split leaks test-set statistics into training and into
# every cross-validation fold. It is placed inside the pipeline instead, so it
# is re-fit on whatever rows the pipeline is trained on.
numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
sc = StandardScaler()
scaler = ColumnTransformer(
    transformers=[('scale', sc, numeric_cols)],
    remainder='passthrough',
)

# Splitting the data into train and test sets
X = data.drop(['Exited'], axis=1)
y = data['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a heterogeneous ensemble model
clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
clf2 = GradientBoostingClassifier(random_state=1)
clf3 = LogisticRegression(random_state=1)
clf4 = GaussianNB()
# probability=True is set on the SVCs because soft voting needs predict_proba.
clf5 = SVC(kernel='linear', probability=True, random_state=1)
clf6 = SVC(kernel='rbf', probability=True, random_state=1)
clf7 = SVC(kernel='poly', probability=True, random_state=1)
voter = VotingClassifier(
    estimators=[
        ('rf', clf1),
        ('gb', clf2),
        ('lr', clf3),
        ('gnb', clf4),
        ('svc1', clf5),
        ('svc2', clf6),
        ('svc3', clf7),
    ],
    voting='soft',
)

# eclf = scaling + voting ensemble as one estimator. It still exposes
# fit / predict / predict_proba, so later code can use it unchanged.
eclf = Pipeline(steps=[('scale', scaler), ('vote', voter)])

# Training and evaluating the model
eclf.fit(X_train, y_train)
y_pred = eclf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Visualizing the results
# fmt='d' prints the integer counts exactly; seaborn's default '.2g' format
# would render counts of three or more digits in rounded/scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# 10-fold cross-validation of the ensemble on the full X, y; the spread of the
# fold scores is shown as a box plot.
scores = cross_val_score(eclf, X, y, cv=10)
sns.boxplot(y=scores)
plt.title('Cross-validation scores')
plt.show()

# Getting the predicted probabilities
# Column 1 of predict_proba is the probability of the second class.
y_score = eclf.predict_proba(X_test)[:,1]

# Plotting the Precision-Recall curve for the ensemble model
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# Area under the precision-recall curve (recall on the x-axis).
pr_auc = auc(recall, precision)
plt.figure()
plt.plot(recall, precision, lw=2, label='Precision-Recall curve (AUC = %0.2f)' % pr_auc)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.legend()
plt.show()

print('Accuracy score:', accuracy)
print('Cross-validation scores:', scores)
