In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/Churn_Modelling.csv')

# Display the first few rows of the dataset to understand its structure
df.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['Geography', 'Gender'], drop_first=True)

# Drop unnecessary columns
X = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])
y = df['Exited']

df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,1,15634602,Hargrave,619,42,2,0.0,1,1,1,101348.88,1,False,False,False
1,2,15647311,Hill,608,41,1,83807.86,1,0,1,112542.58,0,False,True,False
2,3,15619304,Onio,502,42,8,159660.8,3,1,0,113931.57,1,False,False,False
3,4,15701354,Boni,699,39,1,0.0,2,0,0,93826.63,0,False,False,False
4,5,15737888,Mitchell,850,43,2,125510.82,1,1,1,79084.1,0,False,True,False


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the training and testing sets
rf_train_preds = rf_model.predict(X_train)
rf_test_preds = rf_model.predict(X_test)

# Calculate training and testing accuracy
rf_train_accuracy = accuracy_score(y_train, rf_train_preds)
rf_test_accuracy = accuracy_score(y_test, rf_test_preds)

print(f"Random Forest Training Accuracy: {rf_train_accuracy:.4f}")
print(f"Random Forest Testing Accuracy: {rf_test_accuracy:.4f}")


Random Forest Training Accuracy: 1.0000
Random Forest Testing Accuracy: 0.8670


In [6]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the training and testing sets
xgb_train_preds = xgb_model.predict(X_train)
xgb_test_preds = xgb_model.predict(X_test)

# Calculate training and testing accuracy
xgb_train_accuracy = accuracy_score(y_train, xgb_train_preds)
xgb_test_accuracy = accuracy_score(y_test, xgb_test_preds)

print(f"XGBoost Training Accuracy: {xgb_train_accuracy:.4f}")
print(f"XGBoost Testing Accuracy: {xgb_test_accuracy:.4f}")


Parameters: { "use_label_encoder" } are not used.



XGBoost Training Accuracy: 0.9585
XGBoost Testing Accuracy: 0.8640


In [10]:

###even if you tune your hyper parameters manually it still shows the same


from xgboost import XGBClassifier

# Initialize and train the XGBoost model with regularization and early stopping
xgb_model1 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42,
                          reg_lambda=2.0, reg_alpha=1.0, max_depth=2, min_child_weight=10,
                          subsample=0.8, colsample_bytree=0.8, learning_rate=0.01, n_estimators=500)

xgb_model1.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)





# Predict on the training and testing sets
xgb_train_preds1 = xgb_model.predict(X_train)
xgb_test_preds1 = xgb_model.predict(X_test)

# Calculate training and testing accuracy
xgb_train_accuracy = accuracy_score(y_train, xgb_train_preds1)
xgb_test_accuracy = accuracy_score(y_test, xgb_test_preds1)

print(f"XGBoost Training Accuracy: {xgb_train_accuracy:.4f}")
print(f"XGBoost Testing Accuracy: {xgb_test_accuracy:.4f}")


[0]	validation_0-logloss:0.49801
[1]	validation_0-logloss:0.49672
[2]	validation_0-logloss:0.49542
[3]	validation_0-logloss:0.49415
[4]	validation_0-logloss:0.49228
[5]	validation_0-logloss:0.49076
[6]	validation_0-logloss:0.48905
[7]	validation_0-logloss:0.48740
[8]	validation_0-logloss:0.48622
[9]	validation_0-logloss:0.48468
[10]	validation_0-logloss:0.48307
[11]	validation_0-logloss:0.48170
[12]	validation_0-logloss:0.48018
[13]	validation_0-logloss:0.47868
[14]	validation_0-logloss:0.47762
[15]	validation_0-logloss:0.47658
[16]	validation_0-logloss:0.47530
[17]	validation_0-logloss:0.47400
[18]	validation_0-logloss:0.47277
[19]	validation_0-logloss:0.47160
[20]	validation_0-logloss:0.47060
[21]	validation_0-logloss:0.46938
[22]	validation_0-logloss:0.46820
[23]	validation_0-logloss:0.46726
[24]	validation_0-logloss:0.46597
[25]	validation_0-logloss:0.46484
[26]	validation_0-logloss:0.46392
[27]	validation_0-logloss:0.46276
[28]	validation_0-logloss:0.46186
[29]	validation_0-loglos

Parameters: { "use_label_encoder" } are not used.



[42]	validation_0-logloss:0.44893
[43]	validation_0-logloss:0.44790
[44]	validation_0-logloss:0.44690
[45]	validation_0-logloss:0.44647
[46]	validation_0-logloss:0.44561
[47]	validation_0-logloss:0.44518
[48]	validation_0-logloss:0.44429
[49]	validation_0-logloss:0.44333
[50]	validation_0-logloss:0.44256
[51]	validation_0-logloss:0.44167
[52]	validation_0-logloss:0.44095
[53]	validation_0-logloss:0.44025
[54]	validation_0-logloss:0.43935
[55]	validation_0-logloss:0.43850
[56]	validation_0-logloss:0.43783
[57]	validation_0-logloss:0.43716
[58]	validation_0-logloss:0.43643
[59]	validation_0-logloss:0.43575
[60]	validation_0-logloss:0.43494
[61]	validation_0-logloss:0.43430
[62]	validation_0-logloss:0.43361
[63]	validation_0-logloss:0.43295
[64]	validation_0-logloss:0.43229
[65]	validation_0-logloss:0.43160
[66]	validation_0-logloss:0.43088
[67]	validation_0-logloss:0.43018
[68]	validation_0-logloss:0.42950
[69]	validation_0-logloss:0.42887
[70]	validation_0-logloss:0.42814
[71]	validatio

In [11]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the parameter grid to search
param_grid = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 5, 10],
    'reg_lambda': [1.0, 2.0, 3.0],
    'reg_alpha': [0.0, 0.5, 1.0],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200, 500],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize the XGBoost model
xgb_model2 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Set up Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters found by Grid Search
print("Best Hyperparameters:", grid_search.best_params_)

# Predict using the best model from grid search
best_xgb_model = grid_search.best_estimator_
best_xgb_train_preds = best_xgb_model.predict(X_train)
best_xgb_test_preds = best_xgb_model.predict(X_test)

# Calculate training and testing accuracy
best_xgb_train_accuracy = accuracy_score(y_train, best_xgb_train_preds)
best_xgb_test_accuracy = accuracy_score(y_test, best_xgb_test_preds)

print(f"Best XGBoost Training Accuracy: {best_xgb_train_accuracy:.4f}")
print(f"Best XGBoost Testing Accuracy: {best_xgb_test_accuracy:.4f}")


Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


KeyboardInterrupt: 