# Load File

In [1]:
import pandas as pd
#Read the CSV file 'Churn_Modelling.csv' into a DataFrame named 'df' 
df = pd.read_csv('/Users/laiminyun/CIS 508/Group Project/Churn_Modelling.csv')

In [2]:
# Show the first few rows to understand the structure and content
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,$0.00,1,1,1,"$101,348.88",1
1,2,15647311,Hill,608,Spain,Female,41,1,"$83,807.86",1,0,1,"$112,542.58",0
2,3,15619304,Onio,502,France,Female,42,8,"$159,660.80",3,1,0,"$113,931.57",1
3,4,15701354,Boni,699,France,Female,39,1,$0.00,2,0,0,"$93,826.63",0
4,5,15737888,Mitchell,850,Spain,Female,43,2,"$125,510.82",1,1,1,"$79,084.10",0


In [3]:
# Drop rows where the target variable or any of the specified columns have missing values
df = df.dropna()
df.shape

(10000, 14)

# Convert categorical variables to numerical variables ( 0 and 1)

In [4]:
# Geography column
unique_Geography = df['Geography'].unique()
df = pd.get_dummies(df, columns=['Geography'], prefix='Geography')

In [5]:
# EstimatedSalary Column
# Remove currency symbols and commas
df['EstimatedSalary'] = df['EstimatedSalary'].str.replace('[\$,]', '', regex=True)

# Convert the column to a numeric type
df['EstimatedSalary'] = pd.to_numeric(df['EstimatedSalary'])

In [6]:
# Balance Column
# Remove currency symbols and commas
df['Balance'] = df['Balance'].str.replace('[\$,]', '', regex=True)

# Convert the column to a numeric type
df['Balance'] = pd.to_numeric(df['Balance'])

In [7]:
# Assume 'RowNumber','CustomerId', 'Surname','Gender' are irrelevant features
df.drop(['RowNumber','CustomerId', 'Surname','Gender'], axis=1, inplace=True)

In [8]:
# Separating the features and the target variable
X = df.drop('Exited', axis=1)  # Features
y = df['Exited']  # Target

# Modeling

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Scaling the features
scaler = StandardScaler()


# Separate features and target variable for training and test sets
X_train = df.drop(columns=['Exited'])
y_train = df['Exited']

X_test = df.drop(columns=['Exited'])
y_test = df['Exited']

# Manual Tuning

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Example of manual tuning

# Define your range of hyperparameters
hidden_layer_sizes = [(50,), (100,), (50, 50), (100, 100)]
learning_rates = [0.01, 0.001, 0.0001]

best_accuracy = 0
best_params = {}

for size in hidden_layer_sizes:
    for lr in learning_rates:
        model = MLPClassifier(hidden_layer_sizes=size, learning_rate_init=lr, max_iter=100)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'hidden_layer_sizes': size, 'learning_rate_init': lr}

print("Best Accuracy:", best_accuracy)
print("Best Parameters:", best_params)




Best Accuracy: 0.7963
Best Parameters: {'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.01}


In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Define your range of hyperparameters
hidden_layer_sizes = [(50,), (100,), (50, 50), (100, 100)]
learning_rates = [0.01, 0.001, 0.0001]

best_accuracy = 0
best_precision = [0, 0]  # For class 0 and class 1
best_recall = [0, 0]  # For class 0 and class 1
best_f1_score = [0, 0]  # For class 0 and class 1
best_params = {}

for size in hidden_layer_sizes:
    for lr in learning_rates:
        model = MLPClassifier(hidden_layer_sizes=size, learning_rate_init=lr, max_iter=100)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        accuracy = accuracy_score(y_test, predictions)
        precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, predictions)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_precision = precision
            best_recall = recall
            best_f1_score = f1_score
            best_params = {'hidden_layer_sizes': size, 'learning_rate_init': lr}

print("Best Accuracy:", best_accuracy)
print("Best Precision for Class 0:", best_precision[0], "Class 1:", best_precision[1])
print("Best Recall for Class 0:", best_recall[0], "Class 1:", best_recall[1])
print("Best F1-Score for Class 0:", best_f1_score[0], "Class 1:", best_f1_score[1])
print("Best Parameters:", best_params)


  _warn_prf(average, modifier, msg_start, len(result))


Best Accuracy: 0.7963
Best Precision for Class 0: 0.7963 Class 1: 0.0
Best Recall for Class 0: 1.0 Class 1: 0.0
Best F1-Score for Class 0: 0.8866002338139509 Class 1: 0.0
Best Parameters: {'hidden_layer_sizes': (100, 100), 'learning_rate_init': 0.01}


# Automated Hyperparameter Tuning

In [15]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'learning_rate_init': [0.01, 0.001, 0.0001],
}

# Initialize the grid search
grid_search = GridSearchCV(MLPClassifier(max_iter=100), param_grid, cv=5, scoring='accuracy')

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters and best accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)




Best Parameters: {'hidden_layer_sizes': (100, 100), 'learning_rate_init': 0.01}
Best Accuracy: 0.7962


# RandomizedSearch

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Define parameter distribution
param_dist = {
    'hidden_layer_sizes': sp_randint(50, 200),
    'learning_rate_init': [0.01, 0.001, 0.0001],
}

# Initialize the randomized search
random_search = RandomizedSearchCV(MLPClassifier(max_iter=100), param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy')

# Fit randomized search
random_search.fit(X_train, y_train)

# Best parameters and best accuracy
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)




Best Parameters: {'hidden_layer_sizes': 117, 'learning_rate_init': 0.01}
Best Accuracy: 0.7947


1. Manual Tuning <br/>
Approach: You manually change hyperparameters and observe the impact on model performance. This process is repeated iteratively.<br/>

Pros:<br/>
Full control over the tuning process.<br/>
Good for understanding the impact of each hyperparameter.<br/>
Cons:<br/>
Time-consuming and labor-intensive.<br/>
Risk of missing the optimal combination due to limited exploration.<br/>
<br/>
2. GridSearchCV<br/>
Approach: Performs an exhaustive search over a specified parameter grid. It trains the model for each combination of parameters and evaluates using cross-validation.<br/>

Pros:<br/>
Comprehensive: tests every possible combination in the grid, ensuring that the best combination is found.<br/>
Automated: once set up, it runs without the need for manual intervention.<br/>
Cons:<br/>
Computationally expensive, especially with large grids or complex models.<br/>
The grid needs to be well-defined, which might require prior knowledge or assumptions about which values are likely to be effective.<br/>
<br/>
3. RandomizedSearchCV<br/>
Approach: Samples a given number of candidates from a parameter space with a specified distribution. Like GridSearch, it uses cross-validation for evaluation.<br/>

Pros:<br/>
More efficient than GridSearchCV as it doesn’t try all possible combinations, but rather a fixed number of parameter settings from the distributions specified.<br/>
Can reach close to the best combination with significantly less computation.<br/>
Useful when there are many hyperparameters and the best combination is unknown.<br/>
Cons:<br/>
Less comprehensive than GridSearchCV as it might miss the optimal parameters.<br/>
The choice of the distribution for parameters and the number of iterations can influence the effectiveness of the search.<br/>
<br/>
Summary<br/>
Manual Tuning is great for a deep understanding of each parameter's effect but is less efficient and systematic.
GridSearchCV is the most comprehensive but can be very slow and computationally expensive.
RandomizedSearchCV provides a good balance between thoroughness and computational efficiency, making it suitable for initial exploration, especially when the hyperparameter space is large.

# Summary