In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Loading the dataset
data = pd.read_csv('data/bank-additional-full-preprocessed.csv')
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [19]:
from sklearn.preprocessing import LabelEncoder

cols = data.select_dtypes("object").columns

le = LabelEncoder()

data[cols] = data[cols].apply(le.fit_transform)

In [20]:
# Get Target data
y = data['y']

# Load X variable in to the dataframe
X = data.drop(['y'], axis=1)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)

__Performing grid search for optimal Hyper-parameter tuning__

In [28]:
import numpy as np

# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num=10)]

# No of features to consider at each split
max_features = ['sqrt']

# Max no of levels in a tree
max_depth = [2,4]

# Minimum no of samples required to split a node
min_samples_split = [2,5]

# Minimum no of samples required at each leaf node
min_samples_leaf = [1,2]

# Method of selecting samples for training each tree
bootstrap = [True,False]

In [29]:
# Creating param grid
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

print(param_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [30]:
from sklearn.model_selection import GridSearchCV

# building the model
model = RandomForestClassifier()

# grid search
Grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2, n_jobs=4)

# fitting the model
Grid.fit(X_train, y_train)

Fitting 3 folds for each of 160 candidates, totalling 480 fits


In [32]:
# Checking for the best parameters
Grid.best_params_

{'bootstrap': False,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 60}

In [33]:
# building the model based on optimal parameters
model = RandomForestClassifier(n_estimators = 60,min_samples_split=5,min_samples_leaf=1,max_features='sqrt',max_depth=4,bootstrap=False)

# fitting the model
model.fit(X_train, y_train)

__Check Accuracy__

In [37]:
print(f"Train Accuracy - : {model.score(X_train, y_train)}")
print(f"Test Accuracy - : {model.score(X_test, y_test)}")

Train Accuracy - : 0.8994485102840692
Test Accuracy - : 0.9002184996358339
