In [1]:
# Import the requried packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model, neighbors, tree, svm, ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_validate

import warnings

warnings.filterwarnings('ignore')

In [2]:
# Loading the data
bank = pd.read_csv('bank.csv')
print(bank.head())
# Encode the categorical data into numbers
bank_cat = bank.select_dtypes(['object']).copy()
for col in bank_cat:
    print(col, ':')
    codes, uniques = pd.factorize(bank_cat[col], sort=True)
    bank[col]=codes
    print(uniques)
print(bank.head())

# Sample a small subset of the data
bank = bank.sample(3000, random_state =5)
print('\n samples by target categories:')
print(bank.deposit.value_counts())
  
# Set the "deposit" as target/model output and the rest features as model inputs
y = bank['deposit']
X = bank.drop(['deposit'], axis=1)

# Split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

#Sacaling the data into a smaller range (-3 to +3)
mean = X_train.mean()
std = X_train.std()

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std


   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    5   may      1389         1     -1         0  unknown     yes  
3    5   may       579         1     -1         0  unknown     yes  
4    5   may       673         2     -1         0  unknown     yes  
job :
Index(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 

## Classification

In [4]:
#Random Forest Model
rf_clf = ensemble.RandomForestClassifier()
param_grid = {"n_estimators" : [10, 20, 30, 100, 1000],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [3, 9, 10, 27, 50],
              'max_features' : ['auto', 'sqrt', 'log2'] 
             }

gs_clf = GridSearchCV(rf_clf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
                                                                       
gs_clf.fit(X_train, y_train)

In [6]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)

-0.16375
{'criterion': 'entropy', 'max_depth': 50, 'max_features': 'auto', 'n_estimators': 1000}
RandomForestClassifier(criterion='entropy', max_depth=50, max_features='auto',
                       n_estimators=1000)


In [7]:
#Final Model
rf_clf = gs_clf.best_estimator_
rf_clf.fit(X_train, y_train)

print('train_acc:', rf_clf.score(X_train, y_train), 
      '\n test_acc:', rf_clf.score(X_test, y_test))

train_acc: 1.0 
 test_acc: 0.8316666666666667


## regression

In [8]:
#Random Forest Model
rf_reg = ensemble.RandomForestClassifier()
param_grid = {"n_estimators" : [10, 20, 30, 100, 1000],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [3, 9, 10, 27, 50],
              'max_features' : ['auto', 'sqrt', 'log2'] 
             }

gs_reg = GridSearchCV(rf_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
                                                                       
gs_reg.fit(X_train, y_train)

In [9]:
print(gs_reg.best_score_)
print(gs_reg.best_params_)
print(gs_reg.best_estimator_)

-0.16333333333333333
{'criterion': 'entropy', 'max_depth': 27, 'max_features': 'log2', 'n_estimators': 100}
RandomForestClassifier(criterion='entropy', max_depth=27, max_features='log2')


In [10]:
#Final Model
rf_reg = gs_reg.best_estimator_
rf_reg.fit(X_train, y_train)

print('train_mae:', mean_absolute_error(rf_reg.predict(X_train), y_train), 
      '\n test_mae:', mean_absolute_error(rf_reg.predict(X_test), y_test))

train_mae: 0.0 
 test_mae: 0.16833333333333333
