# Santander Customer Transaction Prediction - Random Forest Hyperparam Tuning

In the Kaggle competition, the objective is to identify which customer will make a transaction in the future.

**Link to the competition**: https://www.kaggle.com/c/santander-customer-transaction-prediction/  
**Type of Problem**: Classification  
**Metric for evalution**: AOC (Area Under Curve)

This Python 3 environment comes with many helpful analytics libraries installed
It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics

In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/santander-customer-transaction-prediction/sample_submission.csv
/kaggle/input/santander-customer-transaction-prediction/train.csv
/kaggle/input/santander-customer-transaction-prediction/test.csv


## Step 1: Import data from CSV

In [3]:
input_dir = '/kaggle/input/santander-customer-transaction-prediction/'
df_train = pd.read_csv(input_dir + "train.csv")
df_train

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.6910,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.3560,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,train_199995,0,11.4880,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.6660,-17.8661
199996,train_199996,0,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.5940,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
199997,train_199997,0,11.2232,-5.0518,10.5127,5.6456,9.3410,-5.4086,4.5555,21.5571,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
199998,train_199998,0,9.7148,-8.6098,13.6104,5.7930,12.5173,0.5339,6.0479,17.0152,...,2.6840,8.6587,2.7337,11.1178,20.4158,-0.0786,6.7980,10.0342,15.5289,-13.9001


In [4]:
var_columns =  [c for c in df_train.columns if c not in ['ID_code', 'target'] ]
X = df_train.loc[:, var_columns]
y = df_train.loc[:,'target']
X.shape, y.shape

((200000, 200), (200000,))

## Step2: RandomizedSearchCV
First, let us get the list of hyperparameters that can be tuned for a RandomForest Model

In [5]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

Specify the range of values for some important hyperparameters

In [6]:
n_estimators_list = list(range(10,220,50))
criterion_list = ['gini', 'entropy']
max_depth_list = list(range(5,41,10))
max_depth_list.append(None)
min_samples_split_list = [x/1000 for x in list(range(5, 41, 10))]
min_samples_leaf_list = [x/1000 for x in list(range(5, 41, 10))]
max_features_list = ['sqrt', 'log2']

params_grid = {
    'n_estimators': n_estimators_list,
    'criterion': criterion_list,
    'max_depth': max_depth_list,
    'min_samples_split': min_samples_split_list,
    'min_samples_leaf': min_samples_leaf_list,
    'max_features': max_features_list
}

num_combinations = 1
for k in params_grid.keys(): num_combinations *= len(params_grid[k])

print('Number of combinations = ', num_combinations)
params_grid

Number of combinations =  1600


{'n_estimators': [10, 60, 110, 160, 210],
 'criterion': ['gini', 'entropy'],
 'max_depth': [5, 15, 25, 35, None],
 'min_samples_split': [0.005, 0.015, 0.025, 0.035],
 'min_samples_leaf': [0.005, 0.015, 0.025, 0.035],
 'max_features': ['sqrt', 'log2']}

In [7]:
def my_roc_auc_score(model, X, y): return metrics.roc_auc_score(y, model.predict(X))

model_rf = RandomizedSearchCV(estimator=RandomForestClassifier(class_weight='balanced'),
                              param_distributions=params_grid,
                              n_iter=50,
                              cv=3,
                              scoring=my_roc_auc_score,
                              return_train_score=True,
                              verbose=2)

model_rf.fit(X,y)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END criterion=gini, max_depth=15, max_features=log2, min_samples_leaf=0.015, min_samples_split=0.015, n_estimators=110; total time= 1.6min
[CV] END criterion=gini, max_depth=15, max_features=log2, min_samples_leaf=0.015, min_samples_split=0.015, n_estimators=110; total time= 1.6min
[CV] END criterion=gini, max_depth=15, max_features=log2, min_samples_leaf=0.015, min_samples_split=0.015, n_estimators=110; total time= 1.6min
[CV] END criterion=entropy, max_depth=35, max_features=log2, min_samples_leaf=0.005, min_samples_split=0.035, n_estimators=160; total time= 4.0min
[CV] END criterion=entropy, max_depth=35, max_features=log2, min_samples_leaf=0.005, min_samples_split=0.035, n_estimators=160; total time= 4.0min
[CV] END criterion=entropy, max_depth=35, max_features=log2, min_samples_leaf=0.005, min_samples_split=0.035, n_estimators=160; total time= 4.0min
[CV] END criterion=entropy, max_depth=None, max_features=log2, mi

RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(class_weight='balanced'),
                   n_iter=50,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [5, 15, 25, 35, None],
                                        'max_features': ['sqrt', 'log2'],
                                        'min_samples_leaf': [0.005, 0.015,
                                                             0.025, 0.035],
                                        'min_samples_split': [0.005, 0.015,
                                                              0.025, 0.035],
                                        'n_estimators': [10, 60, 110, 160,
                                                         210]},
                   return_train_score=True,
                   scoring=<function my_roc_auc_score at 0x7f7127d44320>,
                   verbose=2)

In [8]:
model_rf.best_params_

{'n_estimators': 160,
 'min_samples_split': 0.005,
 'min_samples_leaf': 0.005,
 'max_features': 'log2',
 'max_depth': 35,
 'criterion': 'entropy'}

In [20]:
df_cv_results = pd.DataFrame(model_rf.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                           'param_n_estimators', 'param_min_samples_split','param_min_samples_leaf',
                           'param_max_features', 'param_max_depth','param_criterion']]
df_cv_results.sort_values('rank_test_score', inplace=True)
df_cv_results[:20]

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion
0,1,0.759822,0.801006,160,0.005,0.005,log2,35.0,entropy
13,2,0.758988,0.797269,160,0.005,0.005,log2,15.0,gini
1,3,0.753134,0.795035,60,0.005,0.005,log2,35.0,entropy
2,4,0.75245,0.77115,210,0.035,0.015,log2,35.0,entropy
14,5,0.752053,0.770428,210,0.015,0.015,log2,25.0,gini
15,6,0.751434,0.770535,160,0.025,0.015,log2,,gini
16,7,0.750773,0.772705,110,0.015,0.015,log2,15.0,gini
3,8,0.750565,0.774517,110,0.035,0.005,log2,,entropy
4,9,0.750454,0.773879,160,0.035,0.005,log2,35.0,entropy
17,10,0.749629,0.768607,110,0.015,0.015,log2,35.0,gini


In [10]:
output_dir = '/kaggle/working/'
df_cv_results.to_csv(output_dir + '/random_search.csv', index=False)

## Step3: GridSearchCV

In [6]:
n_estimators_list = [130,160,190]
criterion_list = ['gini']
max_depth_list = [35,55]
min_samples_split_list = [0.001, 0.005]
min_samples_leaf_list = [0.001, 0.005]
max_features_list = ['log2']

params_grid = {
    'n_estimators': n_estimators_list,
    'criterion': criterion_list,
    'max_depth': max_depth_list,
    'min_samples_split': min_samples_split_list,
    'min_samples_leaf': min_samples_leaf_list,
    'max_features': max_features_list
}

num_combinations = 1
for k in params_grid.keys(): num_combinations *= len(params_grid[k])

print('Number of combinations = ', num_combinations)
params_grid

Number of combinations =  24


{'n_estimators': [130, 160, 190],
 'criterion': ['gini'],
 'max_depth': [35, 55],
 'min_samples_split': [0.001, 0.005],
 'min_samples_leaf': [0.001, 0.005],
 'max_features': ['log2']}

In [7]:
def my_roc_auc_score(model, X, y): return metrics.roc_auc_score(y, model.predict(X))

model_rf = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'),
                        param_grid=params_grid,
                        cv=3,
                        scoring=my_roc_auc_score,
                        return_train_score=True,
                        verbose=4)

model_rf.fit(X,y)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END criterion=gini, max_depth=35, max_features=log2, min_samples_leaf=0.001, min_samples_split=0.001, n_estimators=130; total time= 3.2min
[CV 2/3] END criterion=gini, max_depth=35, max_features=log2, min_samples_leaf=0.001, min_samples_split=0.001, n_estimators=130; total time= 3.3min
[CV 3/3] END criterion=gini, max_depth=35, max_features=log2, min_samples_leaf=0.001, min_samples_split=0.001, n_estimators=130; total time= 3.5min
[CV 1/3] END criterion=gini, max_depth=35, max_features=log2, min_samples_leaf=0.001, min_samples_split=0.001, n_estimators=160; total time= 4.2min
[CV 2/3] END criterion=gini, max_depth=35, max_features=log2, min_samples_leaf=0.001, min_samples_split=0.001, n_estimators=160; total time= 4.2min
[CV 3/3] END criterion=gini, max_depth=35, max_features=log2, min_samples_leaf=0.001, min_samples_split=0.001, n_estimators=160; total time= 4.3min
[CV 1/3] END criterion=gini, max_depth=35, max_feat

GridSearchCV(cv=3, estimator=RandomForestClassifier(class_weight='balanced'),
             param_grid={'criterion': ['gini'], 'max_depth': [35, 55],
                         'max_features': ['log2'],
                         'min_samples_leaf': [0.001, 0.005],
                         'min_samples_split': [0.001, 0.005],
                         'n_estimators': [130, 160, 190]},
             return_train_score=True,
             scoring=<function my_roc_auc_score at 0x7f0c01d1ac20>, verbose=4)

In [8]:
df_cv_results = pd.DataFrame(model_rf.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                           'param_n_estimators', 'param_min_samples_split','param_min_samples_leaf',
                           'param_max_features', 'param_max_depth','param_criterion']]
df_cv_results.sort_values('rank_test_score', inplace=True)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion
23,1,0.763181,0.803893,190,0.005,0.005,log2,55,gini
19,2,0.762302,0.80177,160,0.001,0.005,log2,55,gini
11,3,0.761781,0.804111,190,0.005,0.005,log2,35,gini
20,4,0.761746,0.804467,190,0.001,0.005,log2,55,gini
21,5,0.761744,0.80258,130,0.005,0.005,log2,55,gini
22,6,0.761503,0.803613,160,0.005,0.005,log2,55,gini
7,7,0.76093,0.803012,160,0.001,0.005,log2,35,gini
5,8,0.760826,0.852796,190,0.005,0.001,log2,35,gini
8,9,0.760666,0.802979,190,0.001,0.005,log2,35,gini
6,10,0.760483,0.802994,130,0.001,0.005,log2,35,gini


In [9]:
output_dir = '/kaggle/working/'
df_cv_results.to_csv(output_dir + '/grid_search.csv', index=False)

In [10]:
model_rf.best_params_

{'criterion': 'gini',
 'max_depth': 55,
 'max_features': 'log2',
 'min_samples_leaf': 0.005,
 'min_samples_split': 0.005,
 'n_estimators': 190}

## Step4: Create the final model and evaluate the performance

In [11]:
model_rf_fin = RandomForestClassifier(class_weight='balanced',
                                      criterion='gini',
                                      max_depth=55,
                                      max_features='log2',
                                      min_samples_leaf=0.005,
                                      min_samples_split=0.005,
                                      n_estimators=190)
model_rf_fin.fit(X,y)

RandomForestClassifier(class_weight='balanced', max_depth=55,
                       max_features='log2', min_samples_leaf=0.005,
                       min_samples_split=0.005, n_estimators=190)

In [13]:
print('AUC Score = {:.4f}'.format(metrics.roc_auc_score(y, model_rf_fin.predict(X))))

AUC Score = 0.7877
