# feature_matrix_by_kageyama model 
## random and grid search parameter tuning


1. Run a random search for `RANDOM_MAX_EVALS` iterations.
2. Run a grid search (capped at `GRID_MAX_EVALS` evaluations) around the best parameters found by the random search.

In [2]:
# Number of random-search iterations (default of random_search's max_evals)
RANDOM_MAX_EVALS = 100
# Cap on grid-search evaluations (default of grid_search's max_evals)
GRID_MAX_EVALS = 50
# Number of cross-validation folds passed to lgb.cv in objective()
N_FOLDS = 5

In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold, train_test_split
from lightgbm import LGBMClassifier

import time
import gc
from tqdm import tqdm

import random
import lightgbm as lgb

In [4]:
# List the files available in the ../input directory
!ls ../input

HomeCredit_columns_description.csv  bureau_balance.csv
POS_CASH_balance.csv		    credit_card_balance.csv
application_test.csv		    installments_payments.csv
application_train.csv		    previous_application.csv
bureau.csv			    sample_submission.csv


In [5]:
# Do not truncate wide DataFrames: show every column when displaying
pd.options.display.max_columns = None

In [6]:
from sklearn.preprocessing import LabelEncoder

gc.enable()

# Load the pre-built feature matrix
df = pd.read_csv('./kageyama_feature_matrix_96.0.csv')

# Integer-encode every object-dtype column. The 'set' column is left as text
# because it is used below to select the training rows.
le = LabelEncoder()
categorical_feats = [f for f in df.columns if df[f].dtype == 'object']
for col in tqdm(categorical_feats):
    if col != 'set':
        # Cast to str first so missing values become an ordinary category
        df[col] = le.fit_transform(df[col].astype('str'))

# Keep only the training rows; the marker column is no longer needed
train_df = df[df['set'] == 'train'].drop(columns = 'set')
# test_df = df.loc[df['set'] == 'test'] .drop(['set', 'TARGET'], axis=1)

# Release the full matrix before building the LightGBM datasets
del df
gc.collect()

# Hold out 6000 rows for testing; the rest is used for cross-validated tuning
train_features, test_features, train_labels, test_labels = train_test_split(
    train_df.drop(columns = 'TARGET'),
    train_df['TARGET'],
    test_size = 6000,
    random_state = 50,
)

# Wrap both splits as LightGBM datasets
train_set = lgb.Dataset(data = train_features, label = train_labels)
test_set = lgb.Dataset(data = test_features, label = test_labels)

100%|██████████| 21/21 [00:20<00:00,  1.05it/s]


In [7]:
def objective(hyperparameters, iteration):
    """Cross-validate one set of LightGBM hyperparameters and report its AUC.

    Used by both the random search and the grid search.

    Parameters
    ----------
    hyperparameters : dict
        LightGBM parameters to evaluate. An 'n_estimators' entry, if present,
        is ignored because the number of boosting rounds is determined by
        early stopping. The dict passed in is not modified.
    iteration : int
        Identifier of this evaluation; returned unchanged so results can be
        traced back to the search step that produced them.

    Returns
    -------
    list
        ``[score, params, iteration]`` where ``score`` is the last entry of
        the cross-validated 'auc-mean' series, and ``params`` is a copy of
        ``hyperparameters`` with 'n_estimators' set to the length of that
        series.
    """
    # Work on a shallow copy so the caller's dict is never mutated
    params = dict(hyperparameters)

    # Number of estimators will be found using early stopping
    params.pop('n_estimators', None)

    # Perform N_FOLDS-fold cross validation on the module-level train_set
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = N_FOLDS,
                        early_stopping_rounds = 100, metrics = 'auc', seed = 42)

    # Results to return.
    # NOTE(review): this assumes lgb.cv truncates the history at the best
    # round when early stopping triggers -- confirm for the installed version.
    score = cv_results['auc-mean'][-1]
    estimators = len(cv_results['auc-mean'])
    params['n_estimators'] = estimators

    return [score, params, iteration]

In [8]:
def random_search(param_rand, max_evals = RANDOM_MAX_EVALS, seed = None):
    """Random search for hyperparameter optimization.

    Parameters
    ----------
    param_rand : dict
        Maps each hyperparameter name to a sequence (e.g. a list) of
        candidate values; one value per name is drawn for every evaluation.
    max_evals : int, optional
        Number of random configurations to evaluate.
    seed : int or None, optional
        When given, candidates are drawn from a dedicated
        ``random.Random(seed)`` so the search is reproducible. When None
        (the default) the module-level ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation with columns 'index' (the evaluation number
        before sorting), 'score', 'params' and 'iteration', sorted with the
        best score on top.
    """
    rng = random if seed is None else random.Random(seed)

    # Collect the records first and build the frame once at the end
    records = []

    # Keep searching until max_evals configurations have been evaluated
    for i in tqdm(range(max_evals)):

        # Choose random hyperparameters
        hyperparameters = {k: rng.sample(v, 1)[0] for k, v in param_rand.items()}

        # Row subsampling is forced to 1.0 for the 'goss' boosting type
        if hyperparameters.get('boosting_type') == 'goss':
            hyperparameters['subsample'] = 1.0

        # Evaluate randomly selected hyperparameters
        records.append(objective(hyperparameters, i))

    results = pd.DataFrame(records, columns = ['score', 'params', 'iteration'])

    # Sort with best score on top; reset_index keeps the old position as 'index'
    results = results.sort_values('score', ascending = False).reset_index()
    return results

In [11]:
# Hyperparameter grid
# Candidate values for the random search: each key maps to the list of values
# that random_search() draws from for that hyperparameter.
param_rand = {
#     'boosting_type': ['gbdt', 'goss', 'dart'],
    # Only gradient-boosted decision trees are searched
    'boosting_type': ['gbdt'],
    
    # 1000 log-spaced learning rates between 0.005 and 0.05
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.05), base = 10, num = 1000)),
    # Integers 16..127
    'num_leaves': list(range(16, 128)),
    # 10 evenly spaced column-sampling fractions in [0.6, 1]
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    # 100 evenly spaced row-sampling fractions in [0.5, 1]
    'subsample': list(np.linspace(0.5, 1, 100)),
    
    # np.linspace's default of 50 evenly spaced points in [0, 0.3]
    'reg_alpha': list(np.linspace(0, 0.3)),
    'reg_lambda': list(np.linspace(0, 0.3)),
    
    'min_split_gain': list(np.linspace(0, 0.3)),
    # Integers 16..127
    'min_child_weight': list(range(16, 128)),
    
    # Search dimensions that are currently disabled
#     'subsample_for_bin': list(range(20000, 300000, 20000)),
#     'min_child_samples': list(range(20, 500, 5)),
#     'is_unbalance': [True, False]
}

In [12]:
# Run the random search and report the elapsed wall-clock time.
# The recorded run (100 evaluations) took about 197,000 s, i.e. roughly 55 hours.
ra_st = time.time()
random_results = random_search(param_rand)
print("%f s" % (time.time() - ra_st))

100%|██████████| 100/100 [54:48:15<00:00, 1972.95s/it]  

197295.330423 s





In [13]:
# Show the random-search results (random_search returns them best score first)
random_results

Unnamed: 0,index,score,params,iteration
0,43,0.788759,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",43
1,38,0.788543,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",38
2,87,0.788478,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",87
3,82,0.788112,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",82
4,56,0.787994,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",56
5,96,0.787971,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",96
6,62,0.78791,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",62
7,12,0.787661,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",12
8,93,0.787656,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",93
9,17,0.787639,"{'boosting_type': 'gbdt', 'learning_rate': 0.0...",17


In [14]:
# Parameters of the top-ranked configuration: row label 0 is the best score
# because random_search sorts by score and then resets the index
param_rand_best = random_results['params'][0]

In [15]:
# Inspect the best parameter set found by the random search
param_rand_best

{'boosting_type': 'gbdt',
 'learning_rate': 0.008437178388636878,
 'num_leaves': 126,
 'colsample_bytree': 0.6444444444444444,
 'subsample': 0.5656565656565656,
 'reg_alpha': 0.2571428571428571,
 'reg_lambda': 0.11632653061224489,
 'min_split_gain': 0.17755102040816326,
 'min_child_weight': 88,
 'metric': 'auc',
 'verbose': 1,
 'n_estimators': 1785}

In [None]:
def build_param_grid(best):
    """Build a small grid of hyperparameter values centred on ``best``.

    Parameters
    ----------
    best : dict
        Best hyperparameters found by the random search. Must contain
        'boosting_type', 'num_leaves', 'learning_rate', 'reg_alpha',
        'reg_lambda', 'colsample_bytree' and 'subsample'. The keys
        'subsample_for_bin' and 'min_child_samples' are varied only when
        present (they are optional dimensions of the random search).

    Returns
    -------
    dict
        Maps each hyperparameter name to the list of values to try. Every
        remaining entry of ``best`` (except 'n_estimators', which is found
        by early stopping) is carried over as a single fixed value, so the
        grid really is explored around the best configuration instead of
        silently falling back to library defaults for untouched parameters.
    """
    def around(value, cap = 1.0):
        # Three evenly spaced points from 90% to 110% of value, with the
        # upper end capped at `cap`. Duplicates (e.g. when value is 0) are
        # dropped so the same configuration is not evaluated repeatedly.
        points = np.linspace(value * 0.9, min(value * 1.1, cap), 3)
        return list(dict.fromkeys(points))

    grid = {}
    grid['boosting_type'] = [best['boosting_type']]

    # Best value and +/-10 leaves; LightGBM requires num_leaves > 1
    leaves = best['num_leaves']
    grid['num_leaves'] = sorted({max(2, leaves - 10), leaves, leaves + 10})

    # Three log-spaced learning rates from 90% to 110% of the best one
    grid['learning_rate'] = list(np.logspace(np.log10(best['learning_rate'] * 0.9),
                                             np.log10(best['learning_rate'] * 1.1),
                                             base = 10, num = 3))

    # Optional dimensions: only varied when the random search tuned them
    if 'subsample_for_bin' in best:
        bins = best['subsample_for_bin']
        grid['subsample_for_bin'] = list(range(max(20000, bins - 20000), bins + 20001, 20000))
    if 'min_child_samples' in best:
        grid['min_child_samples'] = list(range(best['min_child_samples'], 500, 3))

    grid['reg_alpha'] = around(best['reg_alpha'])
    grid['reg_lambda'] = around(best['reg_lambda'])
    grid['colsample_bytree'] = around(best['colsample_bytree'])
    # Upper bound is 110% of the best value (capped at 1.0), like the others
    grid['subsample'] = around(best['subsample'])

    # Keep every other tuned parameter fixed at its best value
    for name, value in best.items():
        if name != 'n_estimators' and name not in grid:
            grid[name] = [value]

    return grid

# NOTE: itertools.product varies the last keys fastest, so when the number of
# evaluations is capped only the trailing dimensions of this grid are explored.
param_grid = build_param_grid(param_rand_best)

In [None]:
import itertools

def grid_search(param_grid, max_evals = GRID_MAX_EVALS):
    """Grid search algorithm (with limit on max evals).

    Parameters
    ----------
    param_grid : dict
        Maps each hyperparameter name to the list of values to try.
    max_evals : int, optional
        Maximum number of grid points to evaluate. Combinations are taken in
        ``itertools.product`` order (last key varies fastest) and the search
        stops after exactly ``max_evals`` of them.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated combination with columns 'index' (the
        evaluation number before sorting), 'score', 'params' and
        'iteration', sorted with the best score on top.
    """
    max_evals = max(0, max_evals)

    keys = list(param_grid)
    values = [param_grid[k] for k in keys]

    # Size of the full grid, so the progress bar can show a meaningful total
    n_combinations = 1
    for candidates in values:
        n_combinations *= len(candidates)
    n_evals = min(max_evals, n_combinations)

    # Collect the records first and build the frame once at the end; this
    # also avoids leftover empty rows when the grid is smaller than max_evals
    records = []

    # Iterate through the first max_evals combinations of hyperparameters
    combinations = itertools.islice(itertools.product(*values), max_evals)
    for i, v in enumerate(tqdm(combinations, total = n_evals)):

        # Create a hyperparameter dictionary
        hyperparameters = dict(zip(keys, v))

        # Row subsampling is forced to 1.0 for the 'goss' boosting type
        if hyperparameters.get('boosting_type') == 'goss':
            hyperparameters['subsample'] = 1.0

        # Evaluate the hyperparameters
        records.append(objective(hyperparameters, i))

    results = pd.DataFrame(records, columns = ['score', 'params', 'iteration'])

    # Sort with best score on top; reset_index keeps the old position as 'index'
    results = results.sort_values('score', ascending = False).reset_index()

    return results

In [None]:
# Run the grid search around the best random-search parameters and time it
gr_st = time.time()
grid_results = grid_search(param_grid)
print("grid search %f s" % (time.time() - gr_st))
# Row label 0 holds the best score because grid_search sorts by score
# (descending) and then resets the index
print('The best validation score was {:.5f}'.format(grid_results.loc[0, 'score']))
print('\nThe best hyperparameters were:')

# Pretty-print the parameter dict of the best row
import pprint
pprint.pprint(grid_results.loc[0, 'params'])