In [1]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics import f1_score
#from catboost import CatBoostClassifier
import catboost as cb
import lightgbm as lgb
import hyperopt
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
from tsfresh.feature_extraction import feature_calculators
import pickle



In [2]:
# Absolute paths of the input CSVs (under /kaggle/input); both are loaded with
# pd.read_csv in the training cell: one holds the feature table, the other the labels.
TRAINING_DATA_FILE = '/kaggle/input/challenge-data-analysis/all_training_data.csv'
TRAINING_LABEL_FILE = '/kaggle/input/challenge-data-analysis/all_training_labels.csv'

In [3]:
def _clamp_rate(rate, low=0, high=0.5):
    """Clip a GOSS sampling rate (top_rate / other_rate) into [low, high]."""
    return min(max(rate, low), high)


def lgb_hyperopt(data, labels, num_evals=1000, n_folds=6, diagnostic=False):
    """Tune LightGBM hyper-parameters with hyperopt's TPE search.

    Each trial runs stratified ``lgb.cv`` on ``(data, labels)`` and is scored
    by the best mean cross-validated F1 (negated, because hyperopt minimises).

    Parameters
    ----------
    data : features accepted by ``lgb.Dataset``.
    labels : labels accepted by ``lgb.Dataset``; the custom metric treats
        them as binary 0/1 values.
    num_evals : int
        Number of hyperopt trials (``max_evals`` of ``fmin``).
    n_folds : int
        Number of cross-validation folds passed to ``lgb.cv``.
    diagnostic : bool
        When true, also return the hyperopt ``Trials`` object.

    Returns
    -------
    dict, or (dict, Trials) when ``diagnostic`` is true
        Best parameters keyed by hyperopt label, with ``hp.choice`` indices
        resolved to their actual values and integer parameters cast to int.
    """

    LGBM_MAX_LEAVES = 2**11 #maximum number of leaves per tree for LightGBM
    LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM

    def lgb_f1_score(y_hat, data):
        """Custom LightGBM eval: F1 computed with label 0 as the positive class.

        Both the labels and the rounded predictions are flipped (1 - x) before
        scoring, so the reported F1 is that of the original 0 class.
        Returns the (name, value, is_higher_better) triple LightGBM expects.
        """
        y_true = 1 - data.get_label()
        y_hat = 1 - np.round(y_hat) # scikits f1 doesn't like probabilities
        return 'f1', f1_score(y_true, y_hat), True

    print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))

    #hp.quniform yields floats; these parameters must be handed to LightGBM as ints
    integer_params = ['max_depth',
                      'num_leaves',
                      'max_bin',
                      'min_data_in_leaf',
                      'min_data_in_bin']

    def objective(space_params):
        """Evaluate one sampled configuration; returns the hyperopt result dict."""
        #work on a copy so the dict handed over by hyperopt is left untouched
        params = dict(space_params)

        #cast integer params from float to int
        for param in integer_params:
            params[param] = int(params[param])

        #flatten the nested conditional 'boosting' entry into top-level params
        boosting_cfg = params['boosting']
        if boosting_cfg['boosting'] == 'goss':
            #0 <= top_rate + other_rate <= 1, guaranteed by capping each at 0.5
            params['top_rate'] = _clamp_rate(boosting_cfg.get('top_rate'))
            params['other_rate'] = _clamp_rate(boosting_cfg.get('other_rate'))
        params['boosting'] = boosting_cfg['boosting']
        # NOTE(review): LightGBM documents `subsample` as an alias of
        # `bagging_fraction`, which is also part of the search space -- confirm
        # which of the two is meant to take effect.
        params['subsample'] = boosting_cfg.get('subsample', 1.0)

        # NOTE(review): num_boost_round is left at lgb.cv's default; if that
        # default is not larger than 100, early_stopping_rounds=100 can never
        # trigger -- confirm this is intended.
        cv_results = lgb.cv(params, train, nfold = n_folds, stratified=True,
                            early_stopping_rounds=100, seed=42, feval=lgb_f1_score)

        #hyperopt minimises, so negate the best mean F1 across boosting rounds
        best_loss = -max(cv_results['f1-mean'])
        return {'loss': best_loss, 'status': STATUS_OK}

    train = lgb.Dataset(data, labels)

    #conditional sub-spaces, selected with hp.choice()
    boosting_list = [{'boosting': 'gbdt',
                      'subsample': hp.uniform('subsample', 0.5, 1)},
                     {'boosting': 'goss',
                      'subsample': 1.0,
                      'top_rate': hp.uniform('top_rate', 0, 0.5),
                      'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'

    objective_list = ['binary', 'cross_entropy']
    is_unbalance_list = [True]

    space = {'boosting' : hp.choice('boosting', boosting_list),
             'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
             'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
             'max_bin': hp.quniform('max_bin', 32, 255, 1),
             'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
             'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
             'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
             'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
             'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
             'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
             'metric' : None,
             'objective' : hp.choice('objective', objective_list),
             'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
             'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01),
             'is_unbalance' : hp.choice('is_unbalance', is_unbalance_list)
        }

    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=num_evals,
                trials=trials)

    #fmin() returns a flat dict keyed by hyperopt label; for hp.choice() entries
    #the value is the INDEX of the chosen option, so map indices back to values.
    #Conditional parameters (top_rate/other_rate or subsample) appear at the top
    #level of `best` only when their branch was selected.
    boosting_choice = boosting_list[best['boosting']]
    if boosting_choice['boosting'] == 'goss':
        #0 <= top_rate + other_rate <= 1
        best['top_rate'] = _clamp_rate(best['top_rate'])
        best['other_rate'] = _clamp_rate(best['other_rate'])
    best['boosting'] = boosting_choice['boosting']
    best['objective'] = objective_list[best['objective']]
    best['is_unbalance'] = is_unbalance_list[best['is_unbalance']]

    #cast floats of integer params to int
    for param in integer_params:
        best[param] = int(best[param])

    print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
    if diagnostic:
        return best, trials
    return best

In [4]:
# Load the feature table and shuffle its rows with a fixed seed.
training_data = (
    pd.read_csv(TRAINING_DATA_FILE)
    .sample(frac=1, random_state=25)
    .reset_index(drop=True)
)

# Rewrite characters in column names: '[' -> '(', ']' -> ')', ':' -> '_', ',' -> ' '.
# Presumably needed because LightGBM rejects such characters in feature names -- confirm.
column_list = [
    name.translate(str.maketrans({'[': '(', ']': ')', ':': '_', ',': ' '}))
    for name in training_data.columns
]
training_data.columns = column_list
print(training_data.columns)

# Labels are shuffled with the same seed as the features; the two frames stay
# row-aligned only if both files have the same number of rows.
training_labels = (
    pd.read_csv(TRAINING_LABEL_FILE)
    .sample(frac=1, random_state=25)
    .reset_index(drop=True)
)

# Run the hyper-parameter search and keep the Trials object for later inspection.
lgb_best, lgb_trials = lgb_hyperopt(training_data, training_labels,
                                    num_evals=1000, diagnostic=True)
print("lgb best params:")
print(lgb_best)

# Persist the results: np.save stores the dict as a pickled object array,
# and the Trials object is pickled to its own file.
np.save('lgb_best_params.npy', lgb_best)
with open('lgb_trials', "wb") as trials_file:
    pickle.dump(lgb_trials, trials_file)

Index(['Challenge Stats Project Category Name_Code',
       'Challenge Stats Project Category Name_Application Front-End Design',
       'Challenge Stats Project Category Name_Print/Presentation',
       'Challenge Stats Project Category Name_Marathon Match',
       'Challenge Stats Project Category Name_UI Prototype Competition',
       'Challenge Stats Project Category Name_Web Design',
       'Challenge Stats Project Category Name_Widget or Mobile Screen Design',
       'Challenge Stats Project Category Name_Assembly Competition',
       'Challenge Stats Project Category Name_Wireframes',
       'Challenge Stats Project Category Name_Idea Generation',
       ...
       'User Member Since Date Days from 2001_(3030.9666666666667 3490.5)',
       'User Member Since Date Days from 2001_(3490.5 3950.0333333333338)',
       'User Member Since Date Days from 2001_(3950.0333333333338 4409.566666666667)',
       'User Member Since Date Days from 2001_(4409.566666666667 4869.1)',
       'User