In [None]:
# https://sadanand-singh.github.io/posts/treebasedmodels/
# https://sadanand-singh.github.io/posts/boostedtrees/

In [1]:
# Reload imported modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2

# Show the value of every bare expression in a cell, not only the last one.
# Several cells below rely on this to display more than one result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Locations of the training and test CSV files, relative to the notebook.
PATH_DATA = './data'
PATH_TRAIN = f'{PATH_DATA}/adult-training.csv'
PATH_TEST = f'{PATH_DATA}/adult-test.csv'

# Preview the first lines and the line count of each file with shell commands.
!head {PATH_TRAIN}
!wc -l {PATH_TRAIN}
!head {PATH_TEST}
!wc -l {PATH_TEST}

39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K
28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K
37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K
49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K
31, 

In [5]:
# Column names supplied to read_csv via `names=` for both files.
columns = [
    'Age', 'Workclass', 'fnlgwt', 'Education', 'EdNum',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex',
    'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'Country', 'Income',
]

# Read both files and discard the 'fnlgwt' column straight away.
# The first line of the test file is skipped (skiprows=1).
df_train = pd.read_csv(PATH_TRAIN, names=columns).drop(columns=['fnlgwt'])
df_test = pd.read_csv(PATH_TEST, names=columns, skiprows=1).drop(columns=['fnlgwt'])

In [6]:
# Quick look at the raw training frame: first rows, dtypes/null counts, numeric summary.
df_train.head()
df_train.info()
df_train.describe()

Unnamed: 0,Age,Workclass,Education,EdNum,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,Country,Income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
Age              32561 non-null int64
Workclass        32561 non-null object
Education        32561 non-null object
EdNum            32561 non-null int64
MaritalStatus    32561 non-null object
Occupation       32561 non-null object
Relationship     32561 non-null object
Race             32561 non-null object
Sex              32561 non-null object
CapitalGain      32561 non-null int64
CapitalLoss      32561 non-null int64
HoursPerWeek     32561 non-null int64
Country          32561 non-null object
Income           32561 non-null object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


Unnamed: 0,Age,EdNum,CapitalGain,CapitalLoss,HoursPerWeek
count,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,10.080679,1077.648844,87.30383,40.437456
std,13.640433,2.57272,7385.292085,402.960219,12.347429
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [7]:
def df_fix(df):
    """Return a cleaned, binned copy of a census frame; the input is not modified.

    Steps, in order:

    1. In every non-numeric column, remove all spaces and all ``.`` characters
       from each value, then map the missing-value marker ``?`` to ``'Unknown'``.
       Missing values (NaN/None) are passed through unchanged.
    2. Drop the ``Country`` and the original ``Education`` columns.
    3. Bin ``Age`` into ten-year groups labelled ``'0-9'`` ... ``'90-99'``
       (column ``AgeGroup``) and ``EdNum`` into five-wide groups labelled
       ``'0-4'`` ... ``'15-19'`` (new column ``Education``). Bins are closed on
       the left, so 30 falls in ``'30-39'``; values outside the bins become NaN.
    4. Drop ``Age`` and ``EdNum``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``EdNum``, ``Country`` and ``Education``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``AgeGroup`` and ``Education`` as categorical columns.

    Raises
    ------
    KeyError
        If one of the columns to drop is absent (for example when the function
        is applied a second time to its own output).
    """
    df = df.copy()
    for c in df.columns:
        # Skip every numeric dtype, not only int64: integer columns may be
        # int32 on some platforms and would otherwise hit the string cleaning.
        if pd.api.types.is_numeric_dtype(df[c]):
            continue
        # Vectorised string ops leave NaN untouched instead of raising.
        cleaned = (
            df[c]
            .str.replace(' ', '', regex=False)
            .str.replace('.', '', regex=False)
        )
        # After stripping, the marker is a bare '?' whether or not the raw
        # value carried a leading space.
        df[c] = cleaned.replace('?', 'Unknown')

    # remove columns
    df = df.drop(columns=['Country', 'Education'])

    # bin Age to AgeGroup
    labels = ["{0}-{1}".format(i, i + 9) for i in range(0, 100, 10)]
    df['AgeGroup'] = pd.cut(df.Age, range(0, 101, 10), right=False, labels=labels)

    # bin EdNum to Education
    labels = ["{0}-{1}".format(i, i + 4) for i in range(0, 20, 5)]
    df['Education'] = pd.cut(df.EdNum, range(0, 21, 5), right=False, labels=labels)

    # remove columns
    return df.drop(columns=['Age', 'EdNum'])

# Clean and bin both frames.
# NOTE(review): both names are rebound to their cleaned versions, so re-running
# this cell passes already-cleaned frames to df_fix, which then fails when it
# tries to drop 'Country' again. Re-run the loading cell first.
df_train = df_fix(df_train)
df_test = df_fix(df_test)

# Class balance of the target in each split.
df_train.Income.value_counts()
df_test.Income.value_counts()

<=50K    24720
>50K      7841
Name: Income, dtype: int64

<=50K    12435
>50K      3846
Name: Income, dtype: int64

In [8]:
# First rows of both frames after cleaning and binning.
df_train.head()
df_test.head()

Unnamed: 0,Workclass,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,Income,AgeGroup,Education
0,State-gov,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,<=50K,30-39,10-14
1,Self-emp-not-inc,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,<=50K,50-59,10-14
2,Private,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,<=50K,30-39,5-9
3,Private,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K,50-59,5-9
4,Private,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,<=50K,20-29,10-14


Unnamed: 0,Workclass,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,Income,AgeGroup,Education
0,Private,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,<=50K,20-29,5-9
1,Private,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,<=50K,30-39,5-9
2,Local-gov,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,>50K,20-29,10-14
3,Private,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,>50K,40-49,10-14
4,Unknown,Never-married,Unknown,Own-child,White,Female,0,0,30,<=50K,10-19,10-14


In [9]:
# Column names, column count and dtypes of the cleaned training frame.
df_train.columns
len(df_train.columns)
df_train.info()

Index(['Workclass', 'MaritalStatus', 'Occupation', 'Relationship', 'Race',
       'Sex', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'Income',
       'AgeGroup', 'Education'],
      dtype='object')

12

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
Workclass        32561 non-null object
MaritalStatus    32561 non-null object
Occupation       32561 non-null object
Relationship     32561 non-null object
Race             32561 non-null object
Sex              32561 non-null object
CapitalGain      32561 non-null int64
CapitalLoss      32561 non-null int64
HoursPerWeek     32561 non-null int64
Income           32561 non-null object
AgeGroup         32561 non-null category
Education        32561 non-null category
dtypes: category(2), int64(3), object(7)
memory usage: 2.5+ MB


In [10]:
# Columns to label-encode. The target 'Income' is listed as well so that it is
# encoded together with the features; it is removed from this list in a later
# cell, before the list is passed to LightGBM as `categorical_feature`.
categorical = [
    'AgeGroup',
    'Education',
    'Workclass',
    'MaritalStatus',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Income'
]

# def to_label(dataframe, categorical=None):
#     from sklearn.preprocessing import LabelEncoder
#     df = dataframe if categorical is None else dataframe[categorical]
#     return df.apply(LabelEncoder().fit_transform)

def to_categorical(dataframe, columns, encoders=None):
    """Return a copy of ``dataframe`` with the listed columns label-encoded.

    Every column of ``dataframe`` whose name appears in ``columns`` is replaced
    by the integer codes produced by ``sklearn.preprocessing.LabelEncoder``,
    stored as ``uint16``. Names in ``columns`` that are not in the frame are
    ignored, other columns are left as they are, and the input frame itself is
    not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to encode.
    columns : container of str
        Names of the columns to encode.
    encoders : dict, optional
        Mapping ``column name -> fitted LabelEncoder``. When given, a column
        already present in the mapping is encoded with the stored encoder
        (``transform`` only); a column not yet present is fitted here and its
        encoder is stored in the mapping. Passing the same dict for the
        training frame first and the test frame second guarantees both frames
        share one label-to-code mapping. When omitted, every call fits fresh
        encoders, so codes from two separate calls are only comparable if both
        frames contain exactly the same labels.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``dataframe``.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a stored encoder meets a
        label it was not fitted on.
    """
    from sklearn.preprocessing import LabelEncoder

    df = dataframe.copy()
    for c in df.columns:
        if c not in columns:
            continue
        if encoders is None:
            # One fresh encoder per column; nothing is kept after the call.
            codes = LabelEncoder().fit_transform(df[c])
        elif c in encoders:
            # Reuse the mapping learned on an earlier frame.
            codes = encoders[c].transform(df[c])
        else:
            encoders[c] = LabelEncoder()
            codes = encoders[c].fit_transform(df[c])
        df[c] = codes.astype('uint16')
    return df

# to_label(df_train, categorical)

In [11]:
# df_train0 = df_train
# df_train = df_train0
# Label-encode the categorical columns (and the target) of the training frame.
df_train = to_categorical(df_train, categorical)

In [12]:
# Encoded training frame: first ten rows and resulting dtypes.
df_train.head(10)
df_train.info()

Unnamed: 0,Workclass,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,Income,AgeGroup,Education
0,6,4,0,1,4,1,2174,0,40,0,2,1
1,5,2,3,0,4,1,0,0,13,0,4,1
2,3,0,5,1,4,1,0,0,40,0,2,3
3,3,2,5,0,2,1,0,0,40,0,4,3
4,3,2,9,5,2,0,0,0,40,0,1,1
5,3,2,3,5,4,0,0,0,40,0,2,1
6,3,3,7,1,2,0,0,0,16,0,3,3
7,5,2,3,0,4,1,0,0,45,1,4,3
8,3,4,9,1,4,0,14084,0,50,1,2,1
9,3,2,3,0,4,1,5178,0,40,1,3,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
Workclass        32561 non-null uint16
MaritalStatus    32561 non-null uint16
Occupation       32561 non-null uint16
Relationship     32561 non-null uint16
Race             32561 non-null uint16
Sex              32561 non-null uint16
CapitalGain      32561 non-null int64
CapitalLoss      32561 non-null int64
HoursPerWeek     32561 non-null int64
Income           32561 non-null uint16
AgeGroup         32561 non-null uint16
Education        32561 non-null uint16
dtypes: int64(3), uint16(9)
memory usage: 1.3 MB


In [13]:
# df_test0 = df_test
# df_test = df_test0
# Label-encode the test frame.
# NOTE(review): this call fits its own encoders, independently of the training
# frame, so a given integer code means the same label in both frames only if
# both contain exactly the same set of labels in every column. Verify.
df_test = to_categorical(df_test, categorical)

In [14]:
# Encoded test frame: first ten rows and resulting dtypes.
df_test.head(10)
df_test.info()

Unnamed: 0,Workclass,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,Income,AgeGroup,Education
0,3,4,6,3,2,1,0,0,40,0,1,3
1,3,2,4,0,4,1,0,0,50,0,2,3
2,1,2,10,0,4,1,0,0,40,1,1,1
3,3,2,6,0,2,1,7688,0,40,1,3,1
4,7,4,14,3,4,0,0,0,30,0,0,1
5,3,4,7,1,4,1,0,0,30,0,2,3
6,7,4,14,4,2,1,0,0,40,0,1,3
7,5,2,9,0,4,1,3103,0,32,1,5,2
8,3,4,7,4,4,0,0,0,40,0,1,1
9,3,2,2,0,4,1,0,0,10,0,4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 12 columns):
Workclass        16281 non-null uint16
MaritalStatus    16281 non-null uint16
Occupation       16281 non-null uint16
Relationship     16281 non-null uint16
Race             16281 non-null uint16
Sex              16281 non-null uint16
CapitalGain      16281 non-null int64
CapitalLoss      16281 non-null int64
HoursPerWeek     16281 non-null int64
Income           16281 non-null uint16
AgeGroup         16281 non-null uint16
Education        16281 non-null uint16
dtypes: int64(3), uint16(9)
memory usage: 667.9 KB


In [15]:
# Target column name, and the feature names: every remaining column, in frame order.
y_name = 'Income'
x_names = df_train.columns.drop(y_name).tolist()

y_name
x_names

'Income'

['Workclass',
 'MaritalStatus',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'CapitalGain',
 'CapitalLoss',
 'HoursPerWeek',
 'AgeGroup',
 'Education']

In [16]:
# Split each frame into a target vector and a feature matrix (NumPy arrays).
# Feature columns keep the frame order, i.e. the order of `x_names`.
y_train = df_train[y_name].values
x_train = df_train.drop(y_name, axis=1).values

y_test = df_test[y_name].values
x_test = df_test.drop(y_name, axis=1).values

In [17]:
# Remove the target from the list of categorical columns so that the list names
# features only. The list is rebuilt instead of popped from: popping raises
# ValueError when this cell is run a second time, because the name is already gone.
categorical = [name for name in categorical if name != y_name]
categorical

'Income'

['AgeGroup',
 'Education',
 'Workclass',
 'MaritalStatus',
 'Occupation',
 'Relationship',
 'Race',
 'Sex']

In [18]:
# Display the four arrays as a sanity check.
x_train
y_train
x_test
y_test

array([[ 6,  4,  0, ..., 40,  2,  1],
       [ 5,  2,  3, ..., 13,  4,  1],
       [ 3,  0,  5, ..., 40,  2,  3],
       ...,
       [ 3,  6,  0, ..., 40,  4,  3],
       [ 3,  4,  0, ..., 20,  1,  3],
       [ 4,  2,  3, ..., 40,  4,  3]])

array([0, 0, 0, ..., 0, 0, 1], dtype=uint16)

array([[ 3,  4,  6, ..., 40,  1,  3],
       [ 3,  2,  4, ..., 50,  2,  3],
       [ 1,  2, 10, ..., 40,  1,  1],
       ...,
       [ 3,  2,  9, ..., 50,  2,  1],
       [ 3,  0,  0, ..., 40,  3,  1],
       [ 4,  2,  3, ..., 60,  2,  1]])

array([0, 0, 1, ..., 0, 0, 1], dtype=uint16)

In [27]:
import lightgbm as lgb

# Wrap the arrays as LightGBM datasets. Feature names and the categorical
# feature list are given for the training set; the test set is created with
# `reference=d_train`.
d_train = lgb.Dataset(x_train, y_train, feature_name=x_names, categorical_feature=categorical)
d_test = lgb.Dataset(x_test, y_test, reference=d_train)

d_train
d_test

<lightgbm.basic.Dataset at 0x7ff6b2125eb8>

<lightgbm.basic.Dataset at 0x7ff6b2125588>

In [20]:
# Baseline hyper-parameters for a binary GBDT, evaluated with log loss and AUC.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
    
    'seed': 430,
    'max_bin': 255,
    'learning_rate': .05,
    
#     'num_trees': 2000,
    'num_leaves': 127,
#     'min_data_in_leaf': 100,
#     'early_stopping': 100,
    
    'reg_alpha': .001,
    'reg_lambda': 1,

    'bagging_fraction': .9,
    'bagging_freq': 3,
    'feature_fraction': .75,
    
}

# 5-fold cross-validation on the training set: at most 20 boosting rounds,
# with early stopping after 10 rounds.
scores = lgb.cv(params, d_train, nfold=5, num_boost_round=20, early_stopping_rounds=10)



In [21]:
# Tabulate the cross-validation history returned by lgb.cv.
pd.DataFrame(scores)


Unnamed: 0,auc-mean,auc-stdv,binary_logloss-mean,binary_logloss-stdv
0,0.896009,0.00625,0.5309,0.000629
1,0.906671,0.006418,0.514752,0.001137
2,0.911611,0.005704,0.497859,0.001556
3,0.912497,0.005982,0.483061,0.001944
4,0.912582,0.005979,0.469643,0.00237
5,0.912707,0.006031,0.458876,0.002709
6,0.914049,0.006069,0.447331,0.00302
7,0.914229,0.005916,0.438573,0.003261
8,0.913771,0.005732,0.430574,0.003456
9,0.913482,0.005938,0.423519,0.003667


In [None]:
# Hyper-parameters for a longer run with a smaller learning rate.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
#     'metric': 'auc',
    
    'seed': 430,
    'learning_rate': 0.01,
    
    'reg_alpha': .001,
    'reg_lambda': 1,

    'bagging_fraction': .9,
    'feature_fraction': .75,
    
}

# Train for up to 3000 rounds; per-round metrics are collected in `evals_result`.
# NOTE(review): the test set is used as the validation set that drives early
# stopping, so the stopping point is tuned on the test data and test metrics
# from this model are optimistic. Consider a hold-out split of the training data.
evals_result = {}
gbm = lgb.train(params, d_train, num_boost_round=3000,
                valid_sets=d_test, valid_names=['test'],
                early_stopping_rounds=100, evals_result=evals_result)

In [None]:
# Best iteration and best score recorded on the trained booster.
gbm.best_iteration
gbm.best_score

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1,
# iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
# error_score='raise', return_train_score='warn')

In [None]:
import itertools
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, classes, normalize=False, title="Confusion Matrix", cmap=None):
    """Draw a confusion matrix as an annotated heat map using pyplot.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix with true labels along rows and predicted labels
        along columns (this is how the axes are labelled below).
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        When True, each row is divided by its row sum and rounded to three
        decimals. A row whose sum is zero is shown as all zeros.
    title : str, default "Confusion Matrix"
        Plot title.
    cmap : colormap, optional
        Colormap passed to ``plt.imshow``; defaults to ``plt.cm.Blues``.

    Returns
    -------
    None
        The function only draws; nothing is printed or returned.
    """
    # NOTE(review): `plt` is expected to be matplotlib.pyplot, but no visible
    # cell of this notebook imports it - add the import to the imports cell.
    if cmap is None:
        cmap = plt.cm.Blues

    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; other rows keep the
        # zeros of `out` instead of turning into NaN (which would also break
        # the colour threshold below).
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        cm = np.around(cm, decimals=3)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [33]:
import hyperopt as ho

In [None]:

# class Foo(object):
#     def __init__(self, features, target
#                 ):
#         params = {
#             'boosting_type': 'gbdt',
#             'objective': 'binary',
#             'metric': ['binary_logloss', 'auc'],
#             'seed': 430,
#         }

#     def f(params_space):

#         lgb.cv(params.update(params_space), )
    

In [99]:
%%time

# Rebuild the training dataset with free_raw_data=False; the dataset is reused
# by every lgb.cv call made during the hyper-parameter search below.
d_train = lgb.Dataset(x_train, y_train, feature_name=x_names, categorical_feature=categorical, free_raw_data=False)
# d_test = lgb.Dataset(x_test, y_test, reference=d_train)

from functools import partial

def func_cv(params_space):
    """Cross-validate one hyper-parameter candidate on the global ``d_train``.

    ``params_space`` is merged over a fixed set of base settings (binary GBDT,
    log-loss metric, seed 430), so any key it contains takes precedence.
    Returns whatever ``lgb.cv`` returns for a 6-fold run.
    """
    base_settings = dict(
        boosting_type='gbdt',
        objective='binary',
        metric='binary_logloss',
        seed=430,
    )
    merged = {**base_settings, **params_space}
    return lgb.cv(merged, d_train, nfold=6)


def func_min(fn, space, metric='binary_logloss-mean'):
    """Evaluate ``fn(space)`` and return the minimum of one metric's history.

    This is the objective handed to the optimiser: ``fn`` is expected to
    return a mapping from metric names to per-round values (as ``func_cv``
    does), and the smallest value of the ``metric`` series is the loss.

    Parameters
    ----------
    fn : callable
        Called as ``fn(space)``; must return a mapping of metric histories.
    space : object
        Hyper-parameter candidate, passed through to ``fn`` unchanged.
    metric : str, default 'binary_logloss-mean'
        Key of the series to minimise. If the exact key is absent, a single
        key ending with ``metric`` (for example one carrying a dataset-name
        prefix such as ``'valid '``) is accepted instead.

    Returns
    -------
    numpy scalar
        Minimum of the selected series.

    Raises
    ------
    KeyError
        If no key, or more than one key, matches ``metric``. Without this
        check a missing key would make ``s.get`` return None and the failure
        would not be reported clearly.
    """
    s = fn(space)
    values = s.get(metric)
    if values is None:
        candidates = [key for key in s if str(key).endswith(metric)]
        if len(candidates) != 1:
            raise KeyError(
                "metric {!r} not found in cv results; available keys: {}".format(
                    metric, sorted(str(key) for key in s)))
        values = s[candidates[0]]
    return np.min(values)


# Search space: discrete choices for max_bin and num_leaves, uniform ranges
# for the learning rate, both regularisation terms and the feature fraction.
params_space = {
    'max_bin': ho.hp.choice('max_bin', [255, 127, 63]),
    'learning_rate': ho.hp.uniform('learning_rate', .001, .3),
    
    'num_leaves': ho.hp.choice('num_leaves', [127, 63, 31]),
    
    'reg_alpha': ho.hp.uniform('reg_alpha', .005, .2),
    'reg_lambda': ho.hp.uniform('reg_lambda', .1, .6),

#     'bagging_fraction': ho.hp.uniform('bagging_fraction', .5, 1.),
#     'bagging_freq': ho.hp.choice('bagging_freq', [1, 2, 3, 4, 5]),
    
    'feature_fraction': ho.hp.uniform('feature_fraction', .5, 1.),
}

# Records every evaluation made by fmin.
trials = ho.Trials()

# Minimise the cross-validated log loss with TPE over 200 evaluations.
# partial(func_min, func_cv) fixes the first argument, so fmin calls the
# objective with the sampled space only. The random state is seeded.
# NOTE(review): `best` is passed through ho.space_eval in a later cell before
# use - confirm against the hyperopt docs how hp.choice entries are reported.
best = ho.fmin(
    partial(func_min, func_cv),
    space=params_space,
    algo=ho.tpe.suggest, 
    max_evals=200,
    trials=trials,
    rstate=np.random.RandomState(430)
)

# print(params_space)



CPU times: user 30min 1s, sys: 27.9 s, total: 30min 29s
Wall time: 8min 24s


In [100]:
from pprint import pprint as pp

# Translate the optimiser's result into concrete parameter values, print them,
# and show the cross-validation history obtained with those values.
params_best = ho.space_eval(params_space, best)
pp(params_best)
pd.DataFrame(func_cv(params_best))

{'feature_fraction': 0.5001306578388296,
 'learning_rate': 0.2881638907334713,
 'max_bin': 255,
 'num_leaves': 31,
 'reg_alpha': 0.11987032076957657,
 'reg_lambda': 0.12012820361026232}




Unnamed: 0,binary_logloss-mean,binary_logloss-stdv
0,0.447102,0.003840
1,0.410705,0.004437
2,0.376251,0.005511
3,0.354867,0.006214
4,0.338284,0.006607
5,0.327271,0.007305
6,0.316971,0.007720
7,0.312110,0.008093
8,0.307895,0.008175
9,0.304836,0.008479


In [74]:
# ho.pyll.stochastic.sample(params_space)

{'feature_fraction': 0.858208745254183,
 'learning_rate': 0.09931705464961389,
 'max_bin': 63,
 'num_leaves': 127,
 'reg_alpha': 0.04300464190621706,
 'reg_lambda': 0.2826587492401653}

In [98]:
# from pprint import pprint as pp
# for t in trials.trials:
#     pp(t)

{'book_time': datetime.datetime(2018, 7, 20, 13, 50, 20, 280000),
 'exp_key': None,
 'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
          'idxs': {'bagging_fraction': [0],
                   'bagging_freq': [0],
                   'feature_fraction': [0],
                   'learning_rate': [0],
                   'max_bin': [0],
                   'num_leaves': [0],
                   'reg_alpha': [0],
                   'reg_lambda': [0]},
          'tid': 0,
          'vals': {'bagging_fraction': [0.6751659580961454],
                   'bagging_freq': [2],
                   'feature_fraction': [0.9664607813406715],
                   'learning_rate': [0.013456500807992579],
                   'max_bin': [0],
                   'num_leaves': [1],
                   'reg_alpha': [0.11262540945953152],
                   'reg_lambda': [0.32513724929475074]},
          'workdir': None},
 'owner': None,
 'refresh_time': datetime.datetime(2018, 7, 20, 13, 50, 22, 593000),

In [107]:
# Fresh datasets for the final evaluation.
d_train = lgb.Dataset(x_train, y_train, feature_name=x_names, categorical_feature=categorical)
d_test = lgb.Dataset(x_test, y_test, reference=d_train)

# Combine the tuned values with the fixed settings. dict(**a, **b) raises
# TypeError if a key occurs in both mappings, so the two sets must be disjoint.
params_best = ho.space_eval(params_space, best)
params = dict(**params_best, **{
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
    'seed': 430,
})

# 5-fold cross-validation: up to 1000 rounds, early stopping after 20 rounds.
scores = lgb.cv(params, d_train, nfold=5, num_boost_round=1000, early_stopping_rounds=20)
pd.DataFrame(scores)



Unnamed: 0,auc-mean,auc-stdv,binary_logloss-mean,binary_logloss-stdv
0,0.889332,0.006206,0.447191,0.003017
1,0.90245,0.005538,0.410873,0.003451
2,0.906406,0.004678,0.376472,0.004285
3,0.914219,0.005494,0.355118,0.00499
4,0.916281,0.004964,0.338611,0.005214
5,0.917507,0.005535,0.327699,0.006075
6,0.918322,0.006006,0.3174,0.007007
7,0.918907,0.005928,0.312522,0.007266
8,0.919666,0.005605,0.308254,0.007291
9,0.920318,0.005595,0.305248,0.00753


In [108]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation (seeded split).
x0, x1, y0, y1 = train_test_split(x_train, y_train, train_size=.8, random_state=430)

# NOTE: `d_train` is rebound to the 80% split here; `func_cv` reads this global,
# so calling it after this cell cross-validates on the reduced data.
d_train = lgb.Dataset(x0, y0, feature_name=x_names, categorical_feature=categorical)
d_valid = lgb.Dataset(x1, y1, reference=d_train)
# d_test = lgb.Dataset(x_test, y_test, reference=d_train)





In [110]:
# Train 50 rounds on the 80% split, reporting metrics on the validation split.
model = lgb.train(params, d_train, num_boost_round=50, valid_sets=d_valid, valid_names='valid')



[1]	valid's auc: 0.890681	valid's binary_logloss: 0.445437
[2]	valid's auc: 0.903828	valid's binary_logloss: 0.408681
[3]	valid's auc: 0.907508	valid's binary_logloss: 0.3745
[4]	valid's auc: 0.915302	valid's binary_logloss: 0.352839
[5]	valid's auc: 0.916771	valid's binary_logloss: 0.337172
[6]	valid's auc: 0.917643	valid's binary_logloss: 0.326646
[7]	valid's auc: 0.918986	valid's binary_logloss: 0.316227
[8]	valid's auc: 0.919388	valid's binary_logloss: 0.311633
[9]	valid's auc: 0.919747	valid's binary_logloss: 0.307595
[10]	valid's auc: 0.920227	valid's binary_logloss: 0.304779
[11]	valid's auc: 0.920395	valid's binary_logloss: 0.302221
[12]	valid's auc: 0.921409	valid's binary_logloss: 0.298096
[13]	valid's auc: 0.921627	valid's binary_logloss: 0.296539
[14]	valid's auc: 0.922291	valid's binary_logloss: 0.294112
[15]	valid's auc: 0.922958	valid's binary_logloss: 0.291741
[16]	valid's auc: 0.923342	valid's binary_logloss: 0.290767
[17]	valid's auc: 0.923658	valid's binary_logloss: 

In [111]:
# Model predictions for the test features (the recorded output shows values in [0, 1]).
y_pred = model.predict(x_test)

In [112]:
# Display the predictions.
y_pred

array([0.00096255, 0.12065365, 0.4335417 , ..., 0.68703336, 0.18318265,
       0.69140384])

In [113]:
from sklearn.metrics import log_loss
# Log loss of the predictions on the test set.
log_loss(y_test, y_pred)

0.2814861795096843

In [114]:
# NOTE(review): 1 minus the (rounded) log loss. Log loss is not a proportion, so
# this number is not an accuracy; use an accuracy metric on thresholded
# predictions if accuracy is what is wanted.
1-.28

0.72

In [115]:
# Display the parameter dict that was passed to lgb.train.
params

{'feature_fraction': 0.5001306578388296,
 'learning_rate': 0.2881638907334713,
 'max_bin': 255,
 'num_leaves': 31,
 'reg_alpha': 0.11987032076957657,
 'reg_lambda': 0.12012820361026232,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': ['binary_logloss', 'auc'],
 'seed': 430,
 'verbose': 1,
 'categorical_column': [0, 1, 2, 3, 4, 5, 9, 10]}

In [116]:
# The same model through LightGBM's scikit-learn wrapper. The tuned values are
# copied here by hand from the `params` dict shown above, with
# 'colsample_bytree' holding the 'feature_fraction' value.
sklearn_params = {
    'seed': 430,
    'n_estimators': 50,
    'learning_rate': 0.2881638907334713,
    'colsample_bytree': 0.5001306578388296,
    'max_bin': 255,
    'num_leaves': 31,
    'reg_alpha': 0.11987032076957657,
    'reg_lambda': 0.12012820361026232,
}
skmodel = lgb.LGBMClassifier(**sklearn_params)
# Fitted on the full training arrays (not the 80% split used for `model`).
# NOTE(review): no categorical feature list is passed here, unlike the
# lgb.Dataset calls above - confirm whether the encoded columns should be
# declared categorical for this model as well.
skmodel.fit(x_train, y_train)
# Score on the test set (presumably mean accuracy, the scikit-learn classifier default - TODO confirm).
skmodel.score(x_test, y_test)

LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.5001306578388296,
        learning_rate=0.2881638907334713, max_bin=255, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=50, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.11987032076957657,
        reg_lambda=0.12012820361026232, seed=430, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

  if diff:


0.8708924513236288