In [1]:
# install packages if not installed yet
import sys
!{sys.executable} -m pip install lightgbm optuna



In [18]:
# imports

# data manipulation
import pandas as pd
import numpy as np

# model evaluation
from sklearn.metrics import f1_score

# lightgbm
import lightgbm as lgb
# optuna is the tuning automation tool
import optuna
import optuna.integration.lightgbm as optunalgb

In [3]:
# load data
train_file = './data/train.csv'
train_data = pd.read_csv(train_file)
train_data = pd.DataFrame(data=train_data)

test_file = './data/test.csv'
test_data = pd.read_csv(test_file)
test_data = pd.DataFrame(data=test_data)

test_ground_truths_file = './data/test_ground_truths.csv'
test_ground_truths = pd.read_csv(test_ground_truths_file)
test_ground_truths = pd.DataFrame(data=test_ground_truths)

test_data['exceeds50K'] = test_ground_truths

In [4]:
#prepare dataset
def convertCategoricalValuesToInt(df, columns):
    output = df.copy();
    
    output[columns] = output[columns].astype('category')
    cat_columns = output.select_dtypes(['category']).columns
    output[cat_columns] = output[cat_columns].apply(lambda x: x.cat.codes)
    
    return output;

train_data_lgb = train_data.drop(['native-country', 'education', 'fnlwgt'], axis=1)
test_data_lgb = test_data.drop(['native-country', 'education', 'fnlwgt'], axis=1)

x_train = train_data_lgb.drop('exceeds50K', axis=1)
y_train = train_data_lgb['exceeds50K']
x_test = test_data_lgb.drop('exceeds50K', axis=1)
y_test = test_data_lgb['exceeds50K']

features = x_train.columns
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'sex']
categorical_features_numbered = [c for c, col in enumerate(features) if col in categorical_features]

x_train = convertCategoricalValuesToInt(x_train, categorical_features)
x_test = convertCategoricalValuesToInt(x_test, categorical_features)
print(x_train.head())

x_train = optunalgb.Dataset(x_train, label=y_train, categorical_feature=categorical_features_numbered, free_raw_data=False)
x_test = optunalgb.Dataset(x_test, label=y_test, free_raw_data=False)

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', round(f1_score(y_true, y_hat, average = 'weighted') * 100, 2), True

   age  workclass  education-num  marital-status  occupation  relationship  \
0   30          0             10               4           0             3   
1   60          4             13               0          10             1   
2   52          0             10               2           0             0   
3   37          4             13               2          12             0   
4   63          4             10               2          12             0   

   sex  capital-gain  capital-loss  hours-per-week  
0    0             0             0              30  
1    0             0             0              42  
2    1             0             0              12  
3    1             0             0              60  
4    1          7298             0              48  


In [22]:
# tune with optunalgb
parameters = {
    'metric': 'f1',
    'learning_rate': 0.001,
    'verbose': 0,
}

evals_result = {}
best_params, tuning_history = dict(), list()
model = optunalgb.train(parameters,
                       x_train,
                       valid_sets=x_test,
                       num_boost_round=5000,
                       early_stopping_rounds=1000,
                       feval=lgb_f1_score,
                       evals_result=evals_result,
                       best_params=best_params,
                       tuning_history=tuning_history,
                       verbose_eval=500)
lgb.plot_metric(evals_result, metric='f1')
print(best_params)







  0%|                                                                                            | 0/7 [00:00<?, ?it/s]





tune_feature_fraction, val_score: inf:   0%|                                                     | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.59
[1000]	valid_0's f1: 86.8
Did not meet early stopping. Best iteration is:
[956]	valid_0's f1: 86.82








tune_feature_fraction, val_score: 86.820000:   0%|                                               | 0/7 [00:15<?, ?it/s]





tune_feature_fraction, val_score: 86.820000:  14%|#####5                                 | 1/7 [00:15<01:30, 15.16s/it][I 2020-04-26 19:58:11,708] Finished trial#0 with value: 86.82 with parameters: {'feature_fraction': 0.4}. Best is trial#0 with value: 86.82.






tune_feature_fraction, val_score: 86.820000:  14%|#####5                                 | 1/7 [00:15<01:30, 15.16s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.55
[1000]	valid_0's f1: 86.81
Did not meet early stopping. Best iteration is:
[942]	valid_0's f1: 86.81








tune_feature_fraction, val_score: 86.810000:  14%|#####5                                 | 1/7 [00:35<01:30, 15.16s/it]





tune_feature_fraction, val_score: 86.810000:  29%|###########1                           | 2/7 [00:35<01:23, 16.77s/it][I 2020-04-26 19:58:32,221] Finished trial#1 with value: 86.81 with parameters: {'feature_fraction': 0.5}. Best is trial#1 with value: 86.81.






tune_feature_fraction, val_score: 86.810000:  29%|###########1                           | 2/7 [00:35<01:23, 16.77s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.65
[1000]	valid_0's f1: 86.8
Did not meet early stopping. Best iteration is:
[944]	valid_0's f1: 86.84








tune_feature_fraction, val_score: 86.810000:  29%|###########1                           | 2/7 [00:50<01:23, 16.77s/it]





tune_feature_fraction, val_score: 86.810000:  43%|################7                      | 3/7 [00:50<01:05, 16.32s/it][I 2020-04-26 19:58:47,516] Finished trial#2 with value: 86.84 with parameters: {'feature_fraction': 0.6}. Best is trial#1 with value: 86.81.






tune_feature_fraction, val_score: 86.810000:  43%|################7                      | 3/7 [00:51<01:05, 16.32s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.61
[1000]	valid_0's f1: 86.78
Did not meet early stopping. Best iteration is:
[955]	valid_0's f1: 86.78








tune_feature_fraction, val_score: 86.780000:  43%|################7                      | 3/7 [01:06<01:05, 16.32s/it]





tune_feature_fraction, val_score: 86.780000:  57%|######################2                | 4/7 [01:06<00:48, 16.04s/it][I 2020-04-26 19:59:02,890] Finished trial#3 with value: 86.78 with parameters: {'feature_fraction': 0.7}. Best is trial#3 with value: 86.78.






tune_feature_fraction, val_score: 86.780000:  57%|######################2                | 4/7 [01:06<00:48, 16.04s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.61
[1000]	valid_0's f1: 86.71
Did not meet early stopping. Best iteration is:
[809]	valid_0's f1: 86.75








tune_feature_fraction, val_score: 86.750000:  57%|######################2                | 4/7 [01:21<00:48, 16.04s/it]





tune_feature_fraction, val_score: 86.750000:  71%|###########################8           | 5/7 [01:21<00:31, 15.86s/it][I 2020-04-26 19:59:18,329] Finished trial#4 with value: 86.75 with parameters: {'feature_fraction': 0.8}. Best is trial#4 with value: 86.75.






tune_feature_fraction, val_score: 86.750000:  71%|###########################8           | 5/7 [01:21<00:31, 15.86s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.61
Early stopping, best iteration is:
[768]	valid_0's f1: 86.74








tune_feature_fraction, val_score: 86.740000:  71%|###########################8           | 5/7 [01:36<00:31, 15.86s/it]





tune_feature_fraction, val_score: 86.740000:  86%|#################################4     | 6/7 [01:36<00:15, 15.58s/it][I 2020-04-26 19:59:33,260] Finished trial#5 with value: 86.74 with parameters: {'feature_fraction': 0.8999999999999999}. Best is trial#5 with value: 86.74.






tune_feature_fraction, val_score: 86.740000:  86%|#################################4     | 6/7 [01:36<00:15, 15.58s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.52
Early stopping, best iteration is:
[592]	valid_0's f1: 86.55








tune_feature_fraction, val_score: 86.550000:  86%|#################################4     | 6/7 [01:48<00:15, 15.58s/it]





tune_feature_fraction, val_score: 86.550000: 100%|#######################################| 7/7 [01:48<00:00, 14.57s/it][I 2020-04-26 19:59:45,461] Finished trial#6 with value: 86.55 with parameters: {'feature_fraction': 1.0}. Best is trial#6 with value: 86.55.
tune_feature_fraction, val_score: 86.550000: 100%|#######################################| 7/7 [01:49<00:00, 15.57s/it]






  0%|                                                                                           | 0/20 [00:00<?, ?it/s]





tune_num_leaves, val_score: 86.550000:   0%|                                                    | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 85.81
[1000]	valid_0's f1: 86.3
Did not meet early stopping. Best iteration is:
[989]	valid_0's f1: 86.31








tune_num_leaves, val_score: 86.310000:   0%|                                                    | 0/20 [00:13<?, ?it/s]





tune_num_leaves, val_score: 86.310000:   5%|##2                                         | 1/20 [00:13<04:25, 13.96s/it][I 2020-04-26 19:59:59,514] Finished trial#0 with value: 86.31 with parameters: {'num_leaves': 9}. Best is trial#0 with value: 86.31.






tune_num_leaves, val_score: 86.310000:   5%|##2                                         | 1/20 [00:14<04:25, 13.96s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 85.05
[1000]	valid_0's f1: 85.48
Did not meet early stopping. Best iteration is:
[947]	valid_0's f1: 85.48








tune_num_leaves, val_score: 85.480000:   5%|##2                                         | 1/20 [00:30<04:25, 13.96s/it]





tune_num_leaves, val_score: 85.480000:  10%|####4                                       | 2/20 [00:30<04:26, 14.83s/it][I 2020-04-26 20:00:16,379] Finished trial#1 with value: 85.48 with parameters: {'num_leaves': 4}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  10%|####4                                       | 2/20 [00:30<04:26, 14.83s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.44
[1000]	valid_0's f1: 86.56
Did not meet early stopping. Best iteration is:
[958]	valid_0's f1: 86.58








tune_num_leaves, val_score: 85.480000:  10%|####4                                       | 2/20 [00:48<04:26, 14.83s/it]





tune_num_leaves, val_score: 85.480000:  15%|######6                                     | 3/20 [00:48<04:27, 15.75s/it][I 2020-04-26 20:00:34,259] Finished trial#2 with value: 86.58 with parameters: {'num_leaves': 79}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  15%|######6                                     | 3/20 [00:48<04:27, 15.75s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.45
[1000]	valid_0's f1: 86.58
Did not meet early stopping. Best iteration is:
[946]	valid_0's f1: 86.61








tune_num_leaves, val_score: 85.480000:  15%|######6                                     | 3/20 [01:07<04:27, 15.75s/it]





tune_num_leaves, val_score: 85.480000:  20%|########8                                   | 4/20 [01:07<04:24, 16.51s/it][I 2020-04-26 20:00:52,554] Finished trial#3 with value: 86.61 with parameters: {'num_leaves': 90}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  20%|########8                                   | 4/20 [01:07<04:24, 16.51s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.33
Early stopping, best iteration is:
[625]	valid_0's f1: 86.44








tune_num_leaves, val_score: 85.480000:  20%|########8                                   | 4/20 [01:23<04:24, 16.51s/it]





tune_num_leaves, val_score: 85.480000:  25%|###########                                 | 5/20 [01:23<04:07, 16.53s/it][I 2020-04-26 20:01:09,131] Finished trial#4 with value: 86.44 with parameters: {'num_leaves': 129}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  25%|###########                                 | 5/20 [01:23<04:07, 16.53s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.3
Early stopping, best iteration is:
[520]	valid_0's f1: 86.39








tune_num_leaves, val_score: 85.480000:  25%|###########                                 | 5/20 [01:38<04:07, 16.53s/it]





tune_num_leaves, val_score: 85.480000:  30%|#############2                              | 6/20 [01:38<03:45, 16.11s/it][I 2020-04-26 20:01:24,277] Finished trial#5 with value: 86.39 with parameters: {'num_leaves': 149}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  30%|#############2                              | 6/20 [01:38<03:45, 16.11s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.11
Early stopping, best iteration is:
[541]	valid_0's f1: 86.15








tune_num_leaves, val_score: 85.480000:  30%|#############2                              | 6/20 [01:56<03:45, 16.11s/it]





tune_num_leaves, val_score: 85.480000:  35%|###############4                            | 7/20 [01:56<03:34, 16.51s/it][I 2020-04-26 20:01:41,703] Finished trial#6 with value: 86.15 with parameters: {'num_leaves': 210}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  35%|###############4                            | 7/20 [01:56<03:34, 16.51s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.27
Early stopping, best iteration is:
[722]	valid_0's f1: 86.34








tune_num_leaves, val_score: 85.480000:  35%|###############4                            | 7/20 [02:15<03:34, 16.51s/it]





tune_num_leaves, val_score: 85.480000:  40%|#################6                          | 8/20 [02:15<03:28, 17.34s/it][I 2020-04-26 20:02:00,996] Finished trial#7 with value: 86.34 with parameters: {'num_leaves': 147}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  40%|#################6                          | 8/20 [02:15<03:28, 17.34s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.57
[1000]	valid_0's f1: 86.67
Did not meet early stopping. Best iteration is:
[957]	valid_0's f1: 86.67








tune_num_leaves, val_score: 85.480000:  40%|#################6                          | 8/20 [02:31<03:28, 17.34s/it]





tune_num_leaves, val_score: 85.480000:  45%|###################8                        | 9/20 [02:31<03:05, 16.85s/it][I 2020-04-26 20:02:16,695] Finished trial#8 with value: 86.67 with parameters: {'num_leaves': 34}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  45%|###################8                        | 9/20 [02:31<03:05, 16.85s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.21
Early stopping, best iteration is:
[632]	valid_0's f1: 86.23








tune_num_leaves, val_score: 85.480000:  45%|###################8                        | 9/20 [02:49<03:05, 16.85s/it]





tune_num_leaves, val_score: 85.480000:  50%|#####################5                     | 10/20 [02:49<02:53, 17.36s/it][I 2020-04-26 20:02:35,235] Finished trial#9 with value: 86.23 with parameters: {'num_leaves': 183}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  50%|#####################5                     | 10/20 [02:49<02:53, 17.36s/it]

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[271]	valid_0's f1: 86








tune_num_leaves, val_score: 85.480000:  50%|#####################5                     | 10/20 [03:01<02:53, 17.36s/it]





tune_num_leaves, val_score: 85.480000:  55%|#######################6                   | 11/20 [03:01<02:20, 15.64s/it][I 2020-04-26 20:02:46,863] Finished trial#10 with value: 86.0 with parameters: {'num_leaves': 245}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  55%|#######################6                   | 11/20 [03:01<02:20, 15.64s/it]

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[291]	valid_0's f1: 86.06








tune_num_leaves, val_score: 85.480000:  55%|#######################6                   | 11/20 [03:13<02:20, 15.64s/it]





tune_num_leaves, val_score: 85.480000:  60%|#########################8                 | 12/20 [03:13<01:56, 14.56s/it][I 2020-04-26 20:02:58,886] Finished trial#11 with value: 86.06 with parameters: {'num_leaves': 240}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  60%|#########################8                 | 12/20 [03:13<01:56, 14.56s/it]

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[271]	valid_0's f1: 86








tune_num_leaves, val_score: 85.480000:  60%|#########################8                 | 12/20 [03:24<01:56, 14.56s/it]





tune_num_leaves, val_score: 85.480000:  65%|###########################9               | 13/20 [03:24<01:35, 13.64s/it][I 2020-04-26 20:03:10,406] Finished trial#12 with value: 86.0 with parameters: {'num_leaves': 245}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  65%|###########################9               | 13/20 [03:24<01:35, 13.64s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.45
Early stopping, best iteration is:
[644]	valid_0's f1: 86.54








tune_num_leaves, val_score: 85.480000:  65%|###########################9               | 13/20 [03:39<01:35, 13.64s/it]





tune_num_leaves, val_score: 85.480000:  70%|##############################1            | 14/20 [03:39<01:23, 13.92s/it][I 2020-04-26 20:03:24,964] Finished trial#13 with value: 86.54 with parameters: {'num_leaves': 68}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  70%|##############################1            | 14/20 [03:39<01:23, 13.92s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.15
Early stopping, best iteration is:
[368]	valid_0's f1: 86.2








tune_num_leaves, val_score: 85.480000:  70%|##############################1            | 14/20 [03:52<01:23, 13.92s/it]





tune_num_leaves, val_score: 85.480000:  75%|################################2          | 15/20 [03:52<01:07, 13.60s/it][I 2020-04-26 20:03:37,812] Finished trial#14 with value: 86.2 with parameters: {'num_leaves': 192}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  75%|################################2          | 15/20 [03:52<01:07, 13.60s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.3
[1000]	valid_0's f1: 86.59
Did not meet early stopping. Best iteration is:
[991]	valid_0's f1: 86.59








tune_num_leaves, val_score: 85.480000:  75%|################################2          | 15/20 [04:06<01:07, 13.60s/it]





tune_num_leaves, val_score: 85.480000:  80%|##################################4        | 16/20 [04:06<00:55, 13.91s/it][I 2020-04-26 20:03:52,446] Finished trial#15 with value: 86.59 with parameters: {'num_leaves': 16}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  80%|##################################4        | 16/20 [04:06<00:55, 13.91s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.49
Early stopping, best iteration is:
[783]	valid_0's f1: 86.56








tune_num_leaves, val_score: 85.480000:  80%|##################################4        | 16/20 [04:23<00:55, 13.91s/it]





tune_num_leaves, val_score: 85.480000:  85%|####################################5      | 17/20 [04:23<00:44, 14.75s/it][I 2020-04-26 20:04:09,166] Finished trial#16 with value: 86.56 with parameters: {'num_leaves': 52}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  85%|####################################5      | 17/20 [04:23<00:44, 14.75s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.51
Early stopping, best iteration is:
[613]	valid_0's f1: 86.6








tune_num_leaves, val_score: 85.480000:  85%|####################################5      | 17/20 [04:39<00:44, 14.75s/it]





tune_num_leaves, val_score: 85.480000:  90%|######################################7    | 18/20 [04:39<00:30, 15.01s/it][I 2020-04-26 20:04:24,778] Finished trial#17 with value: 86.6 with parameters: {'num_leaves': 104}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  90%|######################################7    | 18/20 [04:39<00:30, 15.01s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 85.97
Early stopping, best iteration is:
[427]	valid_0's f1: 86.02








tune_num_leaves, val_score: 85.480000:  90%|######################################7    | 18/20 [04:55<00:30, 15.01s/it]





tune_num_leaves, val_score: 85.480000:  95%|########################################8  | 19/20 [04:55<00:15, 15.26s/it][I 2020-04-26 20:04:40,616] Finished trial#18 with value: 86.02 with parameters: {'num_leaves': 256}. Best is trial#1 with value: 85.48.






tune_num_leaves, val_score: 85.480000:  95%|########################################8  | 19/20 [04:55<00:15, 15.26s/it]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 86.04
Early stopping, best iteration is:
[537]	valid_0's f1: 86.1








tune_num_leaves, val_score: 85.480000:  95%|########################################8  | 19/20 [05:12<00:15, 15.26s/it]





tune_num_leaves, val_score: 85.480000: 100%|###########################################| 20/20 [05:12<00:00, 15.91s/it][I 2020-04-26 20:04:58,049] Finished trial#19 with value: 86.1 with parameters: {'num_leaves': 220}. Best is trial#1 with value: 85.48.
tune_num_leaves, val_score: 85.480000: 100%|###########################################| 20/20 [05:12<00:00, 15.63s/it]






  0%|                                                                                           | 0/10 [00:00<?, ?it/s]





tune_bagging_fraction_and_bagging_freq, val_score: 85.480000:   0%|                             | 0/10 [00:00<?, ?it/s]

Training until validation scores don't improve for 200 rounds
[500]	valid_0's f1: 85.1


KeyboardInterrupt: 

In [25]:
parameters = {**parameters, **best_params}
print(parameters)
evals_result = {}
model = lgb.train(parameters,
                       x_train,
                       valid_sets=x_test,
                       num_boost_round=10000,
                       early_stopping_rounds=10000,
                       feval=lgb_f1_score,
                       evals_result=evals_result,
                       verbose_eval=100)
lgb.plot_metric(evals_result, metric='f1')

{'metric': 'f1', 'learning_rate': 0.01, 'verbose': 0, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 4, 'feature_fraction': 1.0, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20}
Training until validation scores don't improve for 10000 rounds
[100]	valid_0's f1: 81.9
[200]	valid_0's f1: 83.49
[300]	valid_0's f1: 84.22
[400]	valid_0's f1: 84.68
[500]	valid_0's f1: 85.05
[600]	valid_0's f1: 85.17
[700]	valid_0's f1: 85.29


KeyboardInterrupt: 