# Predictions blender

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from glob import glob
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

import shutil

# Project
project_common_path = os.path.dirname('.')
project_common_path = os.path.abspath(os.path.join(project_common_path, '..', 'common'))
if not project_common_path in sys.path:
    sys.path.append(project_common_path)

In [3]:
os.environ['THEANO_FLAGS'] = 'device=cpu'

import numpy as np
import pandas as pd

from data_utils import get_id_type_list_for_class, GENERATED_DATA, OUTPUT_PATH
from test_utils import create_submission

Using Theano backend.


### Load predictions on trainval dataset

In [13]:
from data_utils import unique_tags, get_label

target_tags = ['target_' + t for t in unique_tags]
val_predictions_filepath = os.path.join(GENERATED_DATA, "val_predictions_ResNet50_2017-07-18-14-50.csv")
if not os.path.exists(val_predictions_filepath):
    
    val_predictions_csv = glob(os.path.join(OUTPUT_PATH, "val_predictions_*2017-07-18-14-50.csv"))
    df = pd.read_csv(val_predictions_csv[0]).dropna()
    for filepath in val_predictions_csv[1:]:
        df = pd.concat([df, pd.read_csv(filepath).dropna()])
    df.reset_index(inplace=True)   
    df.drop('index', axis=1, inplace=True)
    df['image_id'] = df['image_name'].apply(lambda x: int(x[len('train_'):]))    
    for t in target_tags:
        df[t] = ''
    def fill_target_tags(row):
        image_id = row[0]
        labels = get_label(image_id, "Train_jpg")
        row[1:] = labels    
        return row
    cols = ['image_id', ] + target_tags
    df[cols] = df[cols].apply(fill_target_tags, axis=1)
    
    df.to_csv(val_predictions_filepath, index=False)
    val_predictions_df = df
    df = None    
else:
    val_predictions_df = pd.read_csv(val_predictions_filepath)

In [14]:
print(len(val_predictions_df))
val_predictions_df.head()

40448


Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
0,train_18872,0.762381,0.007814,0.023866,0.00248,0.001474,0.983301,0.00037,0.002931,0.932843,...,0,1,1,0,0,1,0,0,1,1
1,train_10562,0.106635,0.000834,0.002179,0.00149,0.001146,0.000112,0.003351,0.001014,0.020308,...,0,0,0,0,1,1,0,0,0,0
2,train_28763,0.196588,0.004786,0.009777,0.002631,0.001869,0.019417,0.105915,0.004077,0.084854,...,0,1,1,0,1,1,0,0,0,0
3,train_31735,0.084816,0.001532,0.013537,0.010144,0.014883,0.633817,0.002344,0.002039,0.045785,...,0,0,0,0,0,1,0,0,0,0
4,train_320,0.106061,0.001101,0.004905,0.004541,0.001737,0.000195,0.00081,0.001513,0.102018,...,0,0,0,0,1,1,0,0,0,0


## Simple probability mean 

In [15]:
predictions_csv = glob(os.path.join(OUTPUT_PATH, "predictions_*2017-07-18-09-28.csv"))
predictions_csv

['/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_1_ResNet50_all_classes_fold=0_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_2_ResNet50_all_classes_fold=1_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_3_ResNet50_all_classes_fold=2_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_4_ResNet50_all_classes_fold=3_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_5_ResNet50_all_classes_fold=4_seed=2017_2017-07-18-09-28.csv']

In [16]:
predictions = []
for csv_filepath in predictions_csv:
    predictions.append(pd.read_csv(csv_filepath))

In [25]:
def compute_mean(predictions):
    df = predictions[0]
    for p in predictions[1:]:
        df = pd.concat([df, p], axis=0)
    df = df.apply(pd.to_numeric, errors='ignore')
    gb = df.groupby('image_name')
    df2 = gb.agg(np.mean).reset_index()
    return df2

def compute_median(predictions):
    df = predictions[0]
    for p in predictions[1:]:
        df = pd.concat([df, p], axis=0)
    df = df.apply(pd.to_numeric, errors='ignore')
    gb = df.groupby('image_name')
    df2 = gb.agg(np.median).reset_index()
    return df2


def compute_max(predictions):
    df = predictions[0]
    for p in predictions[1:]:
        df = pd.concat([df, p], axis=0)
    df = df.apply(pd.to_numeric, errors='ignore')
    gb = df.groupby('image_name')
    df2 = gb.agg(np.max).reset_index()
    return df2


In [26]:
mean_predictions_df = compute_mean(predictions)
max_predictions_df = compute_max(predictions)

In [19]:
mean_predictions_df.head(10)

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,file_0,0.00788,0.000258,0.001856,0.012806,0.001243,0.978633,0.00083,0.000275,0.004255,0.001517,0.017801,0.002503,0.999501,0.007137,0.003741,0.000324,0.007457
1,file_1,0.020115,0.001063,0.003271,0.026395,0.007789,0.987504,0.001078,0.001102,0.014232,0.004781,0.002344,0.011588,0.998687,0.010648,0.019523,0.00207,0.020553
2,file_10,0.11247,0.00045,0.004199,0.000526,0.000178,0.004071,0.001011,0.00037,0.011547,0.002038,0.000553,0.992372,0.997895,0.007432,0.000933,0.001882,0.118545
3,file_100,0.365171,0.000328,0.001163,0.001057,0.000369,0.995261,4.6e-05,0.000182,0.354873,0.004199,0.000893,0.006364,0.999863,0.748615,0.02797,0.003295,0.637615
4,file_1000,0.988615,0.000314,0.00387,0.003703,0.000922,0.986524,0.000402,0.002526,0.209117,0.028993,0.010647,0.017201,0.995287,0.955292,0.004538,0.005714,0.058605
5,file_10000,0.039525,0.000508,0.001962,0.011408,0.001495,0.983903,0.000373,0.000315,0.018405,0.001681,0.001727,0.026885,0.999423,0.012386,0.005319,0.002857,0.087368
6,file_10001,0.103843,0.00113,0.00466,0.002583,0.001242,0.000138,0.002481,0.00053,0.045527,0.014029,0.001584,0.999695,0.994804,0.027815,0.001338,0.001183,0.042976
7,file_10002,0.098677,0.002602,0.106802,0.002422,0.001817,0.699248,0.293757,0.002596,0.01815,0.024074,0.06782,0.016571,0.195061,0.039389,0.001592,0.005538,0.560655
8,file_10003,0.285795,0.003092,0.040956,0.000294,0.000522,0.034553,0.281176,0.001016,0.018293,0.026965,0.006018,0.536172,0.427658,0.423495,0.000207,0.00067,0.280522
9,file_10004,0.001879,6.1e-05,0.000145,0.013054,0.000623,0.998828,0.000128,4.9e-05,0.001149,0.00046,0.000169,0.000607,0.999841,0.001019,0.000681,8.7e-05,0.003068


In [27]:
max_predictions_df.head(10)

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,file_0,0.009036,0.000495,0.002967,0.029149,0.002434,0.983726,0.001023,0.000485,0.004963,0.002505,0.027288,0.003507,0.999709,0.011504,0.004574,0.001014,0.010129
1,file_1,0.034911,0.001875,0.007057,0.030708,0.011607,0.993644,0.001772,0.00216,0.02484,0.008911,0.005143,0.022563,0.99951,0.018261,0.034651,0.004081,0.0368
2,file_10,0.242529,0.001535,0.01042,0.001093,0.000376,0.008199,0.002266,0.00113,0.016095,0.005071,0.002119,0.997557,0.999244,0.012457,0.00234,0.005815,0.219308
3,file_100,0.428788,0.000656,0.001231,0.001397,0.000473,0.997431,6.3e-05,0.000233,0.439945,0.006494,0.001355,0.007957,0.999935,0.818811,0.041106,0.004647,0.818868
4,file_1000,0.995154,0.000688,0.008054,0.006888,0.001364,0.993019,0.000556,0.005031,0.261818,0.045594,0.016118,0.023464,0.997665,0.979331,0.007864,0.008026,0.09331
5,file_10000,0.069736,0.001407,0.006063,0.031964,0.005286,0.993812,0.00097,0.000748,0.043473,0.003677,0.003878,0.077834,0.999947,0.032763,0.011557,0.007416,0.176329
6,file_10001,0.133669,0.001968,0.006283,0.003742,0.001748,0.000236,0.003853,0.000783,0.053779,0.019863,0.002297,0.999852,0.998757,0.038416,0.001769,0.001524,0.058257
7,file_10002,0.150849,0.003496,0.177924,0.004324,0.002991,0.775003,0.334427,0.005323,0.028246,0.036006,0.113304,0.026344,0.264144,0.057012,0.003689,0.010042,0.641839
8,file_10003,0.408511,0.008234,0.056339,0.000596,0.001078,0.050075,0.323295,0.001555,0.034461,0.039928,0.007236,0.611283,0.531826,0.595301,0.00035,0.001442,0.309536
9,file_10004,0.00255,0.000148,0.000241,0.021417,0.001348,0.999043,0.000183,7.4e-05,0.002053,0.000683,0.000222,0.000862,0.999895,0.001972,0.001115,0.00012,0.004606


In [10]:
thresholds = {
    'agriculture': 0.35,
    'artisinal_mine': 0.35,
    'bare_ground': 0.35,
    'blooming': 0.35,
    'blow_down': 0.1,
    'clear': 0.5,
    'cloudy': 0.35,
    'conventional_mine': 0.35,
    'cultivation': 0.35,
    'habitation': 0.35,
    'haze': 0.35,
    'partly_cloudy': 0.35,
    'primary': 0.5,
    'road': 0.35,
    'selective_logging': 0.35,
    'slash_burn': 0.1,
    'water': 0.35
}

In [11]:
create_submission(mean_predictions_df, info="resnet50_mean_4_folds", thresholds=thresholds)

#### Check if no predictions on 'file_'

In [17]:
# m = mean_predictions_df['image_name'].str.contains('file_') 
# for t in unique_tags:
#     mean_predictions_df.loc[m, t] = 0.0 

In [None]:
# create_submission(mean_predictions_df, info="squeezenet21_mean_3_folds", thresholds=thresholds)

#### Search for better thresholds

In [28]:
from data_utils import unique_tags, get_label, TRAIN_ENC_CL_CSV
from metrics import score

In [29]:
y_true = val_predictions_df[target_tags].values

In [30]:
best_thresholds = {}


def search_best_threshold(y_true, y_preds):
    _thr = 0.5    
    _bs = score(y_true, y_preds > _thr)
    
    thrs = np.arange(0.0, 1.0, 0.01)
    for thr in thrs:
        s = score(y_true, y_preds > thr)
        if s > _bs:
            _bs = s
            _thr = thr      
    return _thr, _bs


for tag_index, tag in enumerate(unique_tags):
    best_thresholds[tag], best_score = search_best_threshold(y_true[:, tag_index], val_predictions_df[unique_tags].values[:, tag_index])
    print("%s | best threshold : %f with score: %f" % (tag, best_thresholds[tag], best_score))


agriculture | best threshold : 0.140000 with score: 0.887135
artisinal_mine | best threshold : 0.130000 with score: 0.770059
bare_ground | best threshold : 0.050000 with score: 0.441627
blooming | best threshold : 0.040000 with score: 0.279465
blow_down | best threshold : 0.020000 with score: 0.137615
clear | best threshold : 0.230000 with score: 0.975645
cloudy | best threshold : 0.170000 with score: 0.885905
conventional_mine | best threshold : 0.070000 with score: 0.480226
cultivation | best threshold : 0.110000 with score: 0.663881
habitation | best threshold : 0.140000 with score: 0.747220
haze | best threshold : 0.120000 with score: 0.754802
partly_cloudy | best threshold : 0.140000 with score: 0.927035
primary | best threshold : 0.210000 with score: 0.989725
road | best threshold : 0.170000 with score: 0.846593
selective_logging | best threshold : 0.080000 with score: 0.397456
slash_burn | best threshold : 0.030000 with score: 0.190114
water | best threshold : 0.150000 with scor

In [31]:
create_submission(max_predictions_df, info="resnet50_max_5_folds", thresholds=best_thresholds)

Mean F2 score : 0.82186

GMean F2 score : 0.68523

Median F2 score : 0.68836


#### Compare csv submissions

In [35]:
(max_predictions_df[unique_tags] - mean_predictions_df[unique_tags]).head()


Unnamed: 0,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,0.001156,0.000237,0.001111,0.016343,0.001191,0.005093,0.000193,0.00021,0.000709,0.000988,0.009487,0.001005,0.000208,0.004367,0.000832,0.00069,0.002672
1,0.014796,0.000812,0.003786,0.004314,0.003818,0.00614,0.000694,0.001057,0.010608,0.00413,0.0028,0.010974,0.000823,0.007613,0.015128,0.002011,0.016247
2,0.130059,0.001085,0.006221,0.000566,0.000198,0.004128,0.001255,0.00076,0.004547,0.003033,0.001567,0.005185,0.001348,0.005025,0.001407,0.003933,0.100763
3,0.063616,0.000328,6.8e-05,0.00034,0.000105,0.00217,1.7e-05,5.1e-05,0.085072,0.002295,0.000462,0.001593,7.2e-05,0.070196,0.013136,0.001352,0.181253
4,0.006538,0.000374,0.004184,0.003185,0.000442,0.006495,0.000153,0.002505,0.052701,0.016601,0.005471,0.006262,0.002378,0.024039,0.003326,0.002313,0.034706


In [36]:
max_predictions_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,file_0,0.009036,0.000495,0.002967,0.029149,0.002434,0.983726,0.001023,0.000485,0.004963,0.002505,0.027288,0.003507,0.999709,0.011504,0.004574,0.001014,0.010129
1,file_1,0.034911,0.001875,0.007057,0.030708,0.011607,0.993644,0.001772,0.00216,0.02484,0.008911,0.005143,0.022563,0.99951,0.018261,0.034651,0.004081,0.0368
2,file_10,0.242529,0.001535,0.01042,0.001093,0.000376,0.008199,0.002266,0.00113,0.016095,0.005071,0.002119,0.997557,0.999244,0.012457,0.00234,0.005815,0.219308
3,file_100,0.428788,0.000656,0.001231,0.001397,0.000473,0.997431,6.3e-05,0.000233,0.439945,0.006494,0.001355,0.007957,0.999935,0.818811,0.041106,0.004647,0.818868
4,file_1000,0.995154,0.000688,0.008054,0.006888,0.001364,0.993019,0.000556,0.005031,0.261818,0.045594,0.016118,0.023464,0.997665,0.979331,0.007864,0.008026,0.09331


In [33]:
mean_predictions_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,file_0,0.00788,0.000258,0.001856,0.012806,0.001243,0.978633,0.00083,0.000275,0.004255,0.001517,0.017801,0.002503,0.999501,0.007137,0.003741,0.000324,0.007457
1,file_1,0.020115,0.001063,0.003271,0.026395,0.007789,0.987504,0.001078,0.001102,0.014232,0.004781,0.002344,0.011588,0.998687,0.010648,0.019523,0.00207,0.020553
2,file_10,0.11247,0.00045,0.004199,0.000526,0.000178,0.004071,0.001011,0.00037,0.011547,0.002038,0.000553,0.992372,0.997895,0.007432,0.000933,0.001882,0.118545
3,file_100,0.365171,0.000328,0.001163,0.001057,0.000369,0.995261,4.6e-05,0.000182,0.354873,0.004199,0.000893,0.006364,0.999863,0.748615,0.02797,0.003295,0.637615
4,file_1000,0.988615,0.000314,0.00387,0.003703,0.000922,0.986524,0.000402,0.002526,0.209117,0.028993,0.010647,0.017201,0.995287,0.955292,0.004538,0.005714,0.058605


## Train xgboost trees to make better predictions

In [357]:
val_predictions_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
0,test_628,0.071995,0.01981,0.018135,0.02046,0.015155,0.950393,0.018868,0.034112,0.073635,...,0,0,0,0,0,1,0,0,0,0
1,test_8842,0.07047,0.024685,0.020838,0.020732,0.016295,0.946937,0.013844,0.024798,0.065312,...,0,0,0,0,0,1,0,0,0,0
2,test_11877,0.760127,0.092261,0.118277,0.09968,0.101711,0.741155,0.107806,0.115136,0.207027,...,0,0,1,0,0,1,1,0,0,0
3,test_2413,0.229211,0.058638,0.096851,0.11294,0.040866,0.83326,0.051784,0.093998,0.156023,...,0,0,0,0,0,1,0,0,0,1
4,test_28838,0.071773,0.017712,0.019611,0.027369,0.016949,0.962932,0.013631,0.027329,0.066544,...,0,0,0,0,0,1,0,0,0,0


In [26]:
import xgboost as xgb

#### A simple try of xgb

In [27]:
from sklearn.model_selection import KFold

In [28]:
n_folds = 3

In [31]:
kf = KFold(n_splits=n_folds)

trainval_x = val_predictions_df[unique_tags].values
trainval_y = val_predictions_df[target_tags].values
    
for train_index, test_index in kf.split(trainval_x):
    train_x, val_x = trainval_x[train_index], trainval_x[test_index]
    train_y, val_y = trainval_y[train_index], trainval_y[test_index]
    
#     print(train_x.shape, train_y.shape)
#     print(train_x[:5, :], train_y[:5])
#     print(val_x[:5, :], val_y[:5])
    break

In [18]:
params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "error",
    "learning_rate": 0.01,
    "tree_method": 'exact',
    "n_estimators": 150,
    "max_depth": 3,
#     "subsample": subsample,
#     "colsample_bytree": colsample_bytree,
    "silent": False,    
}
num_boost_round = 2500
early_stopping_rounds = 100

In [19]:
tag_index = 2

In [21]:
sumpw = val_predictions_df[target_tags[tag_index]].sum()
sumnw = len(val_predictions_df[target_tags[tag_index]]) - sumpw    
scale_pos_weight = sumnw * 1.0 / sumpw

params['scale_pos_weight'] = scale_pos_weight

In [22]:
dtrain = xgb.DMatrix(train_x, train_y[:, tag_index])
dval = xgb.DMatrix(val_x, val_y[:, tag_index])

In [23]:
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [24]:
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

[0]	train-error:0.261858	eval-error:0.260665
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 100 rounds.
[1]	train-error:0.261858	eval-error:0.260665
[2]	train-error:0.261858	eval-error:0.260665
[3]	train-error:0.261858	eval-error:0.260665
[4]	train-error:0.261858	eval-error:0.260665
[5]	train-error:0.261858	eval-error:0.260665
[6]	train-error:0.261858	eval-error:0.260665
[7]	train-error:0.25634	eval-error:0.258247
[8]	train-error:0.261858	eval-error:0.260665
[9]	train-error:0.25634	eval-error:0.258247
[10]	train-error:0.25634	eval-error:0.258247
[11]	train-error:0.25634	eval-error:0.258247
[12]	train-error:0.25634	eval-error:0.258247
[13]	train-error:0.25634	eval-error:0.258247
[14]	train-error:0.25634	eval-error:0.258247
[15]	train-error:0.25634	eval-error:0.258247
[16]	train-error:0.25634	eval-error:0.258247
[17]	train-error:0.25634	eval-error:0.258247
[18]	train-error:0.25634	eval-error:0.258247
[

In [25]:
y_preds_ = gbm.predict(xgb.DMatrix(trainval_x), ntree_limit=gbm.best_iteration+1)

In [26]:
search_best_threshold(y_true[:, tag_index], y_preds_)

(0.54000000000000004, 0.25030916502542022)

In [28]:
search_best_threshold(y_true[:, tag_index], val_predictions_df[unique_tags].values[:, tag_index])

(0.080000000000000002, 0.19706884016377763)

In [30]:
thr1 = 0.540
thr2 = 0.08
st = 40
end = 80
print((y_preds_[st:end] > thr1).astype(np.uint8)) 
print((val_predictions_df[unique_tags].values[st:end, tag_index] > thr2).astype(np.uint8))
print(y_true[st:end, tag_index])

[0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0
 0 0 0]
[0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0
 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [315]:
score(y_true[:, tag_index], df[unique_tags[tag_index]].values > 0.09)

0.19568799780280144

Run CV with a random search of optimal parameters

In [68]:
import matplotlib.pylab as plt
%matplotlib inline

In [29]:
def generate_params(iter_num):
    if iter_num > 0:
        for z in range(iter_num):
            print("\n-- Iteration: {}".format(z))
            eta = np.random.uniform(0.05, 0.001)
            max_depth = np.random.randint(2, 6)
            subsample = np.random.uniform(0.5, 0.95)
            colsample_bytree = np.random.uniform(0.5, 0.95)
            yield eta, max_depth, subsample, colsample_bytree
    else:
        eta = 0.05
        max_depth = 3
        subsample = 0.8204967474962096
        colsample_bytree = 0.7089159774987868
        yield eta, max_depth, subsample, colsample_bytree

In [53]:
#eta_values = [0.05, 0.025, 0.01, 0.0075, 0.005, 0.0025, 0.001]

eval_metric = 'logloss'


best_params_for_tag_index = {}
for tag_index in range(0, len(unique_tags)):
    
    print("\n----------------\n Tag index: {}".format(tag_index))    
    seed = 2017
    n_folds = 5
    dtrainval = xgb.DMatrix(trainval_x, label=trainval_y[:, tag_index], feature_names=unique_tags)

    best_params_for_tag_index[tag_index] = {
        'test-%s-mean' % eval_metric: 1e10,
        'params': None,
    }
    
    sumpw = val_predictions_df[target_tags[tag_index]].sum()
    sumnw = len(val_predictions_df[target_tags[tag_index]]) - sumpw    
    scale_pos_weight = sumnw * 1.0 / sumpw
    
    iter_num = 15
#     iter_num = -1   
    gen = generate_params(iter_num)
    z = 0
    for (eta, max_depth, subsample, colsample_bytree) in gen:
        z += 1
        seed += z-1
        print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth,subsample,colsample_bytree))
        params = {
            "objective": "binary:logistic",
            "booster": "gbtree",
            "eval_metric": eval_metric
            ,
            "eta": eta,
            "tree_method": 'exact',
            "max_depth": max_depth,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "silent": 1,
            "seed": seed, 
            "scale_pos_weight": scale_pos_weight,
        }
        num_boost_round = 2000
        early_stopping_rounds = 100

        cvresult = xgb.cv(params, dtrain=dtrainval,
                           seed=params['seed'], 
                           num_boost_round=num_boost_round, 
                           early_stopping_rounds=early_stopping_rounds, nfold=n_folds, verbose_eval=False)
                
        min_test_logloss_mean = cvresult['test-%s-mean' % params['eval_metric']].min()
        if best_params_for_tag_index[tag_index]['test-%s-mean' % params['eval_metric']] > min_test_logloss_mean:
            best_params_for_tag_index[tag_index]['test-%s-mean' % params['eval_metric']] = min_test_logloss_mean
            best_params_for_tag_index[tag_index]['params'] = params
            print("Best cv result: ", cvresult.loc[cvresult.index[-1], :])
            print("Best params: ", params)
            
        


----------------
 Tag index: 0

-- Iteration: 0
XGBoost params. ETA: 0.023096741976626596, MAX_DEPTH: 2, SUBSAMPLE: 0.9294778786611162, COLSAMPLE_BY_TREE: 0.684612705671509
Best cv result:  test-logloss-mean     0.246641
test-logloss-std      0.009194
train-logloss-mean    0.233773
train-logloss-std     0.002104
Name: 1209, dtype: float64
Best params:  {'seed': 2017, 'scale_pos_weight': 2.2825856875356183, 'booster': 'gbtree', 'subsample': 0.9294778786611162, 'eta': 0.023096741976626596, 'silent': 1, 'objective': 'binary:logistic', 'colsample_bytree': 0.684612705671509, 'tree_method': 'exact', 'eval_metric': 'logloss', 'max_depth': 2}

-- Iteration: 1
XGBoost params. ETA: 0.023096741976626596, MAX_DEPTH: 2, SUBSAMPLE: 0.9294778786611162, COLSAMPLE_BY_TREE: 0.684612705671509

-- Iteration: 2
XGBoost params. ETA: 0.020594509346487103, MAX_DEPTH: 3, SUBSAMPLE: 0.8326168479972074, COLSAMPLE_BY_TREE: 0.8939410323228981
Best cv result:  test-logloss-mean     0.245555
test-logloss-std      0

KeyboardInterrupt: 

In [75]:
import pickle 
now = datetime.now()

xgb_best_params_filepath = os.path.join(GENERATED_DATA, 'xgb_best_params_%s.pkl' % str(now.strftime("%Y-%m-%d-%H-%M")))

with open(xgb_best_params_filepath, 'wb') as handle:
    pickle.dump(best_params_for_tag_index, handle, protocol=pickle.HIGHEST_PROTOCOL)    

In [36]:
best_params_for_tag_index = _best_params_for_tag_index

Train 17 binary classifiers

In [37]:
from sklearn.model_selection import train_test_split

for tag_index in range(len(unique_tags)):
    
    print("\n----------------\n Tag index: {}".format(tag_index))
    train_x, val_x, train_y, val_y = train_test_split(trainval_x, trainval_y, train_size=0.85)
    dtrain = xgb.DMatrix(train_x, train_y[:, tag_index])
    dval = xgb.DMatrix(val_x, val_y[:, tag_index])
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    params = best_params_for_tag_index[tag_index]['params']
    num_boost_round = 2500    
    early_stopping_rounds = 12

    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=False)                
    print("Best score: ", gbm.best_score)
    best_params_for_tag_index[tag_index]['gbm'] = gbm



----------------
 Tag index: 0
Best score:  0.108135

----------------
 Tag index: 1
Best score:  0.010417

----------------
 Tag index: 2
Best score:  0.166832

----------------
 Tag index: 3
Best score:  0.08168

----------------
 Tag index: 4
Best score:  0.062996

----------------
 Tag index: 5
Best score:  0.053406

----------------
 Tag index: 6
Best score:  0.022817

----------------
 Tag index: 7
Best score:  0.034226

----------------
 Tag index: 8
Best score:  0.205192

----------------
 Tag index: 9
Best score:  0.095734

----------------
 Tag index: 10
Best score:  0.084821

----------------
 Tag index: 11
Best score:  0.03588

----------------
 Tag index: 12
Best score:  0.058036

----------------
 Tag index: 13
Best score:  0.092097

----------------
 Tag index: 14
Best score:  0.076058

----------------
 Tag index: 15
Best score:  0.168485

----------------
 Tag index: 16
Best score:  0.100198


Compute best thresholds

In [38]:
best_thresholds = {}

for tag_index, tag in enumerate(unique_tags):
        
    dmat = xgb.DMatrix(trainval_x)
    gbm = best_params_for_tag_index[tag_index]['gbm']
    y_preds_ = gbm.predict(dmat, ntree_limit=gbm.best_iteration+1)        
    
    best_thresholds[tag], best_score = search_best_threshold(y_true[:, tag_index], y_preds_)
    print("%s | best threshold : %f with score: %f" % (tag, best_thresholds[tag], best_score))


agriculture | best threshold : 0.490000 with score: 0.893838
artisinal_mine | best threshold : 0.540000 with score: 0.800101
bare_ground | best threshold : 0.650000 with score: 0.481097
blooming | best threshold : 0.580000 with score: 0.348643
blow_down | best threshold : 0.530000 with score: 0.152452
clear | best threshold : 0.220000 with score: 0.979285
cloudy | best threshold : 0.630000 with score: 0.916326
conventional_mine | best threshold : 0.540000 with score: 0.325991
cultivation | best threshold : 0.520000 with score: 0.688206
habitation | best threshold : 0.520000 with score: 0.780279
haze | best threshold : 0.530000 with score: 0.778034
partly_cloudy | best threshold : 0.500000 with score: 0.949168
primary | best threshold : 0.250000 with score: 0.990040
road | best threshold : 0.500000 with score: 0.861935
selective_logging | best threshold : 0.710000 with score: 0.462641
slash_burn | best threshold : 0.540000 with score: 0.221661
water | best threshold : 0.450000 with scor

Boost test predictions

In [39]:
predictions_csv = glob(os.path.join(OUTPUT_PATH, "predictions_*2017-07-13-19-21.csv"))

prediction_df = pd.read_csv(predictions_csv[0]).dropna()
for filepath in predictions_csv[1:]:
    prediction_df = pd.concat([prediction_df, pd.read_csv(filepath).dropna()])
prediction_df.reset_index(inplace=True)   
prediction_df.drop('index', axis=1, inplace=True)

In [40]:
prediction_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,test_40308,0.980499,5.34357e-06,0.012761,1.083201e-06,1.659149e-05,0.026706,6.184839e-06,0.0001213845,0.247138,0.230338,0.00278,0.955976,0.994307,0.5091,6.428061e-05,0.005902686,0.049497
1,test_36168,0.008954,1.964079e-09,0.000126,0.0004806151,5.747763e-05,0.990458,1.74133e-08,7.339324e-10,0.002185,0.001168,0.00654,0.000672,0.999985,0.002998,0.0002039154,6.675383e-07,0.00499
2,test_6070,0.99853,1.298099e-10,0.004025,1.012986e-10,1.42529e-09,0.999737,2.468224e-13,2.351865e-06,0.095436,0.819567,0.000283,3.6e-05,0.947903,0.991666,2.751586e-07,0.0002056249,0.032477
3,test_5483,0.003073,3.214428e-09,5.3e-05,0.03451585,0.001510115,0.997953,1.104697e-08,2.678857e-10,0.001081,0.000312,0.000302,0.001118,0.999984,0.001043,0.003596786,6.073845e-07,0.00227
4,test_5532,0.004007,6.949826e-08,0.000144,0.1529595,0.006396817,0.998306,3.806774e-08,3.966572e-09,0.001861,0.000672,0.000304,0.000979,0.999972,0.002848,0.02929822,2.036211e-06,0.003847


In [41]:
y_preds_init = prediction_df[unique_tags].values
y_preds = np.zeros_like(y_preds_init)

for tag_index in range(len(unique_tags)):
    
    print("\n----------------\n Tag index: {}".format(tag_index))
    dtest = xgb.DMatrix(y_preds_init)
    gbm = best_params_for_tag_index[tag_index]['gbm']
    y_preds_ = gbm.predict(dtest, ntree_limit=gbm.best_iteration+1)        
    y_preds[:, tag_index] = y_preds_



----------------
 Tag index: 0

----------------
 Tag index: 1

----------------
 Tag index: 2

----------------
 Tag index: 3

----------------
 Tag index: 4

----------------
 Tag index: 5

----------------
 Tag index: 6

----------------
 Tag index: 7

----------------
 Tag index: 8

----------------
 Tag index: 9

----------------
 Tag index: 10

----------------
 Tag index: 11

----------------
 Tag index: 12

----------------
 Tag index: 13

----------------
 Tag index: 14

----------------
 Tag index: 15

----------------
 Tag index: 16


In [42]:
df = prediction_df.copy()
df[unique_tags] = y_preds

In [43]:
df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,test_40308,0.522231,0.45251,0.252149,0.389114,0.455467,0.16127,0.123053,0.455544,0.522468,0.544217,0.44323,0.866382,0.750729,0.503381,0.268825,0.499394,0.248423
1,test_36168,0.47712,0.45251,0.15965,0.389114,0.455467,0.807806,0.122661,0.455544,0.391684,0.410128,0.443108,0.1369,0.729628,0.494106,0.260546,0.451676,0.141758
2,test_6070,0.522231,0.45251,0.222007,0.389114,0.455467,0.83,0.13256,0.455544,0.527279,0.583079,0.440586,0.190396,0.609088,0.505878,0.259275,0.451676,0.23831
3,test_5483,0.47712,0.45251,0.158208,0.533957,0.478718,0.837238,0.122661,0.455544,0.39145,0.412522,0.43547,0.134498,0.76162,0.494106,0.32732,0.451676,0.127105
4,test_5532,0.47712,0.45251,0.16117,0.601405,0.455475,0.837238,0.122661,0.455544,0.404547,0.414466,0.43547,0.135214,0.76162,0.494106,0.620745,0.451676,0.132539


In [44]:
def compute_mean(df):
    gb = df.groupby('image_name')
    df2 = gb.agg(np.mean).reset_index()
    return df2

In [45]:
mean_df = compute_mean(df)

In [46]:
len(mean_df), len(df)

(61191, 183573)

In [47]:
create_submission(mean_df, info="squeezenet21_blended_3_folds", thresholds=best_thresholds)

## Storages

Last best

In [35]:
_best_params_for_tag_index = {
0: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9097441644531741,
'eta': 0.012063976013602831,
'eval_metric': 'error',
'max_depth': 3,
'objective': 'binary:logistic',
'scale_pos_weight': 2.2953291651342407,
'seed': 2027,
'silent': 1,
'subsample': 0.8135263419352342,
'tree_method': 'exact'},
'test-error-mean': 0.18432560000000001},
1: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9247718213959808,
'eta': 0.048774868242772856,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 119.17883755588673,
'seed': 2045,
'silent': 1,
'subsample': 0.8232135887360947,
'tree_method': 'exact'},
'test-error-mean': 0.0096606000000000018},
2: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 45.478386167146972,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.035527999999999997},
3: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 117.76288659793815,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.0097225999999999996},
4: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.5079279690320685,
'eta': 0.04557168327562415,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 404.22613065326635,
'seed': 2108,
'silent': 1,
'subsample': 0.8351662492654948,
'tree_method': 'exact'},
'test-error-mean': 0.00248},
5: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 0.42622167984294584,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.13793419999999998},
6: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 18.122599004031301,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.032638800000000003},
7: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.5079279690320685,
'eta': 0.04557168327562415,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 383.0,
'seed': 2108,
'silent': 1,
'subsample': 0.8351662492654948,
'tree_method': 'exact'},
'test-error-mean': 0.0027032000000000002},
8: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 8.2033782241497377,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.22521079999999999},
9: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 10.04960263085777,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.1460564},
10: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.5079279690320685,
'eta': 0.04557168327562415,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 13.936099277644008,
'seed': 2108,
'silent': 1,
'subsample': 0.8351662492654948,
'tree_method': 'exact'},
'test-error-mean': 0.065190800000000007},
11: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 4.569060773480663,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.1443574},
12: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 0.080400326906844941,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.065996799999999994},
13: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9097441644531741,
'eta': 0.012063976013602831,
'eval_metric': 'error',
'max_depth': 3,
'objective': 'binary:logistic',
'scale_pos_weight': 4.051049170059505,
'seed': 2027,
'silent': 1,
'subsample': 0.8135263419352342,
'tree_method': 'exact'},
'test-error-mean': 0.17645080000000002},
14: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9247718213959808,
'eta': 0.048774868242772856,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 124.21739130434783,
'seed': 2045,
'silent': 1,
'subsample': 0.8232135887360947,
'tree_method': 'exact'},
'test-error-mean': 0.0087554},
15: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 207.9119170984456,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.0049351999999999998},
16: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 4.4593460158418523,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.25375720000000002}}

In [122]:
_best_params_for_tag_index = {
0: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.38209179999999998},
 1: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.036067200000000001},
 2: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.6334399849809398,
   'eta': 0.15055304521905347,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2019,
   'silent': 1,
   'subsample': 0.6186701247268119,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.086130599999999988},
 3: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5045164695165618,
   'eta': 0.1428101678052025,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.7215305289307261,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.041870600000000001},
 4: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.7089159774987868,
   'eta': 0.20210354378354375,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2017,
   'silent': 1,
   'subsample': 0.8204967474962096,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.015638200000000001},
 5: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.30505919999999997},
 6: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.068178600000000006},
 7: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.7089159774987868,
   'eta': 0.20210354378354375,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2017,
   'silent': 1,
   'subsample': 0.8204967474962096,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.013706000000000001},
 8: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.8579978891053311,
   'eta': 0.18574545610881862,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2044,
   'silent': 1,
   'subsample': 0.8611335020320647,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.29456899999999997},
 9: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5045164695165618,
   'eta': 0.1428101678052025,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.7215305289307261,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.20715659999999997},
 10: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.7089159774987868,
   'eta': 0.20210354378354375,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.8204967474962096,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.1200102},
 11: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.29197859999999998},
 12: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.1244864},
 13: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5045164695165618,
   'eta': 0.1428101678052025,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.7215305289307261,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.3228318},
 14: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.8398837534294123,
   'eta': 0.14522170765829945,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2022,
   'silent': 1,
   'subsample': 0.8734804475952236,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.041907600000000003},
 15: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.586741114680066,
   'eta': 0.08503519422605446,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2052,
   'silent': 1,
   'subsample': 0.9395104239451562,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.027125},
 16: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.38378999999999996}
}

{0: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.38209179999999998},
 1: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.036067200000000001},
 2: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.6334399849809398,
   'eta': 0.15055304521905347,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2019,
   'silent': 1,
   'subsample': 0.6186701247268119,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.0861305999999

Last saved

In [48]:
best_params_for_tag_index = {0: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5829334344789511,
   'eta': 0.024988972967877485,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 2.2825856875356183,
   'seed': 2038,
   'silent': 1,
   'subsample': 0.7387515922133331,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.24204939999999997},
 1: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5151974580887335,
   'eta': 0.02653115436725616,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 114.2,
   'seed': 2023,
   'silent': 1,
   'subsample': 0.8162897671482163,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.017903800000000001},
 2: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.9152266627015204,
   'eta': 0.03617273607658269,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 43.849833147942157,
   'seed': 2122,
   'silent': 1,
   'subsample': 0.6340682509577085,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.095588800000000002},
 3: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5151974580887335,
   'eta': 0.02653115436725616,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 124.21739130434783,
   'seed': 2023,
   'silent': 1,
   'subsample': 0.8162897671482163,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.042751600000000001}
}