# Predictions blender

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from glob import glob
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

import shutil

# Project
# Put ../common (resolved against the current working directory) on the import path, once.
project_common_path = os.path.abspath(
    os.path.join(os.path.dirname('.'), '..', 'common')
)
if project_common_path not in sys.path:
    sys.path.append(project_common_path)

In [3]:
os.environ['THEANO_FLAGS'] = 'device=cpu'

import numpy as np
import pandas as pd

from data_utils import get_id_type_list_for_class, GENERATED_DATA, OUTPUT_PATH
from test_utils import create_submission

Using Theano backend.


### Load predictions on trainval dataset

In [5]:
from data_utils import unique_tags, get_label

# Ground-truth column names: every tag name prefixed with "target_".
target_tags = ['target_' + tag_name for tag_name in unique_tags]

# Locations (under GENERATED_DATA) of the consolidated validation predictions of each model.
val_predictions_resnet_filepath = os.path.join(
    GENERATED_DATA, "val_predictions_ResNet50_2017-07-20-00-09.csv"
)
val_predictions_squeezenet_filepath = os.path.join(
    GENERATED_DATA, "val_predictions_SqueezeNet21_2017-07-19-23-38.csv"
)
val_predictions_vgg19_filepath = os.path.join(
    GENERATED_DATA, "val_predictions_deep_model_vgg19_train_LB092919_prob.csv"
)
val_predictions_weirdcnn_filepath = os.path.join(
    GENERATED_DATA, "val_predictions_deep_model_train_LB092655_prob.csv"
)

def get_val_predictions_df(val_predictions_filepath, search_prefix="val_predictions_*.csv"):
    """
    Load the consolidated validation predictions of one model, building the cache on first use.

    If `val_predictions_filepath` already exists it is simply read back. Otherwise every CSV
    under OUTPUT_PATH matching `search_prefix` is read, rows containing missing values are
    dropped, the files are stacked, an integer `image_id` is parsed from `image_name`, the
    labels returned by `get_label` are stored in the `target_<tag>` columns, and the result is
    written to `val_predictions_filepath`.

    :param val_predictions_filepath: path of the cached, consolidated CSV
    :param search_prefix: glob pattern, relative to OUTPUT_PATH, of the CSV files to merge
    :return: pandas DataFrame with the predictions and the target columns
    :raises FileNotFoundError: if the cache is missing and no file matches `search_prefix`
    """
    if os.path.exists(val_predictions_filepath):
        return pd.read_csv(val_predictions_filepath)

    search_pattern = os.path.join(OUTPUT_PATH, search_prefix)
    val_predictions_csv = glob(search_pattern)
    if not val_predictions_csv:
        # Previously this surfaced as a bare IndexError on val_predictions_csv[0]
        raise FileNotFoundError(
            "No validation predictions found for pattern '%s'" % search_pattern
        )

    # Single concat instead of growing the frame file by file
    df = pd.concat([pd.read_csv(filepath).dropna() for filepath in val_predictions_csv])
    df = df.reset_index(drop=True)
    df['image_id'] = df['image_name'].apply(lambda x: int(x[len('train_'):]))
    for t in target_tags:
        df[t] = ''

    def fill_target_tags(row):
        # row is laid out as [image_id, target_<tag>, ...]; use positional access explicitly
        image_id = row.iloc[0]
        row.iloc[1:] = get_label(image_id, "Train_jpg")
        return row

    cols = ['image_id', ] + target_tags
    df[cols] = df[cols].apply(fill_target_tags, axis=1)

    df.to_csv(val_predictions_filepath, index=False)
    return df


# Read each model's consolidated validation predictions (built and cached on first use).
val_predictions_resnet_df = get_val_predictions_df(
    val_predictions_resnet_filepath,
    search_prefix="val_predictions_ResNet50*_2017-07-20-00-09.csv",
)
val_predictions_squeezenet_df = get_val_predictions_df(
    val_predictions_squeezenet_filepath,
    search_prefix="val_predictions_SqueezeNet21*_2017-07-19-23-38.csv",
)
val_predictions_vgg19_df = get_val_predictions_df(
    val_predictions_vgg19_filepath,
    search_prefix="vgg19/deep_model_vgg19_train_LB092919_prob.csv",
)
val_predictions_weirdcnn_df = get_val_predictions_df(
    val_predictions_weirdcnn_filepath,
    search_prefix="custom_weird_model/deep_model_train_LB092655_prob.csv",
)

# val_predictions_vgg19_df = pd.read_csv(os.path.join(OUTPUT_PATH, "vgg19", "deep_model_vgg19_train_LB092919_prob.csv"))
# val_predictions_weirdcnn_df = pd.read_csv(os.path.join(OUTPUT_PATH, "custom_weird_model", "deep_model_train_LB092655_prob.csv"))

In [11]:
len(val_predictions_resnet_df['image_name'].unique())

40448

In [6]:
print(len(val_predictions_resnet_df))
val_predictions_resnet_df.sort_values(by=['image_name']).head()

40448


Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
832,train_0,0.008284,4.1e-05,0.000914,0.003209,0.000309,0.703545,0.000538,6.1e-05,0.004989,...,0,0,0,1,0,1,0,0,0,0
19884,train_1,0.737512,0.006013,0.007813,0.000397,0.000697,0.995429,8.9e-05,0.000155,0.123096,...,0,0,0,0,0,1,0,0,0,1
26039,train_10,0.903812,0.001347,0.122508,0.001115,0.001388,0.986162,0.000764,0.002425,0.402746,...,0,0,0,0,0,1,0,0,1,1
17955,train_100,0.177406,0.000277,0.009606,8.5e-05,1.2e-05,0.868908,0.00381,3.9e-05,0.097001,...,0,0,0,0,0,0,0,0,0,1
1344,train_1000,0.135083,0.000307,0.001668,0.000686,0.000394,4.2e-05,0.000546,0.000747,0.037258,...,0,0,0,0,1,1,0,0,0,0


In [12]:
len(val_predictions_squeezenet_df['image_name'].unique())

40320

In [13]:
print(len(val_predictions_squeezenet_df))
val_predictions_squeezenet_df.sort_values(by=['image_name']).head()

40320


Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
96,train_0,0.018483,1.909171e-08,0.000393,0.000732,0.000133,0.747042,2.419611e-05,3.019786e-08,0.004758,...,0,0,0,1,0,1,0,0,0,0
24728,train_1,0.736668,0.000649497,0.022689,0.000143,8.9e-05,0.999707,5.364234e-09,0.0003617941,0.157969,...,0,0,0,0,0,1,0,0,0,1
28474,train_10,0.566284,0.01143873,0.063701,0.000837,0.000484,0.983316,4.179635e-06,0.001101662,0.271738,...,0,0,0,0,0,1,0,0,1,1
24443,train_100,0.212349,0.0008925403,0.037281,0.000136,1.2e-05,0.945379,0.0004714176,0.0005605649,0.028581,...,0,0,0,0,0,0,0,0,0,1
3375,train_1000,0.031265,1.494139e-07,0.000126,0.000153,0.000327,0.000316,0.0001080615,2.777303e-08,0.006959,...,0,0,0,0,1,1,0,0,0,0


In [8]:
print(len(val_predictions_vgg19_df))
val_predictions_vgg19_df[['image_name',] + unique_tags + target_tags].head()

40479


Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
0,train_0,0.003752,1.428458e-09,8.8e-05,0.00023,1.8e-05,0.581344,2.108085e-05,2.15084e-09,0.00146,...,0,0,0,1,0,1,0,0,0,0
1,train_1,0.799365,0.0001561381,0.00974,0.000171,0.000109,0.998879,1.573375e-06,0.0001418633,0.128822,...,0,0,0,0,0,1,0,0,0,1
2,train_2,0.001674,1.083242e-09,1.8e-05,0.000292,0.000128,0.999015,1.269098e-06,8.769292e-11,0.000902,...,0,0,0,0,0,1,0,0,0,0
3,train_3,0.005073,3.921103e-08,8.6e-05,0.006642,0.000504,0.998185,2.577778e-06,3.010564e-09,0.002385,...,0,0,0,0,0,1,0,0,0,0
4,train_4,0.879332,0.001331866,0.017324,0.001006,0.000401,0.998223,4.311219e-07,0.0003768842,0.343372,...,0,0,1,0,0,1,1,0,0,0


In [9]:
print(len(val_predictions_weirdcnn_df))
val_predictions_weirdcnn_df[['image_name',] + unique_tags + target_tags].head()

40479


Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
0,train_0,0.001094,1.497006e-10,2.8e-05,0.000123,7e-06,0.472721,5.882897e-07,2.400205e-09,0.000511,...,0,0,0,1,0,1,0,0,0,0
1,train_1,0.896112,0.0001672402,0.007995,0.000113,0.000506,0.999508,3.187587e-07,4.936486e-06,0.137175,...,0,0,0,0,0,1,0,0,0,1
2,train_2,0.001006,6.569336e-13,2.3e-05,1.9e-05,1.4e-05,0.99962,3.683754e-08,8.408717e-11,0.000468,...,0,0,0,0,0,1,0,0,0,0
3,train_3,0.010779,1.178159e-08,0.000234,0.014649,0.000258,0.998615,9.127531e-08,1.763152e-07,0.005485,...,0,0,0,0,0,1,0,0,0,0
4,train_4,0.796989,0.0003531893,0.085411,0.000178,0.000493,0.998534,3.18112e-08,0.0001675168,0.160135,...,0,0,1,0,0,1,1,0,0,0


In [14]:

def get_optimal_thresholds(y_true, y_preds):
    """
    Greedy, tag-by-tag search of the decision thresholds that maximise `score`.

    Tags are visited in the order of `unique_tags`. For the current tag every candidate of
    np.arange(0.0, 1.0, 0.01) is tried while the thresholds already retained for earlier tags
    are kept and the tags not visited yet stay at 0.0. A candidate is retained only when it
    strictly improves the best score seen so far, which is shared across all tags.

    :param y_true: ground-truth labels, passed unchanged to `score`
    :param y_preds: predicted values, compared element-wise against the threshold list
    :return: tuple (list with one threshold per tag, best score reached)
    """
    best_thresholds = [0.0 for _ in range(len(unique_tags))]
    best_score = 0
    for tag_position, tag_name in enumerate(unique_tags):
        print("%s : best_score=" % tag_name, end="")
        # Work on a copy so that only improving candidates reach best_thresholds
        trial_thresholds = best_thresholds[:]
        for candidate in np.arange(0.0, 1.0, 0.01):
            trial_thresholds[tag_position] = candidate
            trial_score = score(y_true, y_preds > trial_thresholds)
            if trial_score > best_score:
                best_score, best_thresholds[tag_position] = trial_score, candidate
        print("%f, best_threshold=%f" % (best_score, best_thresholds[tag_position]))
    return best_thresholds, best_score

### Compute the F2 score on the validation set with the default threshold (0.5)

In [15]:
from metrics import score

In [18]:
# Binarise the SqueezeNet validation outputs at the 0.5 cut-off and score them.
y_true = val_predictions_squeezenet_df.loc[:, target_tags].values
y_preds = np.greater(val_predictions_squeezenet_df.loc[:, unique_tags].values, 0.5).astype(np.uint8)
score(y_true, y_preds)

0.90056961124724

In [19]:
# Binarise the ResNet validation outputs at the 0.5 cut-off and score them.
y_true = val_predictions_resnet_df.loc[:, target_tags].values
y_preds = np.greater(val_predictions_resnet_df.loc[:, unique_tags].values, 0.5).astype(np.uint8)
score(y_true, y_preds)

0.89219838234177973

In [20]:
# Per-tag threshold search on the SqueezeNet validation predictions.
y_true, y_preds = (
    val_predictions_squeezenet_df[target_tags].values,
    val_predictions_squeezenet_df[unique_tags].values,
)

optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)

agriculture : best_score=0.491415, best_threshold=0.130000
artisinal_mine : best_score=0.509142, best_threshold=0.060000
bare_ground : best_score=0.526236, best_threshold=0.100000
blooming : best_score=0.545969, best_threshold=0.120000
blow_down : best_score=0.568343, best_threshold=0.220000
clear : best_score=0.573190, best_threshold=0.140000
cloudy : best_score=0.596251, best_threshold=0.050000
conventional_mine : best_score=0.623577, best_threshold=0.070000
cultivation : best_score=0.644170, best_threshold=0.210000
habitation : best_score=0.671157, best_threshold=0.150000
haze : best_score=0.701578, best_threshold=0.110000
partly_cloudy : best_score=0.731755, best_threshold=0.130000
primary : best_score=0.733095, best_threshold=0.240000
road : best_score=0.765562, best_threshold=0.220000
selective_logging : best_score=0.814677, best_threshold=0.130000
slash_burn : best_score=0.872333, best_threshold=0.200000
water : best_score=0.923366, best_threshold=0.220000
[0.13, 0.0599999999999

In [21]:
# Per-tag threshold search on the ResNet validation predictions.
y_true, y_preds = (
    val_predictions_resnet_df[target_tags].values,
    val_predictions_resnet_df[unique_tags].values,
)

optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)

agriculture : best_score=0.491154, best_threshold=0.090000
artisinal_mine : best_score=0.508878, best_threshold=0.120000
bare_ground : best_score=0.525933, best_threshold=0.090000
blooming : best_score=0.545513, best_threshold=0.060000
blow_down : best_score=0.567846, best_threshold=0.040000
clear : best_score=0.572521, best_threshold=0.090000
cloudy : best_score=0.595493, best_threshold=0.080000
conventional_mine : best_score=0.622763, best_threshold=0.080000
cultivation : best_score=0.642889, best_threshold=0.170000
habitation : best_score=0.669411, best_threshold=0.160000
haze : best_score=0.699623, best_threshold=0.130000
partly_cloudy : best_score=0.729248, best_threshold=0.110000
primary : best_score=0.730643, best_threshold=0.170000
road : best_score=0.762386, best_threshold=0.170000
selective_logging : best_score=0.811153, best_threshold=0.140000
slash_burn : best_score=0.868476, best_threshold=0.270000
water : best_score=0.918699, best_threshold=0.200000
[0.089999999999999997,

In [22]:
# Per-tag threshold search on the custom CNN ("weirdcnn") validation predictions.
y_true, y_preds = (
    val_predictions_weirdcnn_df[target_tags].values,
    val_predictions_weirdcnn_df[unique_tags].values,
)

optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)

agriculture : best_score=0.491805, best_threshold=0.130000
artisinal_mine : best_score=0.509645, best_threshold=0.100000
bare_ground : best_score=0.527131, best_threshold=0.130000
blooming : best_score=0.547083, best_threshold=0.120000
blow_down : best_score=0.569628, best_threshold=0.100000
clear : best_score=0.574745, best_threshold=0.150000
cloudy : best_score=0.598004, best_threshold=0.110000
conventional_mine : best_score=0.625563, best_threshold=0.140000
cultivation : best_score=0.647085, best_threshold=0.160000
habitation : best_score=0.674879, best_threshold=0.190000
haze : best_score=0.705928, best_threshold=0.150000
partly_cloudy : best_score=0.736810, best_threshold=0.190000
primary : best_score=0.738335, best_threshold=0.200000
road : best_score=0.771630, best_threshold=0.210000
selective_logging : best_score=0.821443, best_threshold=0.210000
slash_burn : best_score=0.879773, best_threshold=0.150000
water : best_score=0.931936, best_threshold=0.210000
[0.13, 0.1000000000000

In [23]:
# Per-tag threshold search on the VGG19 validation predictions.
y_true, y_preds = (
    val_predictions_vgg19_df[target_tags].values,
    val_predictions_vgg19_df[unique_tags].values,
)

optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)

agriculture : best_score=0.492076, best_threshold=0.140000
artisinal_mine : best_score=0.509976, best_threshold=0.180000
bare_ground : best_score=0.527524, best_threshold=0.110000
blooming : best_score=0.547497, best_threshold=0.110000
blow_down : best_score=0.570068, best_threshold=0.090000
clear : best_score=0.575244, best_threshold=0.140000
cloudy : best_score=0.598613, best_threshold=0.090000
conventional_mine : best_score=0.626214, best_threshold=0.100000
cultivation : best_score=0.647910, best_threshold=0.170000
habitation : best_score=0.675766, best_threshold=0.130000
haze : best_score=0.706881, best_threshold=0.120000
partly_cloudy : best_score=0.737910, best_threshold=0.190000
primary : best_score=0.739460, best_threshold=0.190000
road : best_score=0.773407, best_threshold=0.180000
selective_logging : best_score=0.823380, best_threshold=0.130000
slash_burn : best_score=0.881838, best_threshold=0.140000
water : best_score=0.935266, best_threshold=0.180000
[0.14000000000000001, 

## Test merges: mean, median, max

In [24]:
def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged when it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Same outcome as the deprecated pd.to_numeric(..., errors='ignore')
        return column


def _merge_predictions(predictions, how):
    """
    Stack the prediction frames and aggregate all rows sharing the same `image_name`.

    :param predictions: non-empty sequence of DataFrames, each with an `image_name` column
    :param how: pandas aggregation name applied per group ('mean', 'median', 'max', ...)
    :return: DataFrame with one row per `image_name` (restored as a regular column)
    :raises ValueError: if `predictions` is empty
    """
    frames = list(predictions)
    if not frames:
        raise ValueError("predictions must contain at least one DataFrame")
    # One concat for all frames instead of re-copying the accumulated frame at every step
    stacked = pd.concat(frames, axis=0)
    stacked = stacked.apply(_to_numeric_or_keep)
    return stacked.groupby('image_name').agg(how).reset_index()


def compute_mean(predictions):
    """Merge the prediction frames by taking, per image, the mean of every column."""
    return _merge_predictions(predictions, 'mean')


def compute_median(predictions):
    """Merge the prediction frames by taking, per image, the median of every column."""
    return _merge_predictions(predictions, 'median')


def compute_max(predictions):
    """Merge the prediction frames by taking, per image, the maximum of every column."""
    return _merge_predictions(predictions, 'max')

# from scipy.stats import gmean

# def compute_gmean(predictions):
#     df = predictions[0]
#     for p in predictions[1:]:
#         df = pd.concat([df, p], axis=0)
#     df = df.apply(pd.to_numeric, errors='ignore')
#     gb = df.groupby('image_name')
#     df2 = gb.agg(gmean).reset_index()
#     return df2


In [38]:
val_predictions = [val_predictions_resnet_df, val_predictions_squeezenet_df, val_predictions_vgg19_df, val_predictions_weirdcnn_df]
# val_predictions = [val_predictions_vgg19_df, val_predictions_weirdcnn_df]
 

In [39]:
# methods = [compute_mean, compute_median, compute_max, compute_gmean] 
methods = [compute_mean, compute_median, compute_max, ]  

In [26]:
# Score every merge strategy on the validation set with one 0.35 cut-off shared by all tags.
for merge_method in methods:
    df = merge_method(val_predictions)
    y_true = df.loc[:, target_tags].values
    y_preds = np.greater(df.loc[:, unique_tags].values, 0.35).astype(np.uint8)
    print(merge_method, score(y_true, y_preds))

<function compute_mean at 0x00000000107D8488> 0.929983637127
<function compute_median at 0x00000000107D8BF8> 0.929983637127
<function compute_max at 0x00000000107D8B70> 0.933362194498


### Search thresholds

In [27]:

# Run the per-tag threshold search on the output of every merge strategy.
for merge_method in methods:
    print("\nMethod : ", merge_method)
    df = merge_method(val_predictions)
    y_true, y_preds = df[target_tags].values, df[unique_tags].values
    optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
    print("-----")
    print(optimal_thresholds, best_score)



Method :  <function compute_mean at 0x00000000107D8488>
agriculture : best_score=0.492010, best_threshold=0.120000
artisinal_mine : best_score=0.509889, best_threshold=0.210000
bare_ground : best_score=0.527463, best_threshold=0.120000
blooming : best_score=0.547441, best_threshold=0.110000
blow_down : best_score=0.570007, best_threshold=0.100000
clear : best_score=0.575199, best_threshold=0.140000
cloudy : best_score=0.598535, best_threshold=0.100000
conventional_mine : best_score=0.626111, best_threshold=0.250000
cultivation : best_score=0.647873, best_threshold=0.170000
habitation : best_score=0.675814, best_threshold=0.220000
haze : best_score=0.706986, best_threshold=0.130000
partly_cloudy : best_score=0.738044, best_threshold=0.220000
primary : best_score=0.739580, best_threshold=0.210000
road : best_score=0.773432, best_threshold=0.210000
selective_logging : best_score=0.823410, best_threshold=0.160000
slash_burn : best_score=0.881879, best_threshold=0.150000
water : best_score

## The best score with optimized thresholds is obtained with the mean merge

In [28]:
# Merge the validation predictions with the mean, then tune the per-tag thresholds on the merge.
df = compute_mean(val_predictions)
y_true, y_preds = df[target_tags].values, df[unique_tags].values
optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)


agriculture : best_score=0.492010, best_threshold=0.120000
artisinal_mine : best_score=0.509889, best_threshold=0.210000
bare_ground : best_score=0.527463, best_threshold=0.120000
blooming : best_score=0.547441, best_threshold=0.110000
blow_down : best_score=0.570007, best_threshold=0.100000
clear : best_score=0.575199, best_threshold=0.140000
cloudy : best_score=0.598535, best_threshold=0.100000
conventional_mine : best_score=0.626111, best_threshold=0.250000
cultivation : best_score=0.647873, best_threshold=0.170000
habitation : best_score=0.675814, best_threshold=0.220000
haze : best_score=0.706986, best_threshold=0.130000
partly_cloudy : best_score=0.738044, best_threshold=0.220000
primary : best_score=0.739580, best_threshold=0.210000
road : best_score=0.773432, best_threshold=0.210000
selective_logging : best_score=0.823410, best_threshold=0.160000
slash_burn : best_score=0.881879, best_threshold=0.150000
water : best_score=0.935187, best_threshold=0.220000
[0.12, 0.2099999999999

In [29]:
optimal_thresholds

[0.12,
 0.20999999999999999,
 0.12,
 0.11,
 0.10000000000000001,
 0.14000000000000001,
 0.10000000000000001,
 0.25,
 0.17000000000000001,
 0.22,
 0.13,
 0.22,
 0.20999999999999999,
 0.20999999999999999,
 0.16,
 0.14999999999999999,
 0.22]

## Score with optimized thresholds using the max merge

In [29]:
# Merge the validation predictions with the maximum, then tune the per-tag thresholds on the merge.
df = compute_max(val_predictions)
y_true, y_preds = df[target_tags].values, df[unique_tags].values
optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)


agriculture : best_score=0.491989, best_threshold=0.140000
artisinal_mine : best_score=0.509859, best_threshold=0.220000
bare_ground : best_score=0.527397, best_threshold=0.140000
blooming : best_score=0.547355, best_threshold=0.130000
blow_down : best_score=0.569904, best_threshold=0.190000
clear : best_score=0.575092, best_threshold=0.160000
cloudy : best_score=0.598405, best_threshold=0.180000
conventional_mine : best_score=0.625963, best_threshold=0.160000
cultivation : best_score=0.647619, best_threshold=0.200000
habitation : best_score=0.675525, best_threshold=0.260000
haze : best_score=0.706618, best_threshold=0.180000
partly_cloudy : best_score=0.737622, best_threshold=0.210000
primary : best_score=0.739171, best_threshold=0.220000
road : best_score=0.772901, best_threshold=0.230000
selective_logging : best_score=0.822757, best_threshold=0.200000
slash_burn : best_score=0.881106, best_threshold=0.200000
water : best_score=0.934306, best_threshold=0.320000
[0.14000000000000001, 

In [30]:
optimal_thresholds

[0.14000000000000001,
 0.22,
 0.14000000000000001,
 0.13,
 0.19,
 0.16,
 0.17999999999999999,
 0.16,
 0.20000000000000001,
 0.26000000000000001,
 0.17999999999999999,
 0.20999999999999999,
 0.22,
 0.23000000000000001,
 0.20000000000000001,
 0.20000000000000001,
 0.32000000000000001]

Median


In [40]:
# Merge the validation predictions with the median, then tune the per-tag thresholds on the merge.
df = compute_median(val_predictions)
y_true, y_preds = df[target_tags].values, df[unique_tags].values
optimal_thresholds, best_score = get_optimal_thresholds(y_true, y_preds)
print(optimal_thresholds, best_score)


agriculture : best_score=0.491900, best_threshold=0.110000
artisinal_mine : best_score=0.509758, best_threshold=0.120000
bare_ground : best_score=0.527241, best_threshold=0.090000
blooming : best_score=0.547177, best_threshold=0.100000
blow_down : best_score=0.569721, best_threshold=0.050000
clear : best_score=0.574864, best_threshold=0.130000
cloudy : best_score=0.598202, best_threshold=0.100000
conventional_mine : best_score=0.625758, best_threshold=0.070000
cultivation : best_score=0.647307, best_threshold=0.170000
habitation : best_score=0.675114, best_threshold=0.190000
haze : best_score=0.706175, best_threshold=0.110000
partly_cloudy : best_score=0.737164, best_threshold=0.160000
primary : best_score=0.738699, best_threshold=0.210000
road : best_score=0.772358, best_threshold=0.210000
selective_logging : best_score=0.822162, best_threshold=0.130000
slash_burn : best_score=0.880425, best_threshold=0.110000
water : best_score=0.933371, best_threshold=0.190000
[0.11, 0.12, 0.0899999

In [41]:
optimal_thresholds

[0.11,
 0.12,
 0.089999999999999997,
 0.10000000000000001,
 0.050000000000000003,
 0.13,
 0.10000000000000001,
 0.070000000000000007,
 0.17000000000000001,
 0.19,
 0.11,
 0.16,
 0.20999999999999999,
 0.20999999999999999,
 0.13,
 0.11,
 0.19]


## Compute test dataset probabilities with merging and best thresholds

In [42]:
# Test-set probability files: two timestamped runs globbed under OUTPUT_PATH,
# followed by the two individually named model outputs.
predictions_csv1 = glob(
    os.path.join(OUTPUT_PATH, "predictions_*2017-07-18-09-28.csv")
)
predictions_csv2 = glob(
    os.path.join(OUTPUT_PATH, "predictions_*2017-07-13-19-21.csv")
)
predictions_csv = predictions_csv1 + predictions_csv2 + [
    os.path.join(OUTPUT_PATH, "custom_weird_model", "deep_model_test_LB092655_prob.csv"),
    os.path.join(OUTPUT_PATH, "vgg19", "deep_model_vgg19_test_LB092919_prob.csv"),
]

predictions_csv

['/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_1_ResNet50_all_classes_fold=0_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_2_ResNet50_all_classes_fold=1_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_3_ResNet50_all_classes_fold=2_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_4_ResNet50_all_classes_fold=3_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_5_ResNet50_all_classes_fold=4_seed=2017_2017-07-18-09-28.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/predictions_1_1_SqueezeNet21_all_classes_fold=0_seed=2017_2017-07-13-19-21.csv',
 '/Users/vfomin/Documents/ML/Kaggle/PlanetAmazonRainForest/common/../output/pr

In [43]:
# One DataFrame per test prediction file, in the order of predictions_csv.
predictions = [pd.read_csv(csv_filepath) for csv_filepath in predictions_csv]

Median

In [44]:
median_predictions_df = compute_median(predictions)

In [45]:
median_predictions_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,file_0,0.008977,7.8e-05,0.00085,0.022292,0.001696,0.986433,0.00033,6.5e-05,0.004424,0.001434,0.009062,0.001812,0.999739,0.005604,0.004553,8.3e-05,0.006899
1,file_1,0.029296,0.000643,0.003336,0.027415,0.006693,0.988783,0.000508,0.000671,0.019046,0.005056,0.00118,0.0093,0.998831,0.011104,0.011247,0.001549,0.015823
2,file_10,0.178512,4.3e-05,0.001655,0.000184,0.000128,0.004984,0.000884,4.8e-05,0.015183,0.002361,0.000218,0.993686,0.998364,0.016443,0.00046,0.000393,0.164282
3,file_100,0.396088,0.000227,0.003811,0.00089,0.000345,0.99616,3e-05,0.000171,0.235319,0.009526,0.000961,0.003516,0.999788,0.71702,0.014289,0.003332,0.60421
4,file_1000,0.98076,7.1e-05,0.003274,0.002899,0.000809,0.994291,8.1e-05,0.000199,0.194052,0.028619,0.003471,0.007176,0.99708,0.924646,0.002501,0.003689,0.067166


In [46]:
# Map each tag name to the threshold found at the same position of optimal_thresholds.
thresholds = {
    tag: optimal_thresholds[i]
    for i, tag in enumerate(unique_tags)
}

In [47]:
create_submission(median_predictions_df, info="vgg19_weirdcnn_median_with_best_thresholds", thresholds=thresholds)

## Train xgboost trees to make better predictions

### Train to predict weather classes : 
`weather_labels = ['clear', 'cloudy', 'haze', 'partly_cloudy']`

In [89]:
weather_labels = ['clear', 'cloudy', 'haze', 'partly_cloudy']

In [357]:
val_predictions_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,...,target_conventional_mine,target_cultivation,target_habitation,target_haze,target_partly_cloudy,target_primary,target_road,target_selective_logging,target_slash_burn,target_water
0,test_628,0.071995,0.01981,0.018135,0.02046,0.015155,0.950393,0.018868,0.034112,0.073635,...,0,0,0,0,0,1,0,0,0,0
1,test_8842,0.07047,0.024685,0.020838,0.020732,0.016295,0.946937,0.013844,0.024798,0.065312,...,0,0,0,0,0,1,0,0,0,0
2,test_11877,0.760127,0.092261,0.118277,0.09968,0.101711,0.741155,0.107806,0.115136,0.207027,...,0,0,1,0,0,1,1,0,0,0
3,test_2413,0.229211,0.058638,0.096851,0.11294,0.040866,0.83326,0.051784,0.093998,0.156023,...,0,0,0,0,0,1,0,0,0,1
4,test_28838,0.071773,0.017712,0.019611,0.027369,0.016949,0.962932,0.013631,0.027329,0.066544,...,0,0,0,0,0,1,0,0,0,0


In [26]:
import xgboost as xgb

#### A simple first try with XGBoost

In [27]:
from sklearn.model_selection import KFold

In [28]:
n_folds = 3

In [31]:
# Features are the per-tag predictions, labels are the matching target columns.
trainval_x = val_predictions_df[unique_tags].values
trainval_y = val_predictions_df[target_tags].values

# Only the first of the n_folds splits is used for this quick experiment.
kf = KFold(n_splits=n_folds)
train_index, test_index = next(iter(kf.split(trainval_x)))
train_x, train_y = trainval_x[train_index], trainval_y[train_index]
val_x, val_y = trainval_x[test_index], trainval_y[test_index]

In [18]:
# Baseline booster parameters for the native `xgb.train` API.
# "n_estimators" was removed: it is a scikit-learn-wrapper argument, and with `xgb.train`
# the number of boosting rounds is set by num_boost_round (capped by early stopping),
# so the former value of 150 only contradicted the 2500 rounds requested below.
params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "error",
    "learning_rate": 0.01,
    "tree_method": 'exact',
    "max_depth": 3,
#     "subsample": subsample,
#     "colsample_bytree": colsample_bytree,
    "silent": False,
}
# Upper bound on boosting rounds and patience (in rounds) of the early stopping.
num_boost_round = 2500
early_stopping_rounds = 100

In [19]:
tag_index = 2

In [21]:
# Ratio of negative to positive samples of the selected tag, handed to XGBoost
# as scale_pos_weight.
selected_targets = val_predictions_df[target_tags[tag_index]]
sumpw = selected_targets.sum()
sumnw = len(selected_targets) - sumpw
scale_pos_weight = sumnw * 1.0 / sumpw

params['scale_pos_weight'] = scale_pos_weight

In [22]:
dtrain = xgb.DMatrix(train_x, train_y[:, tag_index])
dval = xgb.DMatrix(val_x, val_y[:, tag_index])

In [23]:
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [24]:
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

[0]	train-error:0.261858	eval-error:0.260665
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 100 rounds.
[1]	train-error:0.261858	eval-error:0.260665
[2]	train-error:0.261858	eval-error:0.260665
[3]	train-error:0.261858	eval-error:0.260665
[4]	train-error:0.261858	eval-error:0.260665
[5]	train-error:0.261858	eval-error:0.260665
[6]	train-error:0.261858	eval-error:0.260665
[7]	train-error:0.25634	eval-error:0.258247
[8]	train-error:0.261858	eval-error:0.260665
[9]	train-error:0.25634	eval-error:0.258247
[10]	train-error:0.25634	eval-error:0.258247
[11]	train-error:0.25634	eval-error:0.258247
[12]	train-error:0.25634	eval-error:0.258247
[13]	train-error:0.25634	eval-error:0.258247
[14]	train-error:0.25634	eval-error:0.258247
[15]	train-error:0.25634	eval-error:0.258247
[16]	train-error:0.25634	eval-error:0.258247
[17]	train-error:0.25634	eval-error:0.258247
[18]	train-error:0.25634	eval-error:0.258247
[

In [25]:
y_preds_ = gbm.predict(xgb.DMatrix(trainval_x), ntree_limit=gbm.best_iteration+1)

In [26]:
search_best_threshold(y_true[:, tag_index], y_preds_)

(0.54000000000000004, 0.25030916502542022)

In [28]:
search_best_threshold(y_true[:, tag_index], val_predictions_df[unique_tags].values[:, tag_index])

(0.080000000000000002, 0.19706884016377763)

In [30]:
thr1 = 0.540
thr2 = 0.08
st = 40
end = 80
print((y_preds_[st:end] > thr1).astype(np.uint8)) 
print((val_predictions_df[unique_tags].values[st:end, tag_index] > thr2).astype(np.uint8))
print(y_true[st:end, tag_index])

[0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0
 0 0 0]
[0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0
 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [315]:
score(y_true[:, tag_index], df[unique_tags[tag_index]].values > 0.09)

0.19568799780280144

Run cross-validation with a random search for the optimal parameters

In [68]:
import matplotlib.pylab as plt
%matplotlib inline

In [29]:
def generate_params(iter_num):
    """
    Yield XGBoost hyper-parameter candidates as (eta, max_depth, subsample, colsample_bytree).

    With iter_num > 0, `iter_num` random candidates are drawn (an iteration banner is printed
    before each draw). Otherwise a single fixed configuration is yielded.

    :param iter_num: number of random candidates; any value <= 0 selects the fixed configuration
    :return: generator of (eta, max_depth, subsample, colsample_bytree) tuples
    """
    if iter_num > 0:
        for z in range(iter_num):
            print("\n-- Iteration: {}".format(z))
            # Bounds are given as (low, high): numpy documents low > high as undefined behaviour
            eta = np.random.uniform(0.001, 0.05)
            # randint excludes the upper bound, so max_depth is one of 2, 3, 4, 5
            max_depth = np.random.randint(2, 6)
            subsample = np.random.uniform(0.5, 0.95)
            colsample_bytree = np.random.uniform(0.5, 0.95)
            yield eta, max_depth, subsample, colsample_bytree
    else:
        eta = 0.05
        max_depth = 3
        subsample = 0.8204967474962096
        colsample_bytree = 0.7089159774987868
        yield eta, max_depth, subsample, colsample_bytree

In [53]:
#eta_values = [0.05, 0.025, 0.01, 0.0075, 0.005, 0.0025, 0.001]

eval_metric = 'logloss'
# Column of the xgb.cv result frame used to rank the candidates.
cv_metric_column = 'test-%s-mean' % eval_metric

# For every tag: random search of booster parameters, keeping the candidate with the
# lowest cross-validated metric.
best_params_for_tag_index = {}
for tag_index in range(len(unique_tags)):

    print("\n----------------\n Tag index: {}".format(tag_index))
    seed = 2017
    n_folds = 5
    dtrainval = xgb.DMatrix(trainval_x, label=trainval_y[:, tag_index], feature_names=unique_tags)

    # Sentinel entry: the first real CV result replaces the 1e10 placeholder.
    tag_best = {cv_metric_column: 1e10, 'params': None}
    best_params_for_tag_index[tag_index] = tag_best

    # Negative/positive ratio of this tag, handed to XGBoost as scale_pos_weight.
    tag_targets = val_predictions_df[target_tags[tag_index]]
    sumpw = tag_targets.sum()
    sumnw = len(tag_targets) - sumpw
    scale_pos_weight = sumnw * 1.0 / sumpw

    # A value <= 0 makes generate_params yield only its fixed configuration.
    iter_num = 15
    num_boost_round = 2000
    early_stopping_rounds = 100
    candidates = enumerate(generate_params(iter_num), start=1)
    for z, (eta, max_depth, subsample, colsample_bytree) in candidates:
        # The seed grows by 0, 1, 2, ... from one candidate to the next.
        seed += z - 1
        print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth,subsample,colsample_bytree))
        params = {
            "objective": "binary:logistic",
            "booster": "gbtree",
            "eval_metric": eval_metric,
            "eta": eta,
            "tree_method": 'exact',
            "max_depth": max_depth,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "silent": 1,
            "seed": seed,
            "scale_pos_weight": scale_pos_weight,
        }

        cvresult = xgb.cv(
            params,
            dtrain=dtrainval,
            seed=seed,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            nfold=n_folds,
            verbose_eval=False,
        )

        min_test_logloss_mean = cvresult[cv_metric_column].min()
        if min_test_logloss_mean < tag_best[cv_metric_column]:
            tag_best[cv_metric_column] = min_test_logloss_mean
            tag_best['params'] = params
            print("Best cv result: ", cvresult.iloc[-1])
            print("Best params: ", params)
            print("Best params: ", params)
            
        


----------------
 Tag index: 0

-- Iteration: 0
XGBoost params. ETA: 0.023096741976626596, MAX_DEPTH: 2, SUBSAMPLE: 0.9294778786611162, COLSAMPLE_BY_TREE: 0.684612705671509
Best cv result:  test-logloss-mean     0.246641
test-logloss-std      0.009194
train-logloss-mean    0.233773
train-logloss-std     0.002104
Name: 1209, dtype: float64
Best params:  {'seed': 2017, 'scale_pos_weight': 2.2825856875356183, 'booster': 'gbtree', 'subsample': 0.9294778786611162, 'eta': 0.023096741976626596, 'silent': 1, 'objective': 'binary:logistic', 'colsample_bytree': 0.684612705671509, 'tree_method': 'exact', 'eval_metric': 'logloss', 'max_depth': 2}

-- Iteration: 1
XGBoost params. ETA: 0.023096741976626596, MAX_DEPTH: 2, SUBSAMPLE: 0.9294778786611162, COLSAMPLE_BY_TREE: 0.684612705671509

-- Iteration: 2
XGBoost params. ETA: 0.020594509346487103, MAX_DEPTH: 3, SUBSAMPLE: 0.8326168479972074, COLSAMPLE_BY_TREE: 0.8939410323228981
Best cv result:  test-logloss-mean     0.245555
test-logloss-std      0

KeyboardInterrupt: 

In [75]:
import pickle

# Persist the per-tag tuning results; the minute-resolution timestamp in the
# file name keeps dumps made at different times from overwriting each other.
now = datetime.now()
xgb_best_params_filepath = os.path.join(
    GENERATED_DATA,
    'xgb_best_params_%s.pkl' % now.strftime("%Y-%m-%d-%H-%M"),
)

with open(xgb_best_params_filepath, 'wb') as handle:
    pickle.dump(best_params_for_tag_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [36]:
# Seed the working dict from a hand-pasted snapshot. `_best_params_for_tag_index`
# is only defined in the "Storages" section further down, so one of those cells
# has to be executed before this one (execution counts suggest In [35] was used).
# NOTE: this is an alias, not a copy -- the training loop's
# `best_params_for_tag_index[tag_index]['gbm'] = gbm` also mutates the snapshot.
best_params_for_tag_index = _best_params_for_tag_index

Train 17 binary classifiers

In [37]:
from sklearn.model_selection import train_test_split

# Train one binary XGBoost classifier per tag: column `tag_index` of the label
# matrix is the target, and the fitted booster is stored next to its params
# under the 'gbm' key.
num_boost_round = 2500
early_stopping_rounds = 12

for tag_index in range(len(unique_tags)):

    print("\n----------------\n Tag index: {}".format(tag_index))
    params = best_params_for_tag_index[tag_index]['params']
    # Tie the 85/15 hold-out split to the tag's tuned seed so that re-running
    # the cell reproduces the same split (and hence the same early-stopping
    # point and best score). Without a 'seed' entry this falls back to the
    # previous unseeded behaviour (random_state=None).
    train_x, val_x, train_y, val_y = train_test_split(
        trainval_x, trainval_y, train_size=0.85, random_state=params.get('seed'))
    dtrain = xgb.DMatrix(train_x, train_y[:, tag_index])
    dval = xgb.DMatrix(val_x, val_y[:, tag_index])
    # Early stopping monitors the last entry of `evals`, i.e. the hold-out set.
    watchlist = [(dtrain, 'train'), (dval, 'eval')]

    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
    print("Best score: ", gbm.best_score)
    best_params_for_tag_index[tag_index]['gbm'] = gbm



----------------
 Tag index: 0
Best score:  0.108135

----------------
 Tag index: 1
Best score:  0.010417

----------------
 Tag index: 2
Best score:  0.166832

----------------
 Tag index: 3
Best score:  0.08168

----------------
 Tag index: 4
Best score:  0.062996

----------------
 Tag index: 5
Best score:  0.053406

----------------
 Tag index: 6
Best score:  0.022817

----------------
 Tag index: 7
Best score:  0.034226

----------------
 Tag index: 8
Best score:  0.205192

----------------
 Tag index: 9
Best score:  0.095734

----------------
 Tag index: 10
Best score:  0.084821

----------------
 Tag index: 11
Best score:  0.03588

----------------
 Tag index: 12
Best score:  0.058036

----------------
 Tag index: 13
Best score:  0.092097

----------------
 Tag index: 14
Best score:  0.076058

----------------
 Tag index: 15
Best score:  0.168485

----------------
 Tag index: 16
Best score:  0.100198


Compute best thresholds

In [38]:
best_thresholds = {}

# The feature matrix is the same for every tag, so wrap it in a DMatrix once
# instead of rebuilding it on each loop iteration.
dmat = xgb.DMatrix(trainval_x)

for tag_index, tag in enumerate(unique_tags):

    gbm = best_params_for_tag_index[tag_index]['gbm']
    # Only use the trees up to the early-stopped best iteration.
    y_preds_ = gbm.predict(dmat, ntree_limit=gbm.best_iteration+1)

    # NOTE(review): the search runs on all of trainval_x, which includes the
    # rows each booster was fitted on, so the printed scores are likely optimistic.
    # NOTE(review): labels are read from `y_true` while training used
    # `trainval_y` -- confirm both describe the same rows in the same order.
    best_thresholds[tag], best_score = search_best_threshold(y_true[:, tag_index], y_preds_)
    print("%s | best threshold : %f with score: %f" % (tag, best_thresholds[tag], best_score))


agriculture | best threshold : 0.490000 with score: 0.893838
artisinal_mine | best threshold : 0.540000 with score: 0.800101
bare_ground | best threshold : 0.650000 with score: 0.481097
blooming | best threshold : 0.580000 with score: 0.348643
blow_down | best threshold : 0.530000 with score: 0.152452
clear | best threshold : 0.220000 with score: 0.979285
cloudy | best threshold : 0.630000 with score: 0.916326
conventional_mine | best threshold : 0.540000 with score: 0.325991
cultivation | best threshold : 0.520000 with score: 0.688206
habitation | best threshold : 0.520000 with score: 0.780279
haze | best threshold : 0.530000 with score: 0.778034
partly_cloudy | best threshold : 0.500000 with score: 0.949168
primary | best threshold : 0.250000 with score: 0.990040
road | best threshold : 0.500000 with score: 0.861935
selective_logging | best threshold : 0.710000 with score: 0.462641
slash_burn | best threshold : 0.540000 with score: 0.221661
water | best threshold : 0.450000 with scor

Boost test predictions

In [39]:
# Collect the test prediction files of one run, matched by its timestamp suffix
# (presumably one file per fold -- confirm against the training notebook).
predictions_csv = glob(os.path.join(OUTPUT_PATH, "predictions_*2017-07-13-19-21.csv"))
if not predictions_csv:
    # Fail with a readable message instead of an IndexError on predictions_csv[0].
    raise FileNotFoundError(
        "No prediction files matching 'predictions_*2017-07-13-19-21.csv' in %s" % OUTPUT_PATH)

# Read every file, drop incomplete rows and stack them with a single concat;
# ignore_index=True yields the fresh 0..n-1 index directly.
prediction_df = pd.concat(
    [pd.read_csv(filepath).dropna() for filepath in predictions_csv],
    ignore_index=True,
)

In [40]:
# Peek at the stacked first-level predictions: one probability column per tag.
prediction_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,test_40308,0.980499,5.34357e-06,0.012761,1.083201e-06,1.659149e-05,0.026706,6.184839e-06,0.0001213845,0.247138,0.230338,0.00278,0.955976,0.994307,0.5091,6.428061e-05,0.005902686,0.049497
1,test_36168,0.008954,1.964079e-09,0.000126,0.0004806151,5.747763e-05,0.990458,1.74133e-08,7.339324e-10,0.002185,0.001168,0.00654,0.000672,0.999985,0.002998,0.0002039154,6.675383e-07,0.00499
2,test_6070,0.99853,1.298099e-10,0.004025,1.012986e-10,1.42529e-09,0.999737,2.468224e-13,2.351865e-06,0.095436,0.819567,0.000283,3.6e-05,0.947903,0.991666,2.751586e-07,0.0002056249,0.032477
3,test_5483,0.003073,3.214428e-09,5.3e-05,0.03451585,0.001510115,0.997953,1.104697e-08,2.678857e-10,0.001081,0.000312,0.000302,0.001118,0.999984,0.001043,0.003596786,6.073845e-07,0.00227
4,test_5532,0.004007,6.949826e-08,0.000144,0.1529595,0.006396817,0.998306,3.806774e-08,3.966572e-09,0.001861,0.000672,0.000304,0.000979,0.999972,0.002848,0.02929822,2.036211e-06,0.003847


In [41]:
# First-level tag probabilities are the features of the second-level models.
# NOTE(review): assumes the `unique_tags` column order matches the feature
# order the boosters were trained on -- confirm against how trainval_x is built.
y_preds_init = prediction_df[unique_tags].values
y_preds = np.zeros_like(y_preds_init)

# Every tag model consumes the same feature matrix, so build the DMatrix once
# rather than once per loop iteration.
dtest = xgb.DMatrix(y_preds_init)

for tag_index in range(len(unique_tags)):

    print("\n----------------\n Tag index: {}".format(tag_index))
    gbm = best_params_for_tag_index[tag_index]['gbm']
    # Only use the trees up to the early-stopped best iteration.
    y_preds_ = gbm.predict(dtest, ntree_limit=gbm.best_iteration+1)
    y_preds[:, tag_index] = y_preds_



----------------
 Tag index: 0

----------------
 Tag index: 1

----------------
 Tag index: 2

----------------
 Tag index: 3

----------------
 Tag index: 4

----------------
 Tag index: 5

----------------
 Tag index: 6

----------------
 Tag index: 7

----------------
 Tag index: 8

----------------
 Tag index: 9

----------------
 Tag index: 10

----------------
 Tag index: 11

----------------
 Tag index: 12

----------------
 Tag index: 13

----------------
 Tag index: 14

----------------
 Tag index: 15

----------------
 Tag index: 16


In [42]:
# Work on a copy so `prediction_df` keeps the first-level probabilities, then
# overwrite the tag columns with the second-level (XGBoost) outputs.
df = prediction_df.copy()
df[unique_tags] = y_preds

In [43]:
# Inspect the blended probabilities that replaced the original tag columns.
df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,test_40308,0.522231,0.45251,0.252149,0.389114,0.455467,0.16127,0.123053,0.455544,0.522468,0.544217,0.44323,0.866382,0.750729,0.503381,0.268825,0.499394,0.248423
1,test_36168,0.47712,0.45251,0.15965,0.389114,0.455467,0.807806,0.122661,0.455544,0.391684,0.410128,0.443108,0.1369,0.729628,0.494106,0.260546,0.451676,0.141758
2,test_6070,0.522231,0.45251,0.222007,0.389114,0.455467,0.83,0.13256,0.455544,0.527279,0.583079,0.440586,0.190396,0.609088,0.505878,0.259275,0.451676,0.23831
3,test_5483,0.47712,0.45251,0.158208,0.533957,0.478718,0.837238,0.122661,0.455544,0.39145,0.412522,0.43547,0.134498,0.76162,0.494106,0.32732,0.451676,0.127105
4,test_5532,0.47712,0.45251,0.16117,0.601405,0.455475,0.837238,0.122661,0.455544,0.404547,0.414466,0.43547,0.135214,0.76162,0.494106,0.620745,0.451676,0.132539


In [44]:
def compute_mean(df):
    """Average the columns of `df` over all rows that share an `image_name`.

    Returns a new frame with one row per distinct `image_name`; the grouping
    key is turned back into a regular column by `reset_index()`.
    """
    return df.groupby('image_name').agg(np.mean).reset_index()

In [45]:
# Collapse the rows that share an image_name into one averaged row per image.
mean_df = compute_mean(df)

In [46]:
# Sanity check: the averaged frame should have one row per image_name, i.e.
# fewer rows than the stacked frame it was computed from.
len(mean_df), len(df)

(61191, 183573)

In [47]:
# Build the submission from the averaged probabilities and the per-tag
# thresholds found above. `create_submission` is imported from test_utils;
# presumably it binarises each tag column with `thresholds[tag]` -- confirm there.
create_submission(mean_df, info="squeezenet21_blended_3_folds", thresholds=best_thresholds)

## Storages

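Last best: hand-pasted snapshots of the best per-tag XGBoost parameters. Run one of these cells before the cell that copies `_best_params_for_tag_index` into `best_params_for_tag_index`.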

In [35]:
# Snapshot of the best per-tag XGBoost parameters from a search scored with
# eval_metric 'error'. Keys are tag indices 0..16; each entry holds the `params`
# dict handed to xgb.train plus the CV score stored under 'test-error-mean'.
# NOTE: the next cell (In [122]) assigns the same name, so whichever of the two
# cells ran last is what `best_params_for_tag_index = _best_params_for_tag_index`
# picks up.
_best_params_for_tag_index = {
0: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9097441644531741,
'eta': 0.012063976013602831,
'eval_metric': 'error',
'max_depth': 3,
'objective': 'binary:logistic',
'scale_pos_weight': 2.2953291651342407,
'seed': 2027,
'silent': 1,
'subsample': 0.8135263419352342,
'tree_method': 'exact'},
'test-error-mean': 0.18432560000000001},
1: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9247718213959808,
'eta': 0.048774868242772856,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 119.17883755588673,
'seed': 2045,
'silent': 1,
'subsample': 0.8232135887360947,
'tree_method': 'exact'},
'test-error-mean': 0.0096606000000000018},
2: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 45.478386167146972,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.035527999999999997},
3: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 117.76288659793815,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.0097225999999999996},
4: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.5079279690320685,
'eta': 0.04557168327562415,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 404.22613065326635,
'seed': 2108,
'silent': 1,
'subsample': 0.8351662492654948,
'tree_method': 'exact'},
'test-error-mean': 0.00248},
5: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 0.42622167984294584,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.13793419999999998},
6: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 18.122599004031301,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.032638800000000003},
7: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.5079279690320685,
'eta': 0.04557168327562415,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 383.0,
'seed': 2108,
'silent': 1,
'subsample': 0.8351662492654948,
'tree_method': 'exact'},
'test-error-mean': 0.0027032000000000002},
8: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 8.2033782241497377,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.22521079999999999},
9: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 10.04960263085777,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.1460564},
10: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.5079279690320685,
'eta': 0.04557168327562415,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 13.936099277644008,
'seed': 2108,
'silent': 1,
'subsample': 0.8351662492654948,
'tree_method': 'exact'},
'test-error-mean': 0.065190800000000007},
11: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 4.569060773480663,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.1443574},
12: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 0.080400326906844941,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.065996799999999994},
13: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9097441644531741,
'eta': 0.012063976013602831,
'eval_metric': 'error',
'max_depth': 3,
'objective': 'binary:logistic',
'scale_pos_weight': 4.051049170059505,
'seed': 2027,
'silent': 1,
'subsample': 0.8135263419352342,
'tree_method': 'exact'},
'test-error-mean': 0.17645080000000002},
14: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.9247718213959808,
'eta': 0.048774868242772856,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 124.21739130434783,
'seed': 2045,
'silent': 1,
'subsample': 0.8232135887360947,
'tree_method': 'exact'},
'test-error-mean': 0.0087554},
15: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 207.9119170984456,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.0049351999999999998},
16: {'params': {'booster': 'gbtree',
'colsample_bytree': 0.8393513697801558,
'eta': 0.04965712898037271,
'eval_metric': 'error',
'max_depth': 5,
'objective': 'binary:logistic',
'scale_pos_weight': 4.4593460158418523,
'seed': 2083,
'silent': 1,
'subsample': 0.766586509402208,
'tree_method': 'exact'},
'test-error-mean': 0.25375720000000002}}

In [122]:
# Snapshot of per-tag XGBoost parameters from a search scored with eval_metric
# 'logloss' (these params carry no 'scale_pos_weight').
# NOTE: this rebinds `_best_params_for_tag_index`, replacing the 'error'-metric
# snapshot defined in the previous cell.
# NOTE: scores are stored under 'test_logloss_mean' (underscores), whereas the
# tuning loop reads and writes 'test-%s-mean' % eval_metric, i.e.
# 'test-logloss-mean' -- if that loop were seeded from this dict, the lookup
# would raise a KeyError.
_best_params_for_tag_index = {
0: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.38209179999999998},
 1: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.036067200000000001},
 2: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.6334399849809398,
   'eta': 0.15055304521905347,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2019,
   'silent': 1,
   'subsample': 0.6186701247268119,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.086130599999999988},
 3: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5045164695165618,
   'eta': 0.1428101678052025,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.7215305289307261,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.041870600000000001},
 4: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.7089159774987868,
   'eta': 0.20210354378354375,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2017,
   'silent': 1,
   'subsample': 0.8204967474962096,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.015638200000000001},
 5: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.30505919999999997},
 6: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.068178600000000006},
 7: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.7089159774987868,
   'eta': 0.20210354378354375,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2017,
   'silent': 1,
   'subsample': 0.8204967474962096,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.013706000000000001},
 8: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.8579978891053311,
   'eta': 0.18574545610881862,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2044,
   'silent': 1,
   'subsample': 0.8611335020320647,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.29456899999999997},
 9: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5045164695165618,
   'eta': 0.1428101678052025,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.7215305289307261,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.20715659999999997},
 10: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.7089159774987868,
   'eta': 0.20210354378354375,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.8204967474962096,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.1200102},
 11: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.29197859999999998},
 12: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.1244864},
 13: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5045164695165618,
   'eta': 0.1428101678052025,
   'eval_metric': 'logloss',
   'max_depth': 3,
   'objective': 'binary:logistic',
   'seed': 2016,
   'silent': 1,
   'subsample': 0.7215305289307261,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.3228318},
 14: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.8398837534294123,
   'eta': 0.14522170765829945,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2022,
   'silent': 1,
   'subsample': 0.8734804475952236,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.041907600000000003},
 15: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.586741114680066,
   'eta': 0.08503519422605446,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2052,
   'silent': 1,
   'subsample': 0.9395104239451562,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.027125},
 16: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.38378999999999996}
}

{0: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.38209179999999998},
 1: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5813435352146873,
   'eta': 0.12653607181296217,
   'eval_metric': 'logloss',
   'max_depth': 4,
   'objective': 'binary:logistic',
   'seed': 2037,
   'silent': 1,
   'subsample': 0.537796332804125,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.036067200000000001},
 2: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.6334399849809398,
   'eta': 0.15055304521905347,
   'eval_metric': 'logloss',
   'max_depth': 2,
   'objective': 'binary:logistic',
   'seed': 2019,
   'silent': 1,
   'subsample': 0.6186701247268119,
   'tree_method': 'exact'},
  'test_logloss_mean': 0.0861305999999

Last saved: partial snapshot of the tuned parameters (tag indices 0–3 only)

In [48]:
# Partial snapshot of a logloss-scored search: only tag indices 0-3 are present.
# NOTE: this assigns `best_params_for_tag_index` directly (no leading
# underscore). The per-tag training loop iterates over every entry of
# `unique_tags` (17 tags in the outputs shown), so running it right after this
# cell would raise a KeyError at tag index 4.
best_params_for_tag_index = {0: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5829334344789511,
   'eta': 0.024988972967877485,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 2.2825856875356183,
   'seed': 2038,
   'silent': 1,
   'subsample': 0.7387515922133331,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.24204939999999997},
 1: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5151974580887335,
   'eta': 0.02653115436725616,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 114.2,
   'seed': 2023,
   'silent': 1,
   'subsample': 0.8162897671482163,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.017903800000000001},
 2: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.9152266627015204,
   'eta': 0.03617273607658269,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 43.849833147942157,
   'seed': 2122,
   'silent': 1,
   'subsample': 0.6340682509577085,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.095588800000000002},
 3: {'params': {'booster': 'gbtree',
   'colsample_bytree': 0.5151974580887335,
   'eta': 0.02653115436725616,
   'eval_metric': 'logloss',
   'max_depth': 5,
   'objective': 'binary:logistic',
   'scale_pos_weight': 124.21739130434783,
   'seed': 2023,
   'silent': 1,
   'subsample': 0.8162897671482163,
   'tree_method': 'exact'},
  'test-logloss-mean': 0.042751600000000001}
}