In [1]:
# Playing with submitting an average of high scoring kernels, as it seems
# everyone else is doing. Weights were determined by guessing based on
# their relative public leaderboard scores.
#
# This probably overfits, and proper work would involve running all these
# kernels with CV and determining the proper weights on the OOF predictions.

import numpy as np
import pandas as pd

# Load the four kernel submissions to be blended. The trailing comment on each
# path is that kernel's public-leaderboard (PL) score as recorded by the author.
gru, lstm_nb_svm, lr, lgb = map(pd.read_csv, (
    '../input/average_ensemble/pool_gru.csv',    # PL score 0.9829
    '../input/average_ensemble/lstm_nbsvm.csv',  # PL score 0.9811
    '../input/average_ensemble/lr.csv',          # PL score 0.9788
    '../input/average_ensemble/lgb.csv',         # PL score 0.9785
))

In [2]:
# Min-max scale every label column of every submission (an idea credited to
# Bojan) so that the different models' scores are put on a comparable footing
# before averaging. The metric is AUC, which depends only on ordering, so a
# monotonic rescaling of each column is acceptable and may improve performance.
from sklearn.preprocessing import minmax_scale

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for label in labels:
    print('Scaling {}... Please stand by.'.format(label))
    # Each frame is rescaled in place, one label column at a time.
    for frame in (lgb, gru, lr, lstm_nb_svm):
        frame[label] = minmax_scale(frame[label])

Scaling toxic... Please stand by.
Scaling severe_toxic... Please stand by.
Scaling obscene... Please stand by.
Scaling threat... Please stand by.
Scaling insult... Please stand by.
Scaling identity_hate... Please stand by.


In [3]:
# What an ensemble gains depends on (a) how well each member scores on its own
# and (b) how correlated the members are with each other: the ideal is several
# strong models whose predictions are weakly correlated. The author's reading of
# the matrices printed below is that these kernels are correlated loosely enough
# to blend to a much higher score.
#
# Rows/columns of every printed matrix follow this order:
# lgb, gru, lr, lstm_nb_svm.
models = (lgb, gru, lr, lstm_nb_svm)
for label in labels:
    print(label)
    # One row per model, one column per test example.
    stacked = np.vstack([m[label].values for m in models])
    print(np.corrcoef(stacked))

toxic
[[1.         0.90111066 0.95377901 0.94273615]
 [0.90111066 1.         0.90236963 0.94926809]
 [0.95377901 0.90236963 1.         0.95823705]
 [0.94273615 0.94926809 0.95823705 1.        ]]
severe_toxic
[[1.         0.78884913 0.86371851 0.8186413 ]
 [0.78884913 1.         0.82051887 0.85339415]
 [0.86371851 0.82051887 1.         0.88252021]
 [0.8186413  0.85339415 0.88252021 1.        ]]
obscene
[[1.         0.93439385 0.94986687 0.94607069]
 [0.93439385 1.         0.92729218 0.96100403]
 [0.94986687 0.92729218 1.         0.955631  ]
 [0.94607069 0.96100403 0.955631   1.        ]]
threat
[[1.         0.75896158 0.83580206 0.79585943]
 [0.75896158 1.         0.79996174 0.78330735]
 [0.83580206 0.79996174 1.         0.84338327]
 [0.79585943 0.78330735 0.84338327 1.        ]]
insult
[[1.         0.89161567 0.92276276 0.90977553]
 [0.89161567 1.         0.88996881 0.93182386]
 [0.92276276 0.88996881 1.         0.93925792]
 [0.90977553 0.93182386 0.93925792 1.        ]]
identity_hate


In [4]:
# Build the final submission as a fixed-weight average of the four models.
# The weights are hand-picked guesses (not fitted on out-of-fold predictions);
# they are listed once here, in the order the terms are added together.
blend_members = [
    (lgb, 0.15),
    (gru, 0.4),
    (lr, 0.15),
    (lstm_nb_svm, 0.3),
]
assert abs(sum(weight for _, weight in blend_members) - 1.0) < 1e-9, \
    "blend weights must sum to 1"

# The blend below adds rows positionally, so every file must list exactly the
# same ids in exactly the same order; otherwise predictions for different
# comments would be averaged together without any error being raised.
for frame, _ in blend_members[1:]:
    if not frame['id'].equals(lgb['id']):
        raise ValueError("submission files do not share the same 'id' column/order")

submission = pd.DataFrame()
submission['id'] = lgb['id']
for label in labels:
    blended = None
    for frame, weight in blend_members:
        term = frame[label] * weight
        # Accumulate left to right so the floating-point result matches the
        # original hand-written lgb + gru + lr + lstm_nb_svm expression.
        blended = term if blended is None else blended + term
    submission[label] = blended
submission.to_csv('../output/average_ensemble_0228.csv', index=False)