In [21]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Names of the base models; each one has a directory under PATH that holds
# its per-label prediction files.
model = ['nblogreg', 'logreg', 'cnn', 'lstm']
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
PATH = '../../data/'

# Read the label columns from every base model's train/test prediction file,
# echoing each path before it is read.
train_parts = []
test_parts = []
for name in model:
    train_file = '{0}{1}/train_{1}.csv'.format(PATH, name)
    test_file = '{0}{1}/{1}.csv'.format(PATH, name)
    print(train_file)
    print(test_file)

    train_parts.append(pd.read_csv(train_file)[label_cols])
    test_parts.append(pd.read_csv(test_file)[label_cols])

# Place the per-model frames side by side. Every frame contributes the same
# column names, so each label name appears once per model in the result.
train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

# Targets: the label columns of PATH + 'train.csv'.
y = pd.read_csv(PATH + 'train.csv')[label_cols]

features = list(label_cols)

print(train.columns.values)
print('done')

../../data/nblogreg/train_nblogreg.csv
../../data/nblogreg/nblogreg.csv
../../data/logreg/train_logreg.csv
../../data/logreg/logreg.csv
../../data/cnn/train_cnn.csv
../../data/cnn/cnn.csv
../../data/lstm/train_lstm.csv
../../data/lstm/lstm.csv
['toxic' 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate' 'toxic'
 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate' 'toxic'
 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate' 'toxic'
 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate']
done


In [24]:
def print_feature_importance(x, features, models):
    """Print a dict mapping '<model> <feature>' to the matching entry of ``x``.

    ``x`` is read as a flat sequence laid out model-major: the first
    ``len(features)`` entries belong to ``models[0]``, the next
    ``len(features)`` to ``models[1]``, and so on.

    Args:
        x: Indexable sequence of importance values.
        features: Feature names, in the order they repeat within each model.
        models: Model names, in the order their feature groups appear in ``x``.

    Returns:
        None. The mapping is written to stdout.
    """
    per_model = len(features)
    importance_by_name = {}
    for position in range(len(x)):
        # Quotient selects the model, remainder selects the feature within it.
        model_idx, feature_idx = divmod(position, per_model)
        label = models[model_idx] + ' ' + features[feature_idx]
        importance_by_name[label] = x[position]
    print(importance_by_name)

# Hold out 20% of the stacked base-model predictions; the held-out part is
# used as the validation set while fitting.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

# One column of ensemble probabilities per label, one row per test sample.
out = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit ' + j)
    # NOTE(review): 3 iterations at learning_rate=0.001 looks like a
    # smoke-test configuration — confirm before using for a real submission.
    ensemble = CatBoostClassifier(iterations=3,
                                  depth=10,
                                  learning_rate=0.001,
                                  loss_function='CrossEntropy',
                                  eval_metric='AUC')
    # Column names repeat once per base model, so selecting by label name
    # yields every base model's prediction column for that label.
    # eval_set is passed as an (X, y) tuple: a list would be interpreted as a
    # list of separate validation sets.
    ensemble.fit(X_train[j], y_train[j], use_best_model=True, eval_set=(X_test[j], y_test[j]))
    # Feature-importance printout is disabled:
    #     print_feature_importance(ensemble.get_feature_importance(X_train, y_train[j]), features, model)
    # Predict on the same columns the model was fitted on (test[j], not the
    # whole stacked frame) and keep column 1, the positive-class probability.
    out[:, i] = ensemble.predict_proba(test[j])[:, 1]

print('done')

fit toxic
0:	learn: 0.9908660	test: 0.9909193	best: 0.9909193 (0)	total: 623ms	remaining: 1.25s
1:	learn: 0.9932177	test: 0.9931720	best: 0.9931720 (1)	total: 1.25s	remaining: 624ms
2:	learn: 0.9947389	test: 0.9947188	best: 0.9947188 (2)	total: 1.87s	remaining: 0us

bestTest = 0.9947188388
bestIteration = 2

Shrink model to first 3 iterations.
fit severe_toxic
0:	learn: 0.9935309	test: 0.9927865	best: 0.9927865 (0)	total: 399ms	remaining: 798ms
1:	learn: 0.9977643	test: 0.9976600	best: 0.9976600 (1)	total: 953ms	remaining: 477ms
2:	learn: 0.9980051	test: 0.9979694	best: 0.9979694 (2)	total: 1.48s	remaining: 0us

bestTest = 0.9979693688
bestIteration = 2

Shrink model to first 3 iterations.
fit obscene
0:	learn: 0.9939165	test: 0.9945320	best: 0.9945320 (0)	total: 674ms	remaining: 1.35s
1:	learn: 0.9958896	test: 0.9961944	best: 0.9961944 (1)	total: 1.3s	remaining: 651ms
2:	learn: 0.9961478	test: 0.9964769	best: 0.9964769 (2)	total: 1.99s	remaining: 0us

bestTest = 0.9964768791
bestItera

In [25]:
# Fill the sample submission's label columns with the ensemble output and
# write the result to the ensemble directory.
sample_path = PATH + 'sample_submission.csv'
output_path = PATH + 'ensemble/catboost_ensemble.csv'

submission = pd.read_csv(sample_path)
submission[label_cols] = out
submission.to_csv(output_path, index=False)
print('done')

done
