In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Names of the per-model sub-directories under PATH, and the six label columns
# that are selected from every per-model CSV.
model = ['nblogreg', 'nbnn', 'cnn', 'lstm']
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
PATH = '../../data/'

# Collect one frame per source and concatenate column-wise once at the end.
# Every per-model frame is selected with the same label_cols, so the stacked
# result holds repeated column names (one block of label_cols per model, in
# `model` order); those columns can only be told apart by position.
# Presumably these CSVs hold each base model's predicted probabilities -- confirm.
train_parts = []
test_parts = []
for name in model:
    train_file = PATH + name + '/train_' + name + '.csv'
    test_file = PATH + name + '/' + name + '.csv'
    print(train_file)
    print(test_file)

    train_parts.append(pd.read_csv(train_file)[label_cols])
    test_parts.append(pd.read_csv(test_file)[label_cols])

# Extra per-comment columns read from the cleaned_* files; they are appended
# after all of the per-model blocks.
other_feature_cols = [
    'word_count', 'unique_word_count', 'consecutive_question_marks',
    'consecutive_exclamation_marks', 'uppercase_letters', 'ellipsis',
    'period', 'parentheses_paird', 'cleaned_word_count', 'cleaned_unique_word_count',
    'cleaned_consecutive_question_marks', 'cleaned_consecutive_exclamation_marks',
    'cleaned_uppercase_letters', 'cleaned_ellipsis', 'cleaned_period', 'cleaned_parentheses_pair',
]

print('other features')
train_parts.append(pd.read_csv(PATH + 'cleaned_train.csv')[other_feature_cols])
test_parts.append(pd.read_csv(PATH + 'cleaned_test.csv')[other_feature_cols])

train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

# Label columns of the original training file (used as the targets later on).
y = pd.read_csv(PATH + 'train.csv')[label_cols]

# Flat list of distinct column names: the labels followed by the extra features.
features = label_cols + other_feature_cols

print('done')

In [2]:
def print_feature_importance(x, features, models):
    """Print the scores in ``x`` as a dict keyed by '<model> <feature>'.

    The score at position ``i`` is labelled with
    ``models[i // len(features)]`` and ``features[i % len(features)]``, i.e.
    ``x`` is read as consecutive blocks of ``len(features)`` scores, one block
    per entry of ``models``.

    Args:
        x: indexable sequence of importance scores.
        features: feature names that repeat within every block.
        models: one name per block, in block order.

    Returns:
        None. The mapping is written to stdout with ``print``.
    """
    block_size = len(features)
    labelled = {}
    for position in range(len(x)):
        model_idx, feature_idx = divmod(position, block_size)
        label = models[model_idx] + ' ' + features[feature_idx]
        labelled[label] = x[position]
    print(labelled)

# Hold out 20% of the rows as a validation set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

# `train` is laid out as one block of label_cols per entry of `model`,
# followed by other_feature_cols. Importances must be labelled per part,
# otherwise every name after the first block is attached to the wrong column.
n_stacked = len(model) * len(label_cols)

# One column of predicted probabilities per label, filled in the loop below.
out = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit ' + j)
    ensemble = CatBoostClassifier(iterations=3,
                                  depth=10,
                                  learning_rate=0.001,
                                  loss_function='Logloss')
    # eval_set is passed as a single (X, y) tuple: a bare list would be read
    # as a list of separate evaluation sets rather than one features/target pair.
    ensemble.fit(X_train, y_train[j], use_best_model=True, eval_set=(X_test, y_test[j]))

    # NOTE(review): the (X, y) positional form matches older CatBoost releases;
    # confirm the signature against the installed version.
    importances = ensemble.get_feature_importance(X_train, y_train[j])
    print_feature_importance(importances[:n_stacked], label_cols, model)
    print_feature_importance(importances[n_stacked:], other_feature_cols, ['other'])

    # predict_proba returns one column per class; column 1 is P(label == 1),
    # the positive-class probability that belongs in the output.
    out[:, i] = ensemble.predict_proba(test.values)[:, 1]

print('done')

In [3]:
# Start from the sample submission file, overwrite its label columns with the
# ensemble's predictions, and write the result without the index column.
sample_path = PATH + 'sample_submission.csv'
output_path = PATH + 'ensemble/catboost_ensemble.csv'

submission = pd.read_csv(sample_path)
submission[label_cols] = out
submission.to_csv(output_path, index=False)
print('done')