In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

model = ['nblogreg', 'nbnn', 'cnn', 'lstm']
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
PATH = '../../data/'

train_file = PATH + model[0] + '/train_' + model[0] + '.csv'
test_file = PATH + model[0] + '/' + model[0] + '.csv'
print(train_file)
print(test_file)

train = pd.read_csv(train_file)[label_cols]
test = pd.read_csv(test_file)[label_cols]

for i in range(1, len(model)):
    train_file = PATH + model[i] + '/train_' + model[i] + '.csv'
    test_file = PATH + model[i] + '/' + model[i] + '.csv'
    print(train_file)
    print(test_file)

    train = pd.concat([train, pd.read_csv(train_file)[label_cols]], axis=1)
    test = pd.concat([test, pd.read_csv(test_file)[label_cols]], axis=1)    

other_feature_cols = ['word_count', 'unique_word_count', 'consecutive_question_marks',\
                      'consecutive_exclamation_marks', 'uppercase_letters', 'ellipsis',\
                      'period', 'parentheses_paird', 'cleaned_word_count', 'cleaned_unique_word_count',\
                      'cleaned_consecutive_question_marks', 'cleaned_consecutive_exclamation_marks',\
                      'cleaned_uppercase_letters', 'cleaned_ellipsis', 'cleaned_period', 'cleaned_parentheses_pair']

print('other features')
train = pd.concat([train, pd.read_csv(PATH + 'cleaned_train.csv')[other_feature_cols]], axis=1)
test = pd.concat([test, pd.read_csv(PATH + 'cleaned_test.csv')[other_feature_cols]], axis=1)

y = pd.read_csv(PATH + 'train.csv')[label_cols]

features = list(label_cols)
features.extend(other_feature_cols)

print('done')

../../data/nblogreg/train_nblogreg.csv
../../data/nblogreg/nblogreg.csv
../../data/nbnn/train_nbnn.csv
../../data/nbnn/nbnn.csv
../../data/cnn/train_cnn.csv
../../data/cnn/cnn.csv
../../data/lstm/train_lstm.csv
../../data/lstm/lstm.csv
other features
done


In [4]:
def print_feature_importance(x, features, models):
    dict = {}
    for i in range(len(x)):
        which_model = models[i // len(features)]
        which_feature = features[i % len(features)]
        dict[(which_model + ' ' + which_feature)] = x[i]
    print(dict)

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

out = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit ' + j)
    ensemble = CatBoostClassifier(iterations=3,
                                  depth=10, 
                                  learning_rate=0.001, 
                                  loss_function='Logloss')
    ensemble.fit(X_train, y_train[j], use_best_model=True, eval_set=[X_test, y_test[j]])
    print_feature_importance(ensemble.get_feature_importance(X_train, y_train[j]), features, model)
    out[:, i] = ensemble.predict_proba(test.values)[:, 0] # TODO: should ues 1 or 0?

print('done')

fit toxic
0:	learn: 0.6897887	test: 0.6897824	best: 0.6897824 (0)	total: 527ms	remaining: 1.05s
1:	learn: 0.6870233	test: 0.6870154	best: 0.6870154 (1)	total: 1s	remaining: 503ms
2:	learn: 0.6845811	test: 0.6845638	best: 0.6845638 (2)	total: 1.46s	remaining: 0us

bestTest = 0.6845638022
bestIteration = 2

Shrink model to first 3 iterations.
{'nblogreg toxic': 0.7672547441218098, 'nblogreg severe_toxic': 0.0, 'nblogreg obscene': 1.143144402471642, 'nblogreg threat': 3.5403910212980323, 'nblogreg insult': 0.0, 'nblogreg identity_hate': 1.7272417196716312, 'nblogreg word_count': 5.85103594175941, 'nblogreg unique_word_count': 0.0, 'nblogreg consecutive_question_marks': 0.0, 'nblogreg consecutive_exclamation_marks': 0.0, 'nblogreg uppercase_letters': 2.5836407435859696, 'nblogreg ellipsis': 0.9499030623221179, 'nblogreg period': 1.7205401964730045, 'nblogreg parentheses_paird': 21.061915251114925, 'nblogreg cleaned_word_count': 7.231817724268177, 'nblogreg cleaned_unique_word_count': 0.0, 

{'nblogreg toxic': 6.388874172247247, 'nblogreg severe_toxic': 0.40214236047163954, 'nblogreg obscene': 0.0, 'nblogreg threat': 3.134833721554143, 'nblogreg insult': 3.038370332510139, 'nblogreg identity_hate': 0.0, 'nblogreg word_count': 0.0, 'nblogreg unique_word_count': 0.0, 'nblogreg consecutive_question_marks': 3.255522278185245, 'nblogreg consecutive_exclamation_marks': 0.0, 'nblogreg uppercase_letters': 0.0, 'nblogreg ellipsis': 0.0, 'nblogreg period': 6.8109527366333324, 'nblogreg parentheses_paird': 13.586071844291766, 'nblogreg cleaned_word_count': 51.306169731462646, 'nblogreg cleaned_unique_word_count': 0.0, 'nblogreg cleaned_consecutive_question_marks': 0.5748868170052024, 'nblogreg cleaned_consecutive_exclamation_marks': 0.0, 'nblogreg cleaned_uppercase_letters': 0.8736411416459646, 'nblogreg cleaned_ellipsis': 1.4958807230387594, 'nblogreg cleaned_period': 0.0, 'nblogreg cleaned_parentheses_pair': 0.0, 'nbnn toxic': 2.5333972823804745, 'nbnn severe_toxic': 0.0, 'nbnn obs

In [5]:
submission = pd.read_csv(PATH + 'sample_submission.csv')
submission[label_cols] = out
submission.to_csv(PATH + 'ensemble/catboost_ensemble.csv', index=False)
print('done')

done
