In [1]:
import os

import numpy as np
import pandas as pd
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from keras.models import Model
from keras.preprocessing import text, sequence
from sklearn.metrics import roc_auc_score

In [2]:
PATH = '../../data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')

train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

text_length = pd.concat([train_sentence.apply(lambda x: len(x.split())),\
                         test_sentence.apply(lambda x: len(x.split()))])

mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape)
print(test.shape)
print(mean_length)
print(std_length)

In [3]:
# config
MAX_FEATURES = 100000 # max num of words
MAX_LEN = np.round(mean_length + 3*std_length).astype(int) # max sequence length
EMBED_SIZE = 50 # embedding size
FILTERS = 128 # cnn config
KERNEL_SIZE = 7 #cnn config
DENSE_UNITS = 50
DROPOUT = 0.3 # dropout rate
BATCH_SIZE = 32
EPOCHS = 2
EMBEDDING_FILE = 'glove.6B.50d.txt'

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

print('done')

In [4]:
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(pd.concat([train_sentence, test_sentence]).values)
tokenized_train = tokenizer.texts_to_sequences(train_sentence.values)
tokenized_test = tokenizer.texts_to_sequences(test_sentence.values)

X_train = sequence.pad_sequences(tokenized_train, maxlen=MAX_LEN)
# y = train[label_cols].values
X_test = sequence.pad_sequences(tokenized_test, maxlen=MAX_LEN)

print(tokenized_train[0])

In [5]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

def get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embedding_file))
    all_embs = np.stack(embeddings_index.values())
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), (nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix

def get_cnn_model(embedding_file, embed_size, max_features, tokenizer,\
                  kernel_size, filters, dropout, dense_units, max_len, label_cols, output_size):
    embedding_matrix = get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer)
    input = Input(shape=(max_len, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(input)
    x = Conv1D(FILTERS, kernel_size, padding='same', activation='relu')(x)
    x = MaxPooling1D()(x)
    x = Conv1D(filters, kernel_size, padding='same', activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(dropout)(x)
    x = Dense(dense_units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = Dense(output_size, activation='sigmoid')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

def train_model(model, file_path, batch_size, epochs, X_train, y):
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=20)
    callbacks_list = [checkpoint, earlystopping]
    model.fit(X_train, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)
    return model

def predict(model, file_path, X_test):
    model.load_weights(file_path)
    return model.predict(X_test, verbose=1)
    
def save(model_name, y_test, label_cols, path, is_train=False):
    if is_train:
        submission = pd.read_csv(path + 'sample_train.csv')
        file_name = 'train_' + model_name
    else:
        submission = pd.read_csv(path + 'sample_submission.csv')
        file_name = model_name
    submission[label_cols] = y_test
    submission.to_csv(path + model_name + '/' + file_name + '.csv', index=False)
    
# Progress marker: printed once the statements above in this cell have run.
print('done')

In [6]:
file_path = '../../model/cnn_best.hdf5'
sample_submission_file_path = PATH + 'sample_submission.csv'

preds = np.zeros((test.shape[0], len(label_cols)))
preds_train = np.zeros((train.shape[0], len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    y = train[j].to_frame()
    y['2'] = 1 - y
    y = y.values
    model = get_cnn_model(PATH + EMBEDDING_FILE, EMBED_SIZE, MAX_FEATURES, tokenizer,\
                          KERNEL_SIZE, FILTERS, DROPOUT, DENSE_UNITS, MAX_LEN, label_cols, 2)
    model = train_model(model, file_path, BATCH_SIZE, EPOCHS, X_train, y)
    preds[:, i] = predict(model, file_path, X_test)[:, 0]
    preds_train[:, i] = predict(model, file_path, X_train)[:, 0]
    print('accuracy: {}'.format(roc_auc_score(train[j], preds_train[:, i])))
    print('\n\n')

save('cnn', preds, label_cols, PATH)
save('cnn', preds_train, label_cols, PATH, True)

print('done')