In [None]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import operator
import sys

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from textblob import TextBlob

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D, Reshape, GlobalAveragePooling1D, merge, Flatten, Bidirectional, CuDNNGRU, add, Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [None]:
# --- Input locations -------------------------------------------------------
data_path = 'Dataset/'
TRAIN_DATA_FILE = os.path.join(data_path, 'train.csv')
TEST_DATA_FILE = os.path.join(data_path, 'test.csv')

# Pre-trained word vectors; the commented lines are alternative files.
EMBEDDING_FILE = 'features/fast-text-300.txt'
#EMBEDDING_FILE='features/glove.twitter.27B.200d.txt'
#EMBEDDING_FILE='features/glove.840B.300d.txt'

# --- Model input sizes -----------------------------------------------------
MAX_SEQUENCE_LENGTH = 350   # comments are padded / truncated to this many tokens
MAX_NB_WORDS = 100000       # vocabulary cap handed to the tokenizer
# NOTE(review): the selected file name suggests 300-d vectors while this is
# 200 (the width of the glove.twitter alternative) -- confirm they agree.
EMBEDDING_DIM = 200

In [None]:
# Pre-trained word vectors: every line of EMBEDDING_FILE is read as
# "<token> <v1> <v2> ...", giving a token -> float32 vector lookup.
embeddings_index = {}
# `with` closes the handle even if reading is interrupted part-way.
with open(EMBEDDING_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        try:
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
        except (IndexError, ValueError):
            # IndexError: blank line. ValueError: a field that is not a number
            # (e.g. a token that itself contains whitespace). Report and skip;
            # anything else (KeyboardInterrupt, MemoryError, ...) propagates.
            print("Err on ", values[:3])
            continue
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

In [None]:
# Load the train and test tables from the two configured CSV paths.
train_df, test_df = map(pd.read_csv, (TRAIN_DATA_FILE, TEST_DATA_FILE))

In [None]:
# Typo -> correction table, one "typo,correct" pair per line of the file.
cl_path = 'features/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        # Split on the first comma only so a comma inside the correction does
        # not break the two-way unpack. A line without any comma still raises.
        typo, correct = line.split(',', 1)
        clean_word_dict[typo] = correct

In [None]:
import re
from collections import defaultdict

print('Processing text dataset')

# Matches any character that is not a letter, a digit, a space or one of ? ! . , :
special_character_removal = re.compile(r'[^?!.,:a-z\d ]', re.IGNORECASE)

# Matches every run of digits
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

# token -> occurrence count, incremented by text_to_wordlist
word_count_dict = defaultdict(int)
toxic_dict = {}

def text_to_wordlist(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    """Normalise one comment and return it as a single string.

    Steps, in order: drop curly double quotes, replace each run of digits
    with one space, optionally count the whitespace-separated tokens into
    the module-level ``word_count_dict``, optionally stem every token.

    Args:
        text: the raw comment.
        remove_stopwords: accepted but not used by this function.
        stem_words: when True, stem each token with the English Snowball stemmer.
        count_null_words: when True, add token counts to ``word_count_dict``;
            this also collapses whitespace runs to single spaces.
        clean_wiki_tokens: accepted but not used by this function.

    Returns:
        The cleaned text.
    """
    for curly_quote in ("”", "“"):
        text = re.sub(curly_quote, "", text)
    text = replace_numbers.sub(' ', text)

    if count_null_words:
        tokens = text.split()
        for token in tokens:
            word_count_dict[token] += 1
        text = " ".join(tokens)

    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(token) for token in text.split())

    return text

# Labels: one column per class.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values

# Missing comments are replaced by the literal text "no comment".
list_sentences_train = train_df["comment_text_clean"].fillna("no comment").values
list_sentences_test = test_df["comment_text_clean"].fillna("no comment").values

comments = [text_to_wordlist(sentence) for sentence in list_sentences_train]
test_comments = [text_to_wordlist(sentence) for sentence in list_sentences_test]

# `!`, `?` and `*` are absent from the filter string passed to the tokenizer.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
#tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# The vocabulary is fitted on train and test comments together.
tokenizer.fit_on_texts(comments + test_comments)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

sequences = tokenizer.texts_to_sequences(comments)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_sequences = tokenizer.texts_to_sequences(test_comments)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

In [None]:
def sent2pos(sentence):
    """Part-of-speech tag a sentence with TextBlob.

    Args:
        sentence: the text to tag.

    Returns:
        ``(updated_sentence, tagged)``: two space-joined strings built from
        the same ``TextBlob(sentence).tags`` list -- the first holds each
        pair's token, the second each pair's tag -- so both contain the same
        number of items.

    Raises:
        Exception: whatever the tagger raised. The offending sentence is
            printed first so the failing input can be identified.
    """
    try:
        tag = TextBlob(sentence).tags
    except Exception:
        # Print the input that broke the tagger and let the real error
        # surface. Falling through would hit an unbound `tag` below and hide
        # the original exception behind an unrelated one.
        print(sentence)
        raise

    updated_sentence = ' '.join(pair[0] for pair in tag)
    tagged = ' '.join(pair[1] for pair in tag)
    return updated_sentence, tagged
    
# id -> word, used to turn the tokenizer's id sequences back into text.
inverse_word_index = {v: k for k, v in word_index.items()}


def _tag_sequences(id_sequences):
    """Rebuild every id sequence as text and POS-tag it with sent2pos.

    Returns ``(updated_sentences, pos_tag_strings)`` with one entry per input
    sequence, in input order.
    """
    updated_sentences = []
    pos_tag_strings = []
    for ids in id_sequences:
        sentence = ' '.join(inverse_word_index[idx] for idx in ids)
        updated_sentence, tags = sent2pos(sentence)
        # Every kept token must carry exactly one tag, otherwise the word
        # input and the POS input would fall out of step.
        assert len(updated_sentence.split(' ')) == len(tags.split(' ')), \
            "T1 {} T2 {} ".format(len(ids), len(tags.split()))
        updated_sentences.append(updated_sentence)
        pos_tag_strings.append(tags)
    return updated_sentences, pos_tag_strings


Pos_updated_sentence, Pos_comments = _tag_sequences(sequences)
Pos_test_updated_sentence, Pos_test_comments = _tag_sequences(test_sequences)

# Separate, small vocabulary for the tag strings.
pos_tokenizer = Tokenizer(num_words=50, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
pos_tokenizer.fit_on_texts(Pos_comments + Pos_test_comments)

# Dedicated names: the word-id `sequences` / `test_sequences` are left intact,
# so this cell gives the same result when it is run again.
pos_sequences = pos_tokenizer.texts_to_sequences(Pos_comments)
pos_test_sequences = pos_tokenizer.texts_to_sequences(Pos_test_comments)

# Report the POS vocabulary (taken from pos_tokenizer, not the word tokenizer).
pos_word_index = pos_tokenizer.word_index
print('Found %s unique tokens' % len(pos_word_index))

pos_data = pad_sequences(pos_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', pos_data.shape)
print('Shape of label tensor:', y.shape)

pos_test_data = pad_sequences(pos_test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', pos_test_data.shape)

## Second valid 


In [None]:
# Re-encode the sentences as the tagger returned them -- presumably so the
# word ids line up one-to-one with the POS tag sequences (TODO confirm).
comments = [text_to_wordlist(sentence) for sentence in Pos_updated_sentence]
test_comments = [text_to_wordlist(sentence) for sentence in Pos_test_updated_sentence]

# The tokenizer fitted earlier is reused; refitting is left disabled:
# tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
# tokenizer.fit_on_texts(comments + test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

sequences = tokenizer.texts_to_sequences(comments)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_sequences = tokenizer.texts_to_sequences(test_comments)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

In [None]:
print('Preparing embedding matrix')
# Tokenizer word ids start at 1 (0 is reserved), so a vocabulary smaller than
# the cap needs len(word_index) + 1 rows; otherwise the highest id would index
# one past the end of the matrix.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

# `with` flushes and closes null-word.txt, so the cell that reads it back
# later sees the complete file.
with open('null-word.txt', 'w', encoding='utf-8') as null_words:
    for word, i in word_index.items():
        if i >= nb_words:
            # Beyond the vocabulary cap: no row in the matrix for this word.
            null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # words not found in embedding index will be all-zeros.
            null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')

# Counts rows whose components sum to zero (row 0 is never assigned).
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

#24146

In [None]:
# Dump the cleaned test comments, one per line. `with` flushes and closes
# the file instead of leaving the handle open.
with open('cleaned_text.txt', 'w', encoding='utf-8') as f:
    for line in test_comments:
        f.write(line + '\n')

In [None]:
# sort null word
# Read back the "word, count" lines of null-word.txt and rewrite the file
# ordered by count, highest first.
null_counts_by_word = {}
with open('null-word.txt', 'r', encoding='utf-8') as nullword:
    for line in nullword:
        # Split at the LAST ", " so the count is always the final field.
        w, c = line.strip('\n').rsplit(', ', 1)
        null_counts_by_word[w] = int(c)

# Sort only the words read from the file, not the whole word_count_dict,
# so the rewritten file still lists null words only.
null_count = sorted(null_counts_by_word.items(), key=operator.itemgetter(1), reverse=True)

with open('null-word.txt', 'w', encoding='utf-8') as output:
    for w, c in null_count:
        output.write(w + ", " + str(c) + '\n')

# Model Zoo

In [None]:
from sklearn.metrics import roc_auc_score

import numpy as np

# Prefix of the per-fold weight files written by _train_model_by_logloss
# (STAMP + fold id + '.h5'); evaluates to 'pavel_rnn_0.50_0.50'.
STAMP = 'pavel_rnn_%.2f_%.2f'%(0.5,0.5)

def _train_model_by_auc(model, batch_size, train_x, train_y, val_x, val_y):
    best_auc = -1
    best_weights = None
    best_epoch = 0

    current_epoch = 1

    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epochs=1, validation_data=[val_x, val_y])
        y_pred = model.predict(val_x, batch_size=batch_size)
        current_auc = roc_auc_score(val_y, y_pred)
        print("Epoch {} auc {:.6f} best_auc {:.6f}".format(current_epoch, current_auc, best_auc))
        current_epoch += 1
        if best_auc < current_auc or best_auc == -1:
            best_auc = current_auc
            best_weights = model.get_weights()
            best_epoch = current_epoch
        else:
            if current_epoch - best_epoch == 5:
                break

    model.set_weights(best_weights)
    return model, best_auc

def _train_model_by_logloss(model, batch_size, train_x, pos_train_x, train_y, val_x, pos_val_x, val_y, fold_id):
    early_stopping =EarlyStopping(monitor='val_loss', patience=7)
    bst_model_path = STAMP + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
    train_data = {'Onehot':train_x, 'POS':pos_train_x}
    val_data = {'Onehot':val_x, 'POS':pos_val_x}
    hist = model.fit(train_data, train_y,
        validation_data=(val_data, val_y),
        epochs=50, batch_size=batch_size, shuffle=True,
        callbacks=[early_stopping, model_checkpoint])
    bst_val_score = min(hist.history['val_loss'])
    predictions = model.predict(val_data)
    auc = roc_auc_score(val_y, predictions)
    print("AUC Score", auc)
    return model, bst_val_score, auc, predictions

def train_folds(X, pos_x, y, fold_count, batch_size, get_model_func, include_remainder=False):
    """Train one model per contiguous fold and collect out-of-fold predictions.

    The rows are cut into ``fold_count`` consecutive slices of
    ``len(X) // fold_count`` rows. For each slice a fresh model from
    ``get_model_func()`` is trained on all other rows and validated on the
    slice via ``_train_model_by_logloss``.

    Args:
        X: word-id inputs, indexable by row.
        pos_x: POS-id inputs, row-aligned with ``X``.
        y: labels, row-aligned with ``X``.
        fold_count: number of folds.
        batch_size: mini-batch size forwarded to the trainer.
        get_model_func: zero-argument callable returning a compiled model.
        include_remainder: when ``len(X)`` is not a multiple of ``fold_count``
            some rows are left over after the last slice. With the default
            ``False`` those rows are only ever trained on, so the concatenated
            predictions have ``fold_size * fold_count`` rows. Pass ``True`` to
            make the last fold extend to the end of the data, so every row
            gets an out-of-fold prediction.

    Returns:
        ``(models, mean_val_loss, mean_auc, fold_predictions)``; the two means
        are averaged over the folds and ``fold_predictions`` holds one
        prediction array per fold, in fold order.
    """
    fold_size = len(X) // fold_count
    models = []
    fold_predictions = []
    score = 0
    total_auc = 0
    last_fold_id = fold_count - 1
    for fold_id in range(fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        # Only the LAST fold may absorb leftover rows. The test is against the
        # fold count: comparing with fold_size could stretch a middle fold to
        # the end of the data and make validation folds overlap.
        if include_remainder and fold_id == last_fold_id:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        pos_train_x = np.concatenate([pos_x[:fold_start], pos_x[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        pos_val_x = pos_x[fold_start:fold_end]
        val_y = y[fold_start:fold_end]

        print("In fold #", fold_id)
        model, bst_val_score, auc, fold_prediction = _train_model_by_logloss(
            get_model_func(), batch_size, train_x, pos_train_x, train_y,
            val_x, pos_val_x, val_y, fold_id)
        score += bst_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
    return models, score / fold_count, total_auc / fold_count, fold_predictions

In [None]:
from keras import optimizers
from keras.layers import Reshape
# Base rate 1e-3 scaled by 64/256 (= 2.5e-4) -- presumably a batch-size
# scaling; confirm the intended ratio. The scaling must be a product:
# `1e-3 ** 64/256` parses as (1e-3 ** 64) / 256, about 4e-195, which is
# effectively a learning rate of zero.
adam_optimizer = optimizers.Adam(lr=1e-3 * 64 / 256, decay=1e-8)

def get_av_pos_cnn():
    """Build and compile the word + POS multi-kernel CNN classifier.

    Inputs (both of shape ``(MAX_SEQUENCE_LENGTH,)``):
        'Onehot': word ids, embedded with the frozen ``embedding_matrix``.
        'POS': POS-tag ids, embedded with a trainable 50 x 30 table.

    The two embeddings are concatenated along the feature axis, passed
    through spatial dropout, then through four parallel ``Conv1D`` branches
    (kernel sizes 1-4). Each branch is pooled three ways (max,
    ``AttentionWeightedAverage``, average); all pooled vectors are
    concatenated and fed to Dropout -> Dense(144, relu) -> Dense(6, sigmoid).

    Returns:
        A model compiled with binary cross-entropy and the module-level
        ``adam_optimizer`` as it is bound at call time.
    """
    # NOTE(review): AttentionWeightedAverage is not defined in the visible
    # part of this notebook; it must be defined or imported before calling.
    filter_nums = 325 # 500->375, 400->373, 300->

    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), name='Onehot')
    input_layer_2 = Input(shape=(MAX_SEQUENCE_LENGTH,), name='POS')

    embedding_layer = Embedding(nb_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)(input_layer)

    embedding_layer2 = Embedding(50,
            30, # Latest Modify
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=True)(input_layer_2)

    embedding_layer = concatenate([embedding_layer, embedding_layer2], axis=2)
    embedded_sequences = SpatialDropout1D(0.25)(embedding_layer)

    conv_0 = Conv1D(filter_nums, 1, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)

    attn_0 = AttentionWeightedAverage()(conv_0)
    avg_0 = GlobalAveragePooling1D()(conv_0)
    maxpool_0 = GlobalMaxPooling1D()(conv_0)

    maxpool_1 = GlobalMaxPooling1D()(conv_1)
    attn_1 = AttentionWeightedAverage()(conv_1)
    avg_1 = GlobalAveragePooling1D()(conv_1)

    maxpool_2 = GlobalMaxPooling1D()(conv_2)
    attn_2 = AttentionWeightedAverage()(conv_2)
    avg_2 = GlobalAveragePooling1D()(conv_2)

    maxpool_3 = GlobalMaxPooling1D()(conv_3)
    attn_3 = AttentionWeightedAverage()(conv_3)
    avg_3 = GlobalAveragePooling1D()(conv_3)

    # `concatenate` (already used above) replaces the legacy
    # merge(..., mode='concat') helper; the feature order is unchanged.
    v0_col = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3], axis=1)
    v1_col = concatenate([attn_0, attn_1, attn_2, attn_3], axis=1)
    v2_col = concatenate([avg_1, avg_2, avg_0, avg_3], axis=1)
    merged_tensor = concatenate([v0_col, v1_col, v2_col], axis=1)
    output = Dropout(0.7)(merged_tensor)
    output = Dense(units=144)(output)
    output = Activation('relu')(output)
    #output = Dropout(0.5)(output)
    output = Dense(units=6, activation='sigmoid')(output)

    model = Model(inputs=[input_layer, input_layer_2], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model



In [None]:
from keras import optimizers
# Rebinds the module-level `adam_optimizer` that the model builders read at
# call time, so models built after this cell runs are compiled with it.
# NOTE(review): one optimizer instance is then shared by every model built;
# confirm that sharing it across folds is intended.
adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=5, decay=1e-6)

def get_av_pos_rnn():
    """Build and compile the word + POS bidirectional-GRU classifier.

    Inputs (both of shape ``(MAX_SEQUENCE_LENGTH,)``):
        'Onehot': word ids, embedded with the frozen ``embedding_matrix``.
        'POS': POS-tag ids, embedded with a trainable 50 x 35 table.

    The concatenated embeddings go through spatial dropout and one
    bidirectional ``CuDNNGRU(64)`` that returns the full sequence. Four
    summaries of that sequence (max pool, last time step,
    ``AttentionWeightedAverage``, average pool) are concatenated and fed to
    Dropout -> Dense(144, relu) -> Dense(6, sigmoid).

    Returns:
        A model compiled with binary cross-entropy and the module-level
        ``adam_optimizer`` as it is bound at call time.
    """
    # `Lambda` is used below but is not among the notebook's imports;
    # import it here so the function does not fail with a NameError.
    from keras.layers import Lambda

    # NOTE(review): AttentionWeightedAverage is not defined in the visible
    # part of this notebook; it must be defined or imported before calling.
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), name='Onehot')
    input_layer_2 = Input(shape=(MAX_SEQUENCE_LENGTH,), name='POS')

    embedding_layer = Embedding(nb_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)(input_layer)

    embedding_layer2 = Embedding(50,
            35, # Latest Modify
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=True)(input_layer_2)

    embedding_layer = concatenate([embedding_layer, embedding_layer2], axis=2)
    embedding_layer = SpatialDropout1D(0.2)(embedding_layer)

    r1 = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedding_layer)
    #r1 = SpatialDropout1D(0.35)(r1) # Latest Modify
    #r2 = Bidirectional(CuDNNGRU(64, return_sequences=True))(r1)
    #r2 = SpatialDropout1D(0.35)(r2)

    #rrs = concatenate([r1 ,r2], axis=-1)

    # Output of the final time step of the recurrent sequence.
    last_1 = Lambda(lambda t: t[:, -1])(r1)
    #last_2 = Lambda(lambda t: t[:, -1])(r2)
    maxpool = GlobalMaxPooling1D()(r1)
    attn = AttentionWeightedAverage()(r1)
    average = GlobalAveragePooling1D()(r1)

    concatenated = concatenate([maxpool, last_1, attn, average], axis=1)
    x = Dropout(0.5)(concatenated)
    x = Dense(144, activation="relu")(x)
    output_layer = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=[input_layer, input_layer_2], outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam_optimizer,
                  metrics=['accuracy'])
    return model

In [None]:
# 10-fold training of the RNN builder with batches of 224 rows.
models, val_loss, total_auc, fold_predictions = train_folds(
    X=data,
    pos_x=pos_data,
    y=y,
    fold_count=10,
    batch_size=224,
    get_model_func=get_av_pos_rnn,
)

In [None]:
# Fold-averaged validation loss and ROC-AUC returned by train_folds.
print("Overall val-loss:", val_loss, "AUC", total_auc) # RNN benchmark

## Predictions

In [None]:
# Stack the per-fold validation predictions, in fold order, into one array.
train_fold_preditcions = np.concatenate(fold_predictions, axis=0)

In [None]:
# The folds are consecutive slices starting at row 0 and may leave a few
# trailing rows without a prediction. Align the labels to the number of
# predictions instead of assuming exactly one row is missing.
n_predicted = len(train_fold_preditcions)
training_auc = roc_auc_score(y[:n_predicted], train_fold_preditcions)
print("Training AUC", training_auc)

In [None]:
#test_data = test_df
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submit_path_prefix = "results/rnn/fasttext-nds-SC-POV-avrnn-Voc" + str(MAX_NB_WORDS) + "-fixedpuppet-skippandall-lp-ct-" + str(MAX_SEQUENCE_LENGTH)

# Raw per-fold predictions go into their own directory, one file per fold.
predict_dir = "predict_path"
os.makedirs(predict_dir, exist_ok=True)

print("Predicting testing results...")
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_predicts = model.predict({'Onehot':test_data, 'POS':pos_test_data}, batch_size=256, verbose=1)
    test_predicts_list.append(test_predicts)
    # A path ending in "/" makes np.save target "<dir>/.npy", so every fold
    # would overwrite the same hidden file; name the file after the fold.
    np.save(os.path.join(predict_dir, "fold_{}.npy".format(fold_id)), test_predicts)

# Average the fold models' predictions.
test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts += fold_predict
test_predicts /= len(test_predicts_list)

test_ids = test_df["id"].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]
# `{:4f}` is a minimum width of 4 with the default six decimals.
submit_path = submit_path_prefix + "-L{:4f}-A{:4f}.csv".format(val_loss, total_auc)
# Create the output directory so to_csv does not fail on a fresh checkout.
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
test_predicts.to_csv(submit_path, index=False)

In [None]:
print("Predicting training results...")

train_ids = train_df["id"].values
train_ids = train_ids.reshape((len(train_ids), 1))

# The out-of-fold predictions cover the leading rows of the training set and
# may stop short of the end; take as many ids as there are predictions
# rather than assuming exactly one row is missing.
n_predicted = len(train_fold_preditcions)
train_predicts = pd.DataFrame(data=train_fold_preditcions, columns=CLASSES)
train_predicts["id"] = train_ids[:n_predicted]
train_predicts = train_predicts[["id"] + CLASSES]
submit_path = submit_path_prefix + "-Train-L{:4f}-A{:4f}.csv".format(val_loss, training_auc)
# Create the output directory so to_csv does not fail on a fresh checkout.
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
train_predicts.to_csv(submit_path, index=False)