In [None]:
import os

# Expose only GPU index 1 to CUDA and cap OpenMP at 64 threads. This runs in
# the first cell, before any of the numeric / deep-learning imports below.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": '1',
    'OMP_NUM_THREADS': '64',
})

In [None]:
import pandas as pd
import numpy as np

# Read the four input CSVs from the working directory, in a fixed order.
train, test, test_labels, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        './train.csv',
        './test.csv',
        './test_labels.csv',
        './sample_submission.csv',
    )
)

In [None]:
import re
import swifter
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.tokenize import TweetTokenizer, TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from ekphrasis.classes.segmenter import Segmenter
from textblob import Word
from ekphrasis.classes.spellcorrect import SpellCorrector


def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    The decision is made on the first letter of ``treebank_tag``:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag yields ``None``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None


def lemmatize_sentence(sentence):
    """Tokenize ``sentence`` and return its tokens as a list of lemmas.

    Tokens come from ``TweetTokenizer``, are POS-tagged with ``pos_tag`` and
    lemmatized with ``WordNetLemmatizer``; tags that ``get_wordnet_pos`` cannot
    map are treated as nouns.
    """
    tokens = TweetTokenizer().tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag) or wordnet.NOUN)
        for token, tag in pos_tag(tokens)
    ]


# Module-level state shared by clean_text():
#   sp        - spell corrector used for tokens missing from the embedding index
#   seg_tw    - word segmenter used when spell correction does not help
#   cnt       - number of clean_text() calls so far (drives the progress print)
#   all_words - memo mapping a token to the list of tokens that replaces it
sp = SpellCorrector(corpus="english") 
seg_tw = Segmenter(corpus="twitter")
cnt = 0
all_words = {}
def _resolve_token(token):
    """Return the list of tokens that should replace ``token``.

    Tries, in order: the token itself, its spell-corrected form, and finally a
    segmentation of the corrected form into sub-words. If segmentation fails
    the corrected form is kept as a single token.
    """
    if token in embeddings_index_ft:
        return [token]
    corrected = sp.correct(token)
    if corrected in embeddings_index_ft:
        return [corrected]
    try:
        return seg_tw.segment(corrected).split(' ')
    except Exception:
        # Best effort: keep the corrected token rather than abort the whole run.
        return [corrected]


def clean_text(text):
    """Normalize one raw comment into a space-joined string of tokens.

    Steps: lowercase, replace punctuation with spaces, expand common English
    contractions, lemmatize, then replace every token (of at most 50
    characters) by whatever ``_resolve_token`` returns for it. Replacements
    are memoized in the module-level ``all_words`` dict, keyed by the token as
    it appeared in the text, so each distinct token is resolved only once.

    NOTE: this function reads the module-level names ``embeddings_index_ft``,
    ``sp``, ``seg_tw`` and ``all_words`` and increments the global ``cnt``.
    ``embeddings_index_ft`` must already be loaded when this is called.

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned comment.
    """
    global cnt
    cnt += 1
    if cnt % 1000 == 0:
        print(cnt)

    text = text.lower()
    text = re.sub('["#$%&()*+,-./:;<=>@[\\]^_`{|}~\\t\\n]', ' ', text)
    text = text.replace("’", "'")

    # Order matters: "can't" has to be expanded before the generic "n't".
    contractions = (
        ("'ll", " will"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'m", " am"),
        ("'s", ""),
        ("'ve", " have"),
        ("'re", " are"),
        ("'d", " would"),  # leading space keeps "i'd" from collapsing into "iwould"
    )
    for short_form, expanded in contractions:
        text = text.replace(short_form, expanded)

    segment_words = []
    for token in lemmatize_sentence(text):
        if len(token) > 50:
            # Very long tokens are passed through untouched and not cached.
            segment_words.append(token)
            continue
        if token not in all_words:
            all_words[token] = _resolve_token(token)
        segment_words.extend(all_words[token])
    return ' '.join(segment_words)

# Clean the comments of both splits (parallelised through swifter) and replace
# any missing result with the placeholder token 'something'.
for frame in (train, test):
    cleaned = frame['comment_text'].swifter.set_npartitions(64).apply(lambda x: clean_text(str(x)))
    frame['clean_text'] = cleaned.fillna('something')

In [None]:
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob


def add_features(df):
    """Add hand-crafted text statistics and sentiment columns to ``df``.

    The frame is modified in place and also returned. ``comment_text`` is
    coerced to ``str`` first. Columns written: ``total_length``, ``capitals``,
    ``caps_vs_length``, ``num_words``, ``num_unique_words``,
    ``words_vs_unique``, ``polarity`` and ``subjectivity``.

    Ratios whose denominator is zero (empty or whitespace-only comments) come
    out as NaN instead of raising.
    """
    df['comment_text'] = df['comment_text'].apply(str)
    comments = df['comment_text']

    df['total_length'] = comments.apply(len)
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column-wise division: 0/0 on an empty comment yields NaN, where the
    # row-wise float division used to raise ZeroDivisionError.
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    df['num_words'] = comments.str.count(r'\S+')
    df['num_unique_words'] = comments.apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    df['polarity'] = comments.swifter.set_npartitions(64).apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df['subjectivity'] = comments.swifter.set_npartitions(64).apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)

    return df

train, test = add_features(train), add_features(test)

# Columns fed to the network next to the token sequences.
feature_name = ['caps_vs_length', 'words_vs_unique', 'polarity', 'subjectivity']
train_features_raw = train[feature_name].fillna(0)
test_features_raw = test[feature_name].fillna(0)

# The scaler is fitted on the train and test rows stacked together and then
# applied to each split separately.
ss = StandardScaler()
ss.fit(np.vstack((train_features_raw, test_features_raw)))
features, test_features = ss.transform(train_features_raw), ss.transform(test_features_raw)

In [None]:
# Persist both frames, including the cleaned text and the engineered columns.
for frame, out_path in ((train, './train_100.csv'), (test, './test_100.csv')):
    frame.to_csv(out_path)

In [None]:
# Text inputs are the cleaned comments; targets are the six label columns.
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
x_train, x_test = train['clean_text'], test['clean_text']
y_train, y_test = train[label_names].values, test_labels[label_names].values

In [None]:
# import tensorflow as tf
# import numpy as np

# reader = tf.train.NewCheckpointReader('./multi_cased_L-12_H-768_A-12/bert_model.ckpt')
# tensor = reader.get_tensor('bert/embeddings/word_embeddings')
# print(type(tensor), tensor.shape)

In [None]:
# f  = open('./multi_cased_L-12_H-768_A-12/vocab.txt')
# embeddings_index = {}
# for i, line in enumerate(f):
#     line = line.strip('\n')
#     embeddings_index[line] = tensor[i]

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

max_features = 300000
tokenizer = Tokenizer(
    num_words=max_features,
    filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n',
    lower=True,
)
# The vocabulary is built from the train and test texts together.
tokenizer.fit_on_texts([*x_train, *x_test])

# From here on x_train / x_test hold integer sequences instead of text, so
# this cell cannot simply be re-run on its own output.
x_train, x_test = tokenizer.texts_to_sequences(x_train), tokenizer.texts_to_sequences(x_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
from keras.preprocessing import sequence
maxlen = 1000
print('Pad sequences (samples x time)')
# Bring every sequence of both splits to exactly `maxlen` entries.
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

In [None]:
# sub_test = test['comment_text']
# sub_test = tokenizer.texts_to_sequences(sub_test)
# sub_test = sequence.pad_sequences(sub_test, maxlen=maxlen)

In [None]:
# Paths of the pretrained word-vector files (relative to the working directory):
# the FastText Web Crawl vectors (300d per the file name) and the GloVe Twitter
# vectors (200d per the file name).
EMBEDDING_FILE_FASTTEXT="./crawl-300d-2M.vec"
EMBEDDING_FILE_TWITTER="./glove.twitter.27B.200d.txt"
def get_coefs(word, *arr):
    """Turn the fields of one embedding-file line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; the remaining positional arguments are
    converted to a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build {word: float32 vector} lookups from both embedding files. The files
# are opened in `with` blocks so the handles are closed once parsing is done.
# NOTE(review): every line is stored as-is; if the FastText .vec file starts
# with a "<count> <dim>" header line it ends up as an ordinary entry — confirm.
with open(EMBEDDING_FILE_FASTTEXT, encoding='utf-8') as ft_file:
    embeddings_index_ft = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in ft_file)
with open(EMBEDDING_FILE_TWITTER, encoding='utf-8') as tw_file:
    embeddings_index_tw = dict(get_coefs(*o.strip().split()) for o in tw_file)

In [None]:
import gensim
# Load the FastText vectors a second time, as gensim KeyedVectors; the next
# cell uses the vocabulary order of this model to rank words for spelling
# correction.
spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)

In [None]:
# Map every vocabulary word to its position in the model's word list; the
# spelling helpers below prefer words with a smaller position.
words = spell_model.index2word
w_rank = {vocab_word: position for position, vocab_word in enumerate(words)}
WORDS = w_rank

def words(text):
    """Return every ``\\w+`` run of ``text``, lowercased, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def P(word):
    """Score ``word`` for ranking: the negated value stored for it in ``WORDS``.

    Words missing from ``WORDS`` score 0.
    """
    rank = WORDS.get(word, 0)
    return -rank

def correction(word):
    """Return the candidate spelling of ``word`` with the highest ``P`` score."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spellings of ``word``.

    The first non-empty set of known words wins, looking at the word itself,
    then its one-edit variants, then its two-edit variants. If none of them is
    known the result is ``[word]``.
    """
    # Each pool is built lazily so the edit sets are only generated when the
    # cheaper pools produced no known word.
    pool_builders = (
        lambda: [word],
        lambda: edits1(word),
        lambda: edits2(word),
    )
    for build_pool in pool_builders:
        hits = known(build_pool())
        if hits:
            return hits
    return [word]

def known(words):
    """Return the subset of ``words`` (as a set) that are keys of ``WORDS``."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings one edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement by a lowercase ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertions are possible at every cut, including the very end.
        for ch in alphabet:
            variants.add(head + ch + tail)
        if not tail:
            continue
        variants.add(head + tail[1:])  # deletion
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])  # transposition
        for ch in alphabet:
            variants.add(head + ch + tail[1:])  # replacement
    return variants

def edits2(word):
    """Return a generator over every string two edits away from ``word``.

    Duplicates are not removed.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

def singlify(word):
    """Collapse every run of identical consecutive characters to one character."""
    kept = []
    previous = None
    for position, letter in enumerate(word):
        if position == 0 or letter != previous:
            kept.append(letter)
        previous = letter
    return "".join(kept)

In [None]:
from textblob import Word

# seg_tw = Segmenter(corpus="twitter")
# Row layout used below: [0:300] FastText vector, [300:500] GloVe Twitter
# vector, [500] all-caps flag.
embedding_size = 501

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embedding_size))

# Fallback row for tokens that cannot be matched: the vectors of the word
# "something" with the all-caps flag cleared.
something_ft = embeddings_index_ft.get("something")
something_tw = embeddings_index_tw.get("something")

something = np.zeros((embedding_size,))
something[0:300] = something_ft
something[300:500] = something_tw
something[500] = 0

def all_caps(word):
    """Return True when ``word`` has more than one character and is uppercase."""
    if len(word) <= 1:
        return False
    return word.isupper()

def embed_word(embedding_matrix,i,word):
    """Fill row ``i`` of ``embedding_matrix`` with the vectors of ``word``.

    Nothing is written when ``word`` has no FastText vector. Otherwise columns
    ``[:300]`` receive the FastText vector, column ``500`` receives 1 if
    ``all_caps(word)`` else 0, and columns ``[300:500]`` receive the GloVe
    Twitter vector when one exists (they are left untouched otherwise).

    The matrix is modified in place; nothing is returned.
    """
    embedding_vector_ft = embeddings_index_ft.get(word)
    if embedding_vector_ft is None:
        return

    embedding_matrix[i, :300] = embedding_vector_ft
    # Write a plain scalar: storing a 1-element array into a single cell
    # relies on an array-to-scalar conversion that newer NumPy deprecates.
    embedding_matrix[i, 500] = 1.0 if all_caps(word) else 0.0

    embedding_vector_tw = embeddings_index_tw.get(word)
    if embedding_vector_tw is not None:
        embedding_matrix[i, 300:500] = embedding_vector_tw


# Fill one matrix row per vocabulary entry. Lookup order: the token itself,
# its spelling correction, the correction of the token with repeated letters
# collapsed, and finally the `something` fallback row.
for token, row in word_index.items():
    if row >= max_features:
        continue

    if embeddings_index_ft.get(token) is not None:
        embed_word(embedding_matrix, row, token)
        continue

    if len(token) > 20:
        # Tokens this long skip the spelling correction entirely.
        embedding_matrix[row] = something
        continue

    replacement = correction(token)
    if embeddings_index_ft.get(replacement) is None:
        # Second attempt only happens when the first correction has no vector.
        replacement = correction(singlify(token))

    if embeddings_index_ft.get(replacement) is not None:
        embed_word(embedding_matrix, row, replacement)
    else:
        embedding_matrix[row] = something

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import log_loss
import keras
from sklearn.metrics import roc_auc_score


tag = ''
class LossHistory(keras.callbacks.Callback):
    """Keras callback that scores the model on the test split after each epoch.

    Relies on the module-level names ``x_test``, ``test_features``, ``subm``,
    ``test_labels``, ``label_names`` and ``tag``. ``tag`` is read when the
    callback fires, so the value assigned before training starts is the one
    used for the output path.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the test inputs, write a submission CSV and print metrics.

        The file is written to ``<tag>/submission<tag>_<epoch>.csv``. For every
        label the mean squared log error is printed (rows whose true value is
        -1 are skipped), followed by the average over labels and the ROC AUC
        over all labels.
        """
        print(epoch)
        epoch = str(epoch)
        preds = self.model.predict([x_test, test_features], batch_size=64, verbose=1)
        print(preds.shape)

        submid = pd.DataFrame({'id': subm['id']})
        submission = pd.concat([submid, pd.DataFrame(preds, columns=label_names)], axis=1)
        submission.to_csv(tag + '/submission' + tag + '_' + epoch + '.csv', index=False)

        # Metrics are computed on the in-memory predictions; reading the file
        # that was just written back from disk adds nothing.
        truth = test_labels[label_names]
        avg_score = 0.0
        for label_name in label_names:
            y_last = truth[label_name].values
            scored = y_last != -1
            score = mean_squared_log_error(y_last[scored], submission[label_name][scored])
            avg_score += score
            print(label_name + ': ' + str(score))
        print(avg_score/len(label_names))

        # Keep only rows where every label is scored. This mask is computed
        # explicitly instead of reusing whatever mask the last loop iteration
        # happened to leave behind.
        scored_rows = (truth != -1).all(axis=1).values
        print(roc_auc_score(truth[scored_rows], submission[label_names][scored_rows]))

In [None]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.layers import Embedding
from keras.layers import LSTM, Bidirectional, GRU, SpatialDropout1D, CuDNNLSTM, CuDNNGRU, concatenate
from keras.layers import Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.initializers import Constant
from keras.models import Model
from keras import optimizers

# Identifier of this run; it is also the directory the callback writes into.
tag = '101'
# Create the output directory. An existing directory is fine; any other
# failure (e.g. missing permissions) surfaces here instead of after the first
# training epoch.
os.makedirs(tag, exist_ok=True)

history = LossHistory()
# NOTE(review): `checkpoint` is constructed but is not part of callbacks_list,
# so no model file is written during training — confirm this is intended.
checkpoint = ModelCheckpoint(tag + '/{epoch:02d}-{val_loss:.4f}.model', monitor='val_acc', verbose=1, period=1)
callbacks_list = [history]
# Embedding
# max_features = 20000
# maxlen = 300
# embedding_size = 512

# Convolution
# NOTE(review): kernel_size, filters and pool_size are not referenced by the
# model built later in this cell.
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
# NOTE(review): lstm_output_size is not referenced either; the recurrent
# layers below are created with 40 units.
lstm_output_size = 70

# Training
batch_size = 32
epochs = 50

# NOTE(review): the string below is a no-op expression used as a comment. It
# says 2 epochs are enough while `epochs` above is 50 — confirm which one is
# intended.
'''
Note:
batch_size is highly sensitive.
Only 2 epochs are needed as the dataset is very small.
'''

# # print('Loading data...')
# # (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# print(len(X_train), 'train sequences')
# # print(len(x_test), 'test sequences')

# print('Pad sequences (samples x time)')
# X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
# # x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
# print('x_train shape:', X_train.shape)
# # print('x_test shape:', x_test.shape)

# Build and train a two-input model: padded token ids (l0) plus the scaled
# hand-crafted features (features_input), with six sigmoid outputs.
print('Build model...')

# Token-id input of length `maxlen` and a dense input with one slot per
# engineered feature column.
l0 = Input(shape=(maxlen,))
features_input = Input(shape=(features.shape[1],))
# l1 = Embedding(num_words,
#                     embedding_size,
#                     input_length=maxlen)(l0)
# Embedding layer initialised from embedding_matrix and kept frozen
# (trainable=False).
l1 = Embedding(num_words,
                    embedding_size,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=maxlen,
                    trainable=False)(l0)
l2 = SpatialDropout1D(0.5)(l1)
# Two stacked bidirectional recurrent layers, both returning full sequences.
l3 = Bidirectional(CuDNNLSTM(40, return_sequences=True))(l2)
# NOTE(review): with return_state=True the wrapped GRU presumably yields the
# forward and backward final states as the two extra outputs (not an h/c
# pair); only l4_h is used below and l4_c is unused — confirm.
l4, l4_h, l4_c = Bidirectional(CuDNNGRU(40, return_sequences=True, return_state = True))(l3)
# model.add(Dropout(0.5))
# Pool the GRU sequence output by average and by max, then join both with the
# returned state l4_h and the engineered features.
la = GlobalAveragePooling1D()(l4)
lm = GlobalMaxPooling1D()(l4)
l5 = concatenate([la, l4_h, lm, features_input])
# One sigmoid unit per label.
l6 = Dense(6, activation='sigmoid')(l5)

# try using different optimizers and different optimizer configs
print('Train...')

model = Model([l0, features_input], l6)
# Adam optimizer with clipvalue=1.
adam = optimizers.adam(clipvalue=1)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy', 'mean_squared_logarithmic_error'])
# 10% of the training rows are held out for validation (validation_split);
# the callbacks in callbacks_list run during training.
model.fit([x_train, features], y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1,
#           validation_data=([x_test, test_features[idx]], y_test),
          callbacks=callbacks_list)
# score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
# print('Test score:', score)
# print('Test accuracy:', acc)
