In [None]:
import os
os.environ['OMP_NUM_THREADS'] = '4'

import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Concatenate, Conv1D, Activation, TimeDistributed, Flatten, RepeatVector, Permute,multiply
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, Dropout, GRU, GlobalAveragePooling1D, MaxPooling1D, SpatialDropout1D, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [None]:
# --- Hyper-parameters shared by the rest of the notebook ---------------------
# Defined before the embedding load because the parser needs `embed_size`.
min_count = 10  # the minimum required word frequency in the text
max_features = 120000  # it's from previous run with min_count=10
maxlen = 180  # padding length
num_folds = 5  # number of folds
embed_size = 300  # embeddings dimension

print('loading embeddings vectors')


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path, dim):
    """Read a whitespace-separated embedding text file into a dict.

    Each line is expected to hold a token followed by ``dim`` numbers.

    Args:
        path: Path of the UTF-8 encoded embeddings file.
        dim: Number of vector components per line.

    Returns:
        dict mapping token -> float32 numpy vector of length ``dim``.
    """
    index = {}
    # `with` guarantees the (very large) file is closed once parsing is done.
    with open(path, encoding="utf8") as handle:
        for line in handle:
            # Split from the right so a token that itself contains spaces
            # stays in one piece instead of being parsed as numbers.
            fields = line.rstrip('\n').rsplit(' ', dim)
            if len(fields) != dim + 1:
                # Blank or truncated line: skipping it keeps every stored
                # vector the same length for the later np.stack call.
                continue
            token, vector = get_coefs(*fields)
            index[token] = vector
    return index


embeddings_index = load_embeddings('glove.840B.300d.txt', embed_size)

#sia = SentimentIntensityAnalyzer()

# Load the labelled training set and the unlabelled test set.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Target columns; `y` is one binary column per label.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment strings, with missing comments replaced by empty strings.
list_sentences_train = train["comment_text"].fillna("").values
list_sentences_test = test["comment_text"].fillna("").values

# Number of whitespace-delimited tokens per training comment.
# The regex is a raw string: '\S' is not a valid escape in a normal literal.
train_word_counts = train["comment_text"].str.count(r'\S+')
print('mean text len:', train_word_counts.mean())
print('max text len:', train_word_counts.max())

In [None]:
# Build the word vocabulary from the training comments only; the variant that
# also adds the test comments (`+ list(list_sentences_test)`) is disabled.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

print('padding sequences')
X_train = {}
X_test = {}
# Both model inputs come from the same token sequences and differ only in
# which side is padded/truncated:
#   'text' -> pad/truncate at the end   ('post')
#   'char' -> pad/truncate at the start ('pre')
# The length is `maxlen` for both; the original referenced an undefined name
# `max_len` for the 'char' input, which raised NameError.
for input_name, side in (('text', 'post'), ('char', 'pre')):
    X_train[input_name] = sequence.pad_sequences(
        list_tokenized_train, maxlen=maxlen, padding=side, truncating=side)
    X_test[input_name] = sequence.pad_sequences(
        list_tokenized_test, maxlen=maxlen, padding=side, truncating=side)

print('numerical variables')

# Column order fed to the scaler; kept identical for train and test.
num_var_columns = ['num_words', 'num_comas', 'num_bangs', 'num_quotas', 'avg_word']


def add_numeric_features(df):
    """Add simple per-comment count features to ``df`` in place.

    Columns written (names kept for compatibility with the rest of the file):
        num_words  - whitespace-delimited token count
        num_comas  - number of '.' characters (periods, despite the name)
        num_bangs  - number of '!' characters
        num_quotas - number of '"' characters
        avg_word   - comment length in characters / (1 + num_words)

    Missing comments are treated as empty strings so no feature becomes NaN,
    matching how the tokenizer input is prepared.

    Args:
        df: DataFrame with a ``comment_text`` column.

    Returns:
        The same DataFrame, for convenience.
    """
    comments = df['comment_text'].fillna('')
    # Raw-string regexes: '\S' and '\.' are not valid escapes in plain literals.
    df['num_words'] = comments.str.count(r'\S+')
    df['num_comas'] = comments.str.count(r'\.')
    df['num_bangs'] = comments.str.count(r'!')
    df['num_quotas'] = comments.str.count(r'"')
    # The +1 in the denominator avoids division by zero for empty comments.
    df['avg_word'] = comments.str.len() / (1 + df['num_words'])
    return df


add_numeric_features(train)
add_numeric_features(test)
#print('sentiment')
#train['sentiment'] = train.comment_text.apply(lambda s : sia.polarity_scores(s)['compound'])
#test['sentiment'] = test.comment_text.apply(lambda s : sia.polarity_scores(s)['compound'])

# Scale every feature to [0, 1] using ranges learned on the training set only.
scaler = MinMaxScaler()
X_train['num_vars'] = scaler.fit_transform(train[num_var_columns])
X_test['num_vars'] = scaler.transform(test[num_var_columns])

# Global mean/std over all pretrained components; used below to initialise
# rows for words that have no pretrained vector.
# np.stack needs a real sequence, so the dict view is converted to a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

print('create embedding matrix')
word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows; the matrix is capped at `max_features` rows.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard with the actual row count so `i` can never index past the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# The second ("char") input reuses the word-level vocabulary size and padding
# length; the dedicated second tokenizer below is disabled.
max_features_char, maxlen_char = max_features, maxlen
#tokenizer2 = Tokenizer(num_words=max_features)
#tokenizer2.fit_on_texts(list(list_sentences_train)) # + list(list_sentences_test)
#list_tokenized_train2 = tokenizer2.texts_to_sequences(list_sentences_train)
#list_tokenized_test2 = tokenizer2.texts_to_sequences(list_sentences_test)

In [None]:
# Training schedule.
batch_size, epochs = 32, 4

# Convolution settings: one Conv1D of `num_filters` filters per kernel size.
num_filters = 32
filter_sizes = [1, 2, 3, 5]

# The "char" branch shares the word branch's settings
# (`filter_sizes_char` is the same list object as `filter_sizes`).
filter_sizes_char = filter_sizes
embed_size_char = embed_size

In [None]:
def _conv_maxpool_branch(embedded, kernel_sizes, seq_len):
    """Apply one Conv1D + max-pool per kernel size and merge the results.

    Args:
        embedded: 3-D Keras tensor produced by an embedding layer.
        kernel_sizes: Iterable of Conv1D kernel sizes; one conv per entry.
        seq_len: Length of the time axis of ``embedded``. The pool size
            ``seq_len - k + 1`` equals the output length of a kernel-``k``
            convolution with default padding/stride, so each pool collapses
            its conv output to a single step.

    Returns:
        Flattened, dropout-regularised concatenation of all pooled outputs.
    """
    # All convolutions are created before the pools, as in the original layout.
    convs = [
        Conv1D(num_filters, kernel_size=k, kernel_initializer='normal',
               activation='elu')(embedded)
        for k in kernel_sizes
    ]
    pooled = [
        MaxPooling1D(pool_size=seq_len - k + 1)(conv)
        for k, conv in zip(kernel_sizes, convs)
    ]
    merged = Concatenate(axis=1)(pooled)
    merged = Flatten()(merged)
    return Dropout(0.1)(merged)


def get_model_cnn(X_train):
    """Build and compile the two-branch multi-kernel CNN classifier.

    The model has two integer-sequence inputs:
      * "text": length ``maxlen``, embedded with the frozen pretrained
        ``embedding_matrix``.
      * "char": length ``X_train["char"].shape[1]``, embedded with a trainable,
        randomly initialised embedding.
    Each branch runs parallel Conv1D layers (one per kernel size) followed by
    max-pooling; both branches are concatenated and fed to a 6-unit sigmoid
    output layer.

    Args:
        X_train: dict of input arrays; only ``X_train["char"].shape[1]`` is
            read, to size the "char" input.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    char_len = X_train["char"].shape[1]
    inp = Input(shape=(maxlen, ), name="text")
    inp2 = Input(shape=[char_len], name="char")

    # "char" branch: trainable embedding learned from scratch.
    char_emb = Embedding(max_features_char, embed_size_char)(inp2)
    char_emb = SpatialDropout1D(0.5)(char_emb)
    # Pool over the real input length so the pooling stays global even if
    # `maxlen_char` and the padded array ever disagree.
    z_char = _conv_maxpool_branch(char_emb, filter_sizes_char, char_len)

    # "text" branch: frozen pretrained embedding. The layer is sized from the
    # weight matrix itself so the two can never mismatch.
    x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                  weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.4)(x)
    z = _conv_maxpool_branch(x, filter_sizes, maxlen)

    outp = Concatenate()([z, z_char])
    outp = Dropout(0.4)(outp)
    outp = Dense(6, activation="sigmoid")(outp)

    model = Model(inputs=[inp, inp2], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

print('start modeling')
scores = []
num_classes = y.shape[1]
# `predict` accumulates the fold-averaged test predictions;
# `oof_predict` stores each training row's out-of-fold prediction.
predict = np.zeros((test.shape[0], num_classes))
oof_predict = np.zeros((train.shape[0], num_classes))

# The model declares exactly these named inputs. 'num_vars' matches no Input
# layer, so it is not passed to fit/predict.
model_inputs = ['text', 'char']
X_test_model = {name: X_test[name] for name in model_inputs}

kf = KFold(n_splits=num_folds, shuffle=True, random_state=6666)
# KFold only uses the number of rows, so any per-row array yields the same folds.
for train_index, test_index in kf.split(X_train['text']):
    kfold_X_train = {name: X_train[name][train_index] for name in model_inputs}
    kfold_X_valid = {name: X_train[name][test_index] for name in model_inputs}
    y_train, y_test = y[train_index], y[test_index]

    # A fresh model per fold, so no weights carry over between folds.
    model = get_model_cnn(X_train)
    model.fit(kfold_X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

    # Each fold contributes 1/num_folds of the final test prediction.
    predict += model.predict(X_test_model, batch_size=1000) / num_folds
    oof_predict[test_index] = model.predict(kfold_X_valid, batch_size=1000)

    cv_score = roc_auc_score(y_test, oof_predict[test_index])
    scores.append(cv_score)
    print('score: ',cv_score)

print('Total CV score is {}'.format(np.mean(scores)))