In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten,Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras import regularizers

from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load

In [2]:
# Root directory that holds the competition CSVs and the GloVe vectors.
# The trailing slash is part of the value and is kept as-is.
path = '../input/'

# Input files, all resolved relative to `path`.
TRAIN_DATA_FILE = os.path.join(path, 'train.csv')
TEST_DATA_FILE = os.path.join(path, 'test.csv')
EMBEDDING_FILE = os.path.join(path, 'glove.6B/glove.6B.300d.txt')

In [3]:
# Switch for loading an additional CSV next to the train/test data (off here).
EXTRA_DAT = False
# Location handed to pd.read_csv when EXTRA_DAT is enabled.
# NOTE(review): this is currently just the input directory, not a file name —
# set it to an actual CSV path before turning EXTRA_DAT on.
EXTRAIN_DATA_FILE = path

In [4]:
# Target columns: one binary label per toxicity category.
class_list = ["toxic", "severe_toxic", "obscene",
              "threat", "insult", "identity_hate"]

# Read the raw train/test tables (and the optional extra table when enabled).
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)
if EXTRA_DAT:
    extra_df = pd.read_csv(EXTRAIN_DATA_FILE)

# Label matrix: one row per training comment, one column per entry of class_list.
labels = train_df[class_list].values

# Comment texts as arrays; missing comments are replaced by the "_na_" marker.
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_na_").values
    for frame in (train_df, test_df)
)

# Embedding

In [11]:
### basic config param
# Vocabulary cap passed to the tokenizer and used as the embedding row count.
max_features = 20000
# Length every tokenized comment is padded / truncated to.
maxlen = 800
# Width of each word vector (matches the 300d GloVe file).
embed_size = 300

In [12]:
# Build the vocabulary from the training comments only; the test comments are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training set: texts -> integer sequences -> fixed-length (maxlen) rows.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
features_train = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test set: same two steps.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
features_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [13]:
def get_coefs(word, *values):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *values: The remaining fields, each convertible to a float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        holding ``values`` in order.
    """
    vector = np.asarray(values, dtype=np.float32)
    return word, vector

# Parse the embedding text file into a {token: float32 vector} lookup.
# Each line is whitespace-split: the first field is the token and the
# remaining fields are its coefficients (see get_coefs).
# The file is opened in a `with` block so the handle is closed as soon as the
# dictionary has been built, instead of being left open for the kernel's lifetime.
with open(EMBEDDING_FILE, encoding="utf-8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in embedding_file)

In [14]:
# Start from an all-zero (max_features x embed_size) matrix; rows for words
# without a pre-trained vector stay zero.
word_index = tokenizer.word_index
# NOTE(review): nb_words is not used anywhere in this section, and taking the
# max (rather than the min) of the two sizes looks unintended — confirm.
nb_words = max(max_features, len(word_index))
embedding_matrix = np.zeros((max_features, embed_size))

# Copy each known word's pre-trained vector into the row given by its
# tokenizer index, ignoring indices that fall outside the matrix.
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Model

In [22]:
print('Training model.')

# 1D convnet over the embedded token sequence: three Conv1D + MaxPooling1D
# stages, then Flatten and two dense layers.
# Input: one row of `maxlen` integer token ids per comment.
sequence_input = Input(shape=(maxlen,), dtype='int32')

# Embedding layer initialised from the pre-trained embedding matrix.
# trainable=True, so these weights are updated during training rather than
# kept frozen.
embedding_layer = Embedding(max_features,
                            embed_size,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True)
embedded_sequences = embedding_layer(sequence_input)

# Convolution stack: kernel size 5 throughout, filters shrinking 512 -> 256 -> 128,
# each followed by max pooling with pool size 5.
x = Conv1D(512, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Flatten()(x)
#x = Dropout(0.8)(x)
x = Dense(128, activation='relu')(x)
#x = Dropout(0.8)(x)

# One independent sigmoid output per label column (multi-label setup), hence
# binary cross-entropy as the loss.
preds = Dense(len(class_list), activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would add a stray "None" line.
model.summary()

Training model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 800)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 800, 300)          6000000   
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 796, 512)          768512    
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 159, 512)          0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 155, 256)          655616    
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 31, 256)           0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 27, 128)           16396

# Training

In [23]:
# Number of passes over the training data.
epochs = 4
# Samples per gradient update.
batch_size = 32

In [None]:
def exp_decay(init, fin, steps):
    """Return ``(init / fin) ** (1 / (steps - 1)) - 1``.

    Args:
        init: Starting value (here: the initial learning rate).
        fin: Target value (here: the final learning rate).
        steps: Number of steps the schedule spans; ``steps == 1`` divides by zero.

    Returns:
        The per-step decay coefficient as a float.
    """
    ratio = init / fin
    exponent = 1 / (steps - 1)
    return ratio ** exponent - 1
# Number of optimizer updates the schedule is spread over: whole batches per
# pass over features_train, times the number of epochs.
# NOTE(review): this counts every row of features_train, while the fit call
# holds part of them out via validation_split — confirm the intended count.
steps = int(len(features_train) / batch_size) * epochs

# Start and target learning rates, and the decay coefficient derived from them.
lr_init = 0.001
lr_fin = 0.00005
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Write both values onto the compiled model's optimizer.
# NOTE(review): Keras' built-in `decay` is believed to scale the rate as
# lr / (1 + decay * iterations), not exponentially — verify that this
# coefficient really brings the rate down to lr_fin by the last step.
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

In [24]:
# Train on the padded sequences; the last 10% of the rows are held out for
# validation. The History object returned by fit is the cell's output.
model.fit(x=features_train,
          y=labels,
          validation_split=0.1,
          epochs=epochs,
          batch_size=batch_size)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x17fd51c6e10>