In [18]:
from __future__ import print_function
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras import regularizers

from keras.callbacks import EarlyStopping, ModelCheckpoint

# Data

In [2]:
# Input locations. Every file path below is derived from the shared
# input directory prefix.
# NOTE(review): EMBEDDING_FILE is not referenced by any of the cells visible
# in this notebook export — confirm whether the embeddings are loaded elsewhere.
path = '../input/'
EMBEDDING_FILE = '{}glove.6B/glove.6B.300d.txt'.format(path)
TRAIN_DATA_FILE = '{}train.csv'.format(path)
TEST_DATA_FILE = '{}test.csv'.format(path)

In [3]:
# Read the raw train/test tables from the configured CSV paths.
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

# Target columns; `labels` holds one column per entry, in this order.
class_list = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
labels = train_df[class_list].values

# Comment text as arrays; missing comments are replaced by the "_na_"
# placeholder so that every row has a string to tokenize.
list_sentences_train = train_df["comment_text"].fillna("_na_").values
list_sentences_test = test_df["comment_text"].fillna("_na_").values

In [4]:
# Embedding / vocabulary settings
max_features = 20000   # passed as Tokenizer(num_words=...) and as the Embedding input size
maxlen = 100           # sequence length used for padding and for the Embedding input_length
embedding_size = 300   # width of each embedding vector

In [5]:
# Build the vocabulary from the training comments only; the test comments
# are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Train: text -> integer id sequences -> fixed-length (maxlen) arrays.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
features_train = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test: identical transformation.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
features_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

# Model

In [35]:
# Hyperparameters for the CNN + LSTM model and its training run.

# Convolution
kernel_size = 5    # width of the 1D convolution window
filters = 128      # number of convolution filters
pool_size = 4      # max-pooling window applied after the convolution

# LSTM
lstm_output_size = 100

# Training
# Note: batch_size is highly sensitive.
# NOTE(review): the note that used to sit here as a bare string literal said
# "Only 2 epochs are needed as the dataset is very small", but `epochs` is set
# to 5 — confirm which value is intended. The note is kept as comments so the
# cell no longer echoes the string as its output.
batch_size = 32
epochs = 5

'\nNote:\nbatch_size is highly sensitive.\nOnly 2 epochs are needed as the dataset is very small.\n'

In [39]:
print('Build model...')

# Layer stack: embedding -> 1D convolution -> max pooling -> LSTM -> dropout
# -> two dense layers -> sigmoid over the 6 outputs.
# NOTE(review): l1_l2 is given a single positional value, so only its first
# parameter is set explicitly and the other keeps the library default —
# confirm this is the intended regularization strength.
model = Sequential([
    Embedding(input_dim=max_features,
              output_dim=embedding_size,
              input_length=maxlen),
    Conv1D(filters=filters,
           kernel_size=kernel_size,
           strides=1,
           padding='valid',
           activation='relu'),
    MaxPooling1D(pool_size=pool_size),
    LSTM(lstm_output_size),
    Dropout(0.2),
    Dense(12, activity_regularizer=regularizers.l1_l2(0.00004)),
    Dense(6, activity_regularizer=regularizers.l1_l2(0.00004)),
    Activation('sigmoid'),
])

# One independent sigmoid/binary cross-entropy term per output column.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Build model...


In [42]:
# summary() prints the layer table itself; wrapping it in print() would also
# print its return value, adding a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 300)          6000000   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 96, 128)           192128    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 24, 128)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               91600     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 12)                1212      
_________________________________________________________________
dense_10 (Dense)             (None, 6)                 78        
__________

# Training

In [40]:
# Run identifier; the checkpoint file name is derived from it.
STAMP = 'cnn_lstm_Glove_0228'
bst_model_path = STAMP + '.h5'

# Stop training when validation accuracy stops improving.
# NOTE(review): patience equals the configured number of epochs (5), so this
# callback may never trigger in practice — confirm.
early_stopping = EarlyStopping(monitor='val_acc', patience=5)

# Keep only the best weights (weights only, not the full model) on disk.
# NOTE(review): no `monitor` is passed here, so the checkpoint uses the
# library default, which may differ from the 'val_acc' used for early
# stopping — confirm the two are meant to track the same quantity.
model_checkpoint = ModelCheckpoint(
    filepath=bst_model_path,
    save_best_only=True,
    save_weights_only=True,
)

In [41]:
print('Train...')

# Fit on the padded training sequences; the last 5% of the data is held out
# for validation, and both callbacks run at the end of every epoch.
hist = model.fit(
    x=features_train,
    y=labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[early_stopping, model_checkpoint],
)

Train...
Train on 151592 samples, validate on 7979 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Submission

In [None]:
# Restore the weights stored at bst_model_path (the file the checkpoint
# callback writes to) before predicting.
model.load_weights(bst_model_path)

# Accuracy is higher-is-better, so the best epoch is the one with the
# maximum validation accuracy (min() would report the worst epoch).
# NOTE(review): the restored checkpoint is chosen by the checkpoint
# callback's own monitor, so it may not be the epoch reported here — confirm.
bst_val_score = max(hist.history['val_acc'])
print("Model val_acc", bst_val_score)

In [32]:
# Load the submission template; its class columns are overwritten below.
sample_submission = pd.read_csv(path+'sample_submission.csv')

# Predicted values for the test comments, one column per entry of class_list.
y_test = model.predict([features_test], batch_size=1024, verbose=1)

# Fill the template with the predictions and write it out without the index.
# NOTE(review): the '../output/' directory must already exist for the write
# to succeed — confirm it is created beforehand.
sample_submission[class_list] = y_test
sample_submission.to_csv('../output/4_cnn_lstm_glove0228_300.csv', index=False)

