# Loading

In [1]:
import sys, os, re, csv, codecs
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras import initializers, regularizers, constraints, optimizers, layers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Root folder that holds the competition CSVs and the pretrained vectors.
path = '../input/'

# Input files, each resolved by appending its relative name to `path`.
EMBEDDING_FILE = ''.join([path, 'glove.6B/glove.6B.300d.txt'])
TRAIN_DATA_FILE = ''.join([path, 'train.csv'])
TEST_DATA_FILE = ''.join([path, 'test.csv'])

In [3]:
# Read the raw train/test tables from the configured CSV paths.
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

# Target columns, in the column order used for `labels`.
class_list = [
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]
labels = train_df[class_list].values

# Comment text for both splits; missing comments are replaced by "_na_".
list_sentences_train = train_df["comment_text"].fillna("_na_").values
list_sentences_test = test_df["comment_text"].fillna("_na_").values

# Preprocessing

In [4]:
### basic config param
maxlen = 100          # length every tokenized comment is padded/truncated to
max_features = 20000  # vocabulary cap: Tokenizer num_words and embedding rows
embed_size = 300      # embedding width; the GloVe file configured above is 300d

### Generate sentence feature

In [5]:
# Fit the vocabulary on the training comments only; `num_words` caps how many
# words are used when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training split: text -> index sequences -> fixed-length (maxlen) matrix.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
features_train = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test split goes through the same fitted tokenizer and padding.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
features_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [6]:
# Sanity check: display the shape of the padded training feature matrix.
features_train.shape

(159571, 100)

### Read in Glove

In [7]:
def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, vector)`` pair.

    Args:
        word: The token; returned unchanged.
        *arr: The remaining fields (coefficients), converted together into
            one float32 NumPy array.

    Returns:
        A 2-tuple ``(word, vector)`` suitable for feeding to ``dict(...)``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [8]:
# Parse the embedding text file into {word: float32 vector}.
# Assumes the usual GloVe text layout of "<word> <v1> ... <vN>" separated by
# single spaces, with N == embed_size -- TODO confirm for other embedding files.
# * The file is opened in a `with` block so the handle is closed as soon as
#   parsing finishes (the bare open() in a generator was never closed).
# * Splitting from the right on ' ' at most `embed_size` times keeps a token
#   intact even if it contains whitespace-like characters; a bare split()
#   would break such a token apart and push text into the numeric fields.
with open(EMBEDDING_FILE, encoding="utf-8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ', embed_size))
        for line in embedding_file)

Create the embedding matrix, with random initialization (drawn using the mean and standard deviation of the GloVe vectors) for words that are not in GloVe.

In [9]:
# Mean and standard deviation over every coefficient of every loaded vector;
# these parameterize the random initialization of the embedding matrix.
# np.stack requires a real sequence: passing the dict view directly is
# deprecated since NumPy 1.16 and raises TypeError on newer releases, so the
# values are materialized into a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [10]:
# Placeholder matrix: one row per vocabulary slot, filled with random values
# drawn from a normal distribution with the GloVe mean and std.
word_index = tokenizer.word_index
# NOTE(review): in the visible notebook nb_words is only referenced by
# commented-out code; the matrix below is sized by max_features.
nb_words = max(max_features, len(word_index))
embedding_matrix = np.random.normal(
    loc=emb_mean, scale=emb_std, size=(max_features, embed_size))

In [11]:
# Insert GloVe word vectors into the embedding matrix according to word index.
# Only indices below max_features have a row; words with no GloVe entry keep
# the random row they were initialized with.
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [12]:
# Sanity check: display the shape of the embedding matrix.
embedding_matrix.shape

(20000, 300)

# Model

In [13]:
def get_model(n_units):
    """Build and compile the bidirectional-LSTM classifier with pooled outputs.

    The network embeds the padded token sequence using the module-level
    ``embedding_matrix`` as initial weights, runs a bidirectional LSTM that
    returns the full sequence, concatenates global average and global max
    pooling over that sequence, and finishes with a 50-unit ReLU layer,
    dropout, and one sigmoid output per entry in ``class_list``.

    Reads the module-level names ``maxlen``, ``max_features``, ``embed_size``,
    ``embedding_matrix`` and ``class_list``.

    Args:
        n_units: Number of LSTM units per direction.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric). The layer summary is printed as a side
        effect.
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(n_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    x = Dense(50, activation="relu")(conc)
    x = Dropout(0.1)(x)
    # One sigmoid per label column so the head always matches `labels`.
    x = Dense(len(class_list), activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [14]:
# Build the classifier with 100 LSTM units per direction; get_model also prints the layer summary.
model = get_model(100)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 100, 300)      6000000     input_1[0][0]                    
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 100, 200)      320800      embedding_1[0][0]                
____________________________________________________________________________________________________
global_average_pooling1d_1 (Glob (None, 200)           0           bidirectional_1[0][0]            
___________________________________________________________________________________________

# Training

Note: passing a pandas DataFrame to Keras as input raises a `KeyError`; do NOT use a DataFrame, pass NumPy arrays instead.

In [15]:
# Run identifier; also the stem of the checkpoint file name.
STAMP = 'pool_lstm_Glove_0228'
bst_model_path = STAMP + '.h5'

# Stop training when val_loss has not improved for `patience` epochs.
# NOTE(review): patience=5 is larger than the epochs = 4 set in the training
# config, so early stopping cannot trigger in this run -- confirm intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Write weights only (not the full model) to bst_model_path, keeping only the
# best checkpoint.
model_checkpoint = ModelCheckpoint(
    bst_model_path, save_best_only=True, save_weights_only=True)

In [16]:
# Training hyper-parameters passed to model.fit.
epochs = 4
batch_size = 32

In [17]:
# Fit on the padded training sequences, holding out 10% of the samples for
# validation; the callbacks handle early stopping and checkpointing.
hist = model.fit(
    x=features_train,
    y=labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping, model_checkpoint],
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# Generate submission

In [18]:
# Restore the weights that model_checkpoint (save_best_only=True) wrote to
# bst_model_path, so prediction uses the checkpointed weights rather than the
# weights from the final epoch.
model.load_weights(bst_model_path)
#bst_val_score = min(hist.history['val_loss'])
#print("Model val_score", bst_val_score)

In [20]:
# Predict scores for the test set and place them in the label columns of the
# sample submission frame, which supplies the remaining columns.
y_test = model.predict([features_test], batch_size=1024, verbose=1)
sample_submission = pd.read_csv(path+'sample_submission.csv')
sample_submission[class_list] = y_test

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing; otherwise the finished predictions
# would be lost to an OSError at the very last step.
submission_path = '../output/4_pool_lstm_glove0302_300.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sample_submission.to_csv(submission_path, index=False)

