# Loading

In [30]:
import sys, os, re, csv, codecs
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras import initializers, regularizers, constraints, optimizers, layers
from keras import regularizers

In [2]:
path = '../input/'
EMBEDDING_FILE = path + 'glove.6B/glove.6B.300d.txt'
TRAIN_DATA_FILE = path + 'train.csv'
TEST_DATA_FILE = path + 'test.csv'

In [3]:
EXTRAIN_DATA_FILE = path
EXTRA_DAT = False

In [4]:
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

if EXTRA_DAT:
    extra_df = pd.read_csv(EXTRAIN_DATA_FILE)
    
list_sentences_train = train_df["comment_text"].fillna("_na_").values

class_list = ["toxic", "severe_toxic", "obscene", 
              "threat", "insult", "identity_hate"]
labels = train_df[class_list].values

list_sentences_test = test_df["comment_text"].fillna("_na_").values

# Preprocessing

In [5]:
### basic config param
embed_size = 300
max_features = 20000
maxlen = 200

### Generate sentence feature

In [6]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

features_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
features_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
features_train.shape

(159571, 200)

### Read in Glove

In [9]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(
    get_coefs(*o.strip().split()
             ) for o in open(EMBEDDING_FILE, encoding="utf-8"))

Create embedding matrix, with random initialization for words not in glove

In [11]:
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [17]:
# generate random number matrix as place holder
word_index = tokenizer.word_index
nb_words = max(max_features, len(word_index))
#embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
embedding_matrix = np.zeros((max_features, embed_size))

# insert glove word vectors into the embedding matrix accoding to word index
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [19]:
embedding_matrix.shape

(20000, 300)

# Model

In [31]:
def get_model(n_units):
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    #x = Embedding(nb_words, embed_size)(inp)
    #x = Bidirectional(LSTM(n_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Bidirectional(LSTM(n_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    x = Dropout(0.25)(x)
    x = Dense(50, activation="relu", activity_regularizer=regularizers.l1_l2(0.00002))(conc)
    x = Dropout(0.2)(x)
    x = Dense(6, activation="sigmoid", activity_regularizer=regularizers.l1_l2(0.00002))(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    return model

In [32]:
model = get_model(100)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 200)           0                                            
____________________________________________________________________________________________________
embedding_5 (Embedding)          (None, 200, 300)      6000000     input_5[0][0]                    
____________________________________________________________________________________________________
bidirectional_4 (Bidirectional)  (None, 200, 200)      320800      embedding_5[0][0]                
____________________________________________________________________________________________________
global_average_pooling1d_4 (Glob (None, 200)           0           bidirectional_4[0][0]            
___________________________________________________________________________________________

# Training

keras KeyError: do NOT use dataframe as input, but use np array

In [33]:
STAMP = 'pool_lstm_Glove_0301'
early_stopping =EarlyStopping(monitor='val_loss', patience=5)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

In [34]:
batch_size = 32
epochs = 4

In [None]:
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(len(X_tra)/batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0005
lr_decay = exp_decay(lr_init, lr_fin, steps)
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

In [35]:
hist = model.fit(features_train, labels, 
          batch_size=batch_size, epochs=epochs, 
          validation_split=0.1,
          callbacks=[early_stopping, model_checkpoint])

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
 28512/143613 [====>.........................] - ETA: 1889s - loss: 0.0931 - acc: 0.9679

KeyboardInterrupt: 

# Generate submission

In [21]:
model.load_weights(bst_model_path)
#bst_val_score = min(hist.history['val_loss'])

In [23]:
y_test = model.predict([features_test], batch_size=1024, verbose=1)

ResourceExhaustedError: OOM when allocating tensor with shape[1024,400,300]
	 [[Node: bidirectional_1/Tile_10 = Tile[T=DT_FLOAT, Tmultiples=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"](bidirectional_1/ExpandDims_8, bidirectional_1/stack_12)]]
	 [[Node: dense_2/Sigmoid/_113 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_1430_dense_2/Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'bidirectional_1/Tile_10', defined at:
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\IPython\core\interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-5da41a563900>", line 1, in <module>
    model = get_model(100)
  File "<ipython-input-15-b0a0be2c1baf>", line 5, in get_model
    x = Bidirectional(LSTM(n_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\keras\engine\topology.py", line 596, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\keras\layers\wrappers.py", line 280, in call
    y_rev = self.backward_layer.call(inputs, **kwargs)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\keras\layers\recurrent.py", line 333, in call
    preprocessed_input = self.preprocess_input(inputs, training=None)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\keras\layers\recurrent.py", line 1083, in preprocess_input
    timesteps, training=training)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\keras\layers\recurrent.py", line 45, in _time_distributed_dense
    expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\keras\backend\tensorflow_backend.py", line 1867, in repeat
    return tf.tile(x, pattern)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 3677, in tile
    name=name)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Users\wszjz\AppData\Local\conda\conda\envs\tfgpu\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1024,400,300]
	 [[Node: bidirectional_1/Tile_10 = Tile[T=DT_FLOAT, Tmultiples=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"](bidirectional_1/ExpandDims_8, bidirectional_1/stack_12)]]
	 [[Node: dense_2/Sigmoid/_113 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_1430_dense_2/Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


In [24]:
y_test = model.predict(features_test)

In [25]:
sample_submission = pd.read_csv(path+'sample_submission.csv')

In [26]:
sample_submission[class_list] = y_test
sample_submission.to_csv('../output/J1_lstm_glove0226_100.csv', index=False)