Credits:
1. [Keras - Bidirectional LSTM baseline ( lb 0.051)](https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051/code)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from subprocess import check_output  # retained: other cells may still rely on this name

# Print the names of the files under ../input, one per line.
# os.listdir is used instead of shelling out to `ls`, so the cell does not
# depend on an external binary being available. Dot-files are skipped and
# the names are sorted to mirror what a plain `ls` would show.
print("\n".join(sorted(
    entry for entry in os.listdir("../input") if not entry.startswith(".")
)))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training and test CSVs into DataFrames (same reader for both),
# then preview the first rows of the training frame.
train, test = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))
train.head()

In [None]:
# Make every comment a plain Python string.
# Missing values are filled *before* the cast: str(NaN) is the literal
# "nan", which a later fillna("XX") can no longer recognise as missing, so
# casting first would leak "nan" into the corpus as if it were a real word.
train['comment_text'] = train['comment_text'].fillna("XX").apply(str)
test['comment_text'] = test['comment_text'].fillna("XX").apply(str)

In [None]:
# Label columns; this order is used for the training targets here and for
# the prediction columns written to the submission file.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Comment texts as arrays, with missing entries replaced by "XX".
train_X, test_X = (
    frame["comment_text"].fillna("XX").values for frame in (train, test)
)

# Target matrix: one column per entry of list_classes.
train_Y = train[list_classes].values

In [None]:
# Matches any single character that is not a letter a-z (case-insensitive),
# a digit, or a space.
special_character_removal = re.compile(r"[^a-z\d ]", flags=re.IGNORECASE)

# Matches each run of one or more digits.
replace_numbers = re.compile(r"\d+", flags=re.IGNORECASE)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words("english"))


@lru_cache(maxsize=None)
def _english_stemmer():
    """Return a single shared English SnowballStemmer instance."""
    return SnowballStemmer('english')


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Clean one comment and return it as a single normalised string.

    Despite the function's name, the result is a string, not a list.

    Steps, in order:
      1. lower-case the text and collapse all whitespace runs to one space;
      2. optionally drop English stop words;
      3. delete every character matched by ``special_character_removal``;
      4. replace every match of ``replace_numbers`` with the letter ``n``;
      5. optionally stem each remaining word.

    Args:
        text: The raw comment (a ``str``).
        remove_stopwords: If true, remove NLTK English stop words before
            the special-character stripping.
        stem_words: If true, stem every word with the English Snowball
            stemmer after cleaning.

    Returns:
        The cleaned comment as a space-separated string.
    """
    words = text.lower().split()

    if remove_stopwords:
        # The stop-word set is cached so it is not rebuilt for every comment.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    cleaned = " ".join(words)
    # Special characters are removed outright (not replaced by a space).
    cleaned = special_character_removal.sub('', cleaned)
    cleaned = replace_numbers.sub('n', cleaned)

    if stem_words:
        # The stemmer is cached for the same reason as the stop words.
        stemmer = _english_stemmer()
        cleaned = " ".join(stemmer.stem(word) for word in cleaned.split())

    return cleaned

In [None]:
# Clean every training and test comment with text_to_wordlist.
# Comprehensions keep the loop variable local: a module-level
# `for text in ...` loop would rebind the name `text`, which this notebook
# also imports from keras.preprocessing.
comments = [text_to_wordlist(raw_comment) for raw_comment in train_X]
test_comments = [text_to_wordlist(raw_comment) for raw_comment in test_X]

In [None]:
# Sequence length fed to the network and the vocabulary size cap passed to
# the tokenizer (and used as the embedding's input dimension in get_model).
maxlen = 120
max_features = 10000

# Build the vocabulary from the cleaned train + test comments.
token = Tokenizer(num_words=max_features)
token.fit_on_texts(comments + test_comments)

# Encode the *cleaned* comments, i.e. the same text the vocabulary was
# fitted on. Encoding the raw comments instead would discard the cleaning
# step and look words up in a vocabulary that was built from different text.
train_seq = token.texts_to_sequences(comments)
test_seq = token.texts_to_sequences(test_comments)

# Bring every sequence to exactly maxlen entries; from here on train_X and
# test_X hold the padded integer matrices rather than the raw strings.
train_X = sequence.pad_sequences(train_seq, maxlen=maxlen)
test_X = sequence.pad_sequences(test_seq, maxlen=maxlen)

In [None]:
def get_model():
    """Build and compile the bidirectional-LSTM comment classifier.

    Reads the module-level ``maxlen`` (input length) and ``max_features``
    (embedding input dimension).

    Returns:
        A compiled Keras ``Model`` taking a ``(maxlen,)`` input and
        producing 6 sigmoid outputs, compiled with binary cross-entropy
        loss, the Adam optimizer and an accuracy metric.
    """
    embed_size = 100

    token_ids = Input(shape=(maxlen, ))

    # Layers in the order they are applied to the input.
    stack = [
        Embedding(max_features, embed_size),
        Bidirectional(LSTM(50, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(50, activation="relu"),
        Dropout(0.1),
        Dense(6, activation="sigmoid"),
    ]

    outputs = token_ids
    for layer in stack:
        outputs = layer(outputs)

    classifier = Model(inputs=token_ids, outputs=outputs)
    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 3

# Checkpoint file; only the weights with the lowest validation loss are kept.
file_path = "weights_base.best.hdf5"

model = get_model()

checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# NOTE(review): patience (20) is larger than epochs (3), so with these
# settings this callback does not get a chance to stop training early.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

# The last 10% of the training data is held out for validation.
model.fit(
    train_X,
    train_Y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)


In [None]:
# Restore the checkpointed weights saved during training, then score the
# test set.
model.load_weights(file_path)
preds = model.predict(test_X)

# Fill the submission template's label columns with the predictions and
# write the result without the index column.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = preds
sample_submission.to_csv("starter_lstm.csv", index=False)