In [29]:
import numpy as np
import pandas as pd

import gzip
import os
import gc

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Activation, Bidirectional, InputLayer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [3]:
from gensim.models import Word2Vec, KeyedVectors



In [4]:
# Model/training settings looked up by key in the cells below.
# NOTE(review): 'filters', 'kernel_size', 'conv_activation' and 'dense_activation'
# are not read by any active (non-commented) code in the cells visible here.
hyperparam = dict(
    embedding_dim=300,
    filters=100,
    kernel_size=3,
    dropout=0.7,
    n_class=6,
    conv_activation='relu',
    dense_activation='relu',
    batch_size=128,
)

In [5]:
# Number of word-vector rows kept per comment: pad_zero / pad_nan pad or truncate to this length.
maxlen = 100

In [6]:
def get_wordvectors(words):
    """Look up embedding vectors for a collection of tokens.

    Reads the module-level ``word_vec`` embeddings.

    Parameters
    ----------
    words : iterable of tokens to look up (the notebook passes one tokenised
        comment per call).

    Returns
    -------
    The result of ``word_vec.wv[...]`` for the tokens. When the direct lookup
    raises ``KeyError`` the lookup is retried with only the tokens present in
    ``word_vec.wv.vocab``. ``None`` is returned when the lookup raises
    ``ValueError`` or when none of the tokens is in the vocabulary.
    """
    try:
        return word_vec.wv[words]
    except ValueError:
        # presumably raised for an empty token list -- TODO confirm
        return None
    except KeyError:
        # At least one token is out of vocabulary: keep only the known ones.
        known = [token for token in words if token in word_vec.wv.vocab]
    if not known:
        return None
    return word_vec.wv[known]

In [7]:
def pad_zero(arr, max_len=None):
    """Pad (with zero rows) or truncate a 2-D array to exactly ``max_len`` rows.

    Parameters
    ----------
    arr : 2-D numpy array, one row per token.
    max_len : target number of rows; defaults to the notebook-level ``maxlen``
        when omitted, which keeps existing ``pad_zero(arr)`` calls unchanged.

    Returns
    -------
    Array with ``max_len`` rows and the same number of columns as ``arr``.
    """
    if max_len is None:
        max_len = maxlen
    pad_len = max(max_len - arr.shape[0], 0)
    if pad_len > 0:
        # Pad width follows the input instead of a global setting, so the helper
        # works for any embedding size. np.zeros defaults to float64, as before.
        arr = np.vstack((arr, np.zeros((pad_len, arr.shape[1]))))
    # Slicing is a no-op for short inputs and truncates long ones.
    return arr[:max_len, :]

In [8]:
def pad_nan(arr, max_len=None):
    """Pad (with NaN rows) or truncate a 2-D array to exactly ``max_len`` rows.

    Parameters
    ----------
    arr : 2-D numpy array, one row per token.
    max_len : target number of rows; defaults to the notebook-level ``maxlen``
        when omitted, which keeps existing ``pad_nan(arr)`` calls unchanged.

    Returns
    -------
    Array with ``max_len`` rows and the same number of columns as ``arr``;
    rows added as padding contain only NaN.
    """
    if max_len is None:
        max_len = maxlen
    pad_len = max(max_len - arr.shape[0], 0)
    if pad_len > 0:
        # Pad width follows the input instead of a global setting.
        padding = np.full((pad_len, arr.shape[1]), np.nan, dtype=np.float32)
        arr = np.vstack((arr, padding))
    # Slicing is a no-op for short inputs and truncates long ones.
    return arr[:max_len, :]

In [9]:
# Load the pre-trained GoogleNews word2vec vectors into `word_vec`, preferring the
# uncompressed binary and falling back to the gzip archive.
w2v_path = './data/GoogleNews-vectors-negative300.bin'
if os.path.exists(w2v_path):
    word_vec = KeyedVectors.load_word2vec_format(fname=w2v_path, binary=True)
elif os.path.exists(w2v_path + '.gz'):
    # Context manager so the gzip handle is closed once loading finishes.
    with gzip.open(w2v_path + '.gz', 'rb') as google_w2v:
        word_vec = KeyedVectors.load_word2vec_format(fname=google_w2v, binary=True)
else:
    # Fail here instead of with a NameError on `word_vec` in a later cell.
    raise IOError('Embeddings not found: ' + w2v_path + ' (or .gz)')

In [10]:
# CountVectorizer configured to lowercase text and use NLTK's English stop-word list.
# It is not fitted in the visible cells; it only supplies the analyzer built in the next cell.
TokenVectorizer = CountVectorizer(stop_words=set(stopwords.words('english')), lowercase=True)

In [11]:
# Callable returned by CountVectorizer.build_analyzer(); applied to each raw comment further down.
tokenizer = TokenVectorizer.build_analyzer()

In [12]:
# Training data; later cells read its 'comment_text' column and six label columns.
train = pd.read_csv('./data/train.csv')

In [13]:
# Run the analyzer over every comment and store the result per row.
train['comment_tokens'] = train['comment_text'].apply(tokenizer)

In [14]:
# Embed each token list; get_wordvectors returns None when no token could be embedded.
train['comment_wordvec'] = train['comment_tokens'].apply(get_wordvectors) 

In [15]:
# Drop comments for which get_wordvectors returned None.
# .copy() makes `train` an independent frame, so adding columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
train = train.loc[train['comment_wordvec'].notnull(), :].copy()

In [16]:
# Number of embedded tokens (rows of the word-vector array) per comment.
train['wv_len'] = train['comment_wordvec'].map(len)

In [17]:
# Renumber the rows 0..n-1 so index labels match row positions after the filtering above.
train.index = np.arange(train.shape[0])

In [18]:
# Pre-allocate the (n_comments, maxlen, embedding_dim) float32 input tensor.
# np.empty leaves the memory uninitialised, so every row must be written before use.
# With the shape reported below, (95731, 100, 300), this is roughly 11.5 GB.
X_train = np.empty((train.shape[0], maxlen, hyperparam['embedding_dim']), dtype=np.float32)

In [19]:
# Fill X_train one comment at a time. enumerate() yields the row position directly,
# so the loop does not rely on train's index labels being 0..n-1, and it avoids the
# per-row Series that iterrows() constructs.
for i, wordvec in enumerate(train['comment_wordvec']):
    X_train[i, :, :] = pad_zero(wordvec)

In [20]:
# train['padded_wv'] = train['comment_wordvec'].apply(pad_zero)

In [21]:
# X_train  = np.stack(train['padded_wv'])

In [22]:
# Target matrix: the six label columns, in this fixed order.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_train = train[label_columns].values

In [23]:
# Sanity check: expect (n_comments, maxlen, embedding_dim).
X_train.shape

(95731, 100, 300)

In [36]:
def computation_graph():
    """Build the (uncompiled) baseline bidirectional-LSTM classifier.

    Reads the notebook-level ``maxlen`` and ``hyperparam`` settings.

    Returns
    -------
    A Keras ``Sequential`` model: input of shape
    ``(maxlen, hyperparam['embedding_dim'])`` -> Bidirectional LSTM with 100 units
    -> Dropout -> Dense layer with ``hyperparam['n_class']`` sigmoid outputs.
    """
    model = Sequential()
    # Sequence length comes from `maxlen` rather than a literal, so the model input
    # always matches the arrays produced by pad_zero.
    model.add(InputLayer((maxlen, hyperparam['embedding_dim'])))
    model.add(Bidirectional(LSTM(units=100)))
    model.add(Dropout(rate=hyperparam['dropout']))
    # One sigmoid unit per label.
    model.add(Dense(units=hyperparam['n_class'], activation='sigmoid'))
    return model

In [37]:
# Instantiate the network defined by computation_graph().
model = computation_graph()

In [38]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100, 300)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               320800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 1206      
Total params: 322,006
Trainable params: 322,006
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Binary cross-entropy is applied to the sigmoid outputs; Adam runs with a 5e-4 learning rate.
optimizer = Adam(lr=0.0005)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [40]:
# Train for at most 10 epochs with 10% of the samples held out for validation.
# EarlyStopping is used with its default settings apart from verbose output.
# NOTE(review): best weights are not restored here, so `model` presumably ends with
# the weights of the last epoch that ran -- confirm against the installed Keras version.
early_stopping = EarlyStopping(verbose=1)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=hyperparam['batch_size'],
    epochs=10,
    validation_split=0.1,
    shuffle=True,
    callbacks=[early_stopping],
)

Train on 86157 samples, validate on 9574 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 00008: early stopping


In [41]:
# Persist the trained model. The target directory is created first so that save()
# does not fail when ./models does not exist yet.
model_path = './models/Baseline_LSTM_es.h5'
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
model.save(model_path)