In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


In [None]:
# Getting Started

You'll need the following: 

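1. The fastText embedding file `crawl-300d-2M.vec`. Download it [here](https://fasttext.cc/docs/en/english-vectors.html).
2. Kaggle's toxic comment data (`train.csv`). Download it [here](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge).
3. Update the file paths in the cells below (`EMBEDDING_FILE`, the `train.csv` path, and the model save paths) so they point to the correct locations on your machine.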

In [2]:
# Locations of the external inputs -- replace the placeholders before running.
EMBEDDING_FILE = '/PATH/TO/crawl-300d-2M.vec'
TRAIN_FILE = '/PATH/TO/train.csv'

# Load the training CSV (comment text plus label columns) into a DataFrame.
train = pd.read_csv(TRAIN_FILE)

In [3]:
# Target columns, in the order they appear in the label matrix.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment text; missing comments are replaced by the literal string "fillna".
xtrain = train["comment_text"].fillna("fillna")
# Label matrix: one row per comment, one column per entry of label_columns.
ytrain = train[label_columns].values

In [4]:
# Sizes shared by the tokenizer, the embedding matrix and the model.
embed_size = 300      # width of each word vector (columns of the embedding matrix)
maxlen = 100          # number of word ids per comment after padding
max_features = 30000  # vocabulary cap: Tokenizer num_words and Embedding input size

In [5]:
# Take the raw text straight from `train` instead of from `xtrain`: this cell
# rebinds `xtrain` to the padded id matrix, so reading `xtrain` here would make
# a second run of the cell fit the tokenizer on integers instead of text.
comment_texts = train["comment_text"].fillna("fillna").tolist()

# Learn the vocabulary from the comments; num_words caps the word ids that
# texts_to_sequences will emit.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(comment_texts)

# Turn every comment into a list of integer word ids.
comment_sequences = tokenizer.texts_to_sequences(comment_texts)

# Bring every sequence to exactly `maxlen` ids (Keras pads and truncates at the
# front by default), giving a 2-D integer array.
xtrain = sequence.pad_sequences(comment_sequences, maxlen=maxlen)

In [6]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 numpy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [7]:
# Build a word -> vector lookup from the embedding file.
# The file is opened in a `with` block so the handle is closed afterwards, and
# it is decoded explicitly as UTF-8 rather than with the platform default.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        # Splitting from the right at most embed_size times keeps a token that
        # itself contains spaces in one piece.
        word, vector = get_coefs(*line.rstrip().rsplit(' ', embed_size))
        # Skip rows that do not carry exactly embed_size values, e.g. the
        # "<vocab size> <dim>" header at the top of fastText .vec files. A
        # one-element vector would otherwise be broadcast over an entire row
        # of the embedding matrix below.
        if vector.shape[0] != embed_size:
            continue
        embeddings_index[word] = vector

word_index = tokenizer.word_index
# Left as originally defined for any code that reads it; the matrix below is
# sized by max_features instead.
nb_words = min(max_features, len(word_index))

# Matrix of pretrained vectors, one row per word id. It always has
# max_features rows so that (a) it matches Embedding(max_features, ...) in
# get_model and (b) a vocabulary smaller than max_features cannot index past
# the last row (Keras word ids start at 1, so the largest id equals
# len(word_index)). Words absent from the embedding file keep an all-zero row.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [8]:
def get_model():
    """Build and compile the pooled bidirectional-GRU classifier.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``. The network maps a sequence of ``maxlen`` word ids
    through an embedding layer initialised from ``embedding_matrix``, spatial
    dropout and a bidirectional GRU, then concatenates global average and
    global max pooling of the GRU outputs and feeds them to a 6-unit sigmoid
    layer (one unit per label column).

    Returns:
        The Keras ``Model``, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    tokens = Input(shape=(maxlen, ))

    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    # return_sequences=True keeps the per-timestep outputs that the pooling
    # layers below reduce over.
    encoded = Bidirectional(GRU(80, return_sequences=True))(embedded)

    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probabilities = Dense(6, activation="sigmoid")(pooled)

    classifier = Model(inputs=tokens, outputs=probabilities)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [9]:
# Instantiate the compiled network defined by get_model().
model = get_model()

In [10]:
# Training hyper-parameters passed to model.fit below.
epochs = 2        # full passes over the training split
batch_size = 32   # samples per gradient update

In [11]:
# Keep 95% of the rows for training and use the remainder for validation;
# the fixed random_state makes the split repeatable.
x_tra, x_val, y_tra, y_val = train_test_split(
    xtrain,
    ytrain,
    train_size=0.95,
    random_state=233,
)



In [12]:
# Train on the training split and evaluate on the validation split after every
# epoch; the returned History object is kept in `hist`.
hist = model.fit(
    x_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    verbose=2,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 - 1037s - loss: 0.0500 - acc: 0.9820 - val_loss: 0.0455 - val_acc: 0.9825
Epoch 2/2
 - 1053s - loss: 0.0380 - acc: 0.9853 - val_loss: 0.0445 - val_acc: 0.9829


In [13]:
# Write only the trained weights to disk -- replace the placeholder path first.
weights_path = '/PATH/TO/SAVE/toxic_comment_weights.h5'
model.save_weights(weights_path)

In [14]:
# Write the model itself to disk via Model.save -- replace the placeholder path first.
model_path = '/PATH/TO/SAVE/model.h5'
model.save(model_path)