# Toxic Comments Classification

## General Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=FutureWarning)
    import h5py
warnings.filterwarnings('ignore')

import gensim

from nltk import pos_tag, word_tokenize

from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras import optimizers
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras_tqdm import TQDMNotebookCallback

from nltk import WordNetLemmatizer



import os
# Sets the OMP_NUM_THREADS environment variable to '4' for this process.
# NOTE(review): this assignment runs after numpy/gensim/keras were imported
# above; libraries that read the variable when they are loaded may not pick
# it up -- confirm it has the intended effect, or move it before those imports.
os.environ['OMP_NUM_THREADS'] = '4'

# Notebook-wide settings shared by the cells below.
embedding_dimension = 0 # placeholder; the embedding-loading cell overwrites it per embedding type
max_features = 30000    # passed to the Tokenizer as num_words (vocabulary size cap)
maxlen = 100             # length every token sequence is padded/truncated to


Using TensorFlow backend.


## Load Word2Vec/GloVe/fastText Word Embeddings

In [2]:
## Load pre-trained word vectors (GloVe / Word2Vec / fastText)
# Pick one of: "GLOVE", "WORD2VEC", "FAST_TEXT1", "FAST_TEXT2".
embeddings_type = "FAST_TEXT1"

embeddings_index = {}


def get_coefs(word, *arr):
    """Convert the fields of one vector-file line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')


def load_text_embeddings(path, dimension):
    """Read a text embedding file into a ``{word: vector}`` dict.

    Each line is expected to hold a token followed by its space-separated
    coefficients.

    Parameters
    ----------
    path : str
        Location of the text vector file.
    dimension : int
        Number of coefficients a valid line must carry.

    Returns
    -------
    dict
        Maps each token to a 1-D float32 array of length ``dimension``.
        If a token occurs on several lines, the last one wins.

    Notes
    -----
    Lines whose coefficient count differs from ``dimension`` are skipped.
    This drops the "<count> <dim>" header that fastText ``.vec`` files start
    with (and blank lines), which would otherwise enter the index as a
    wrongly shaped vector and break any later ``np.stack`` over the values.
    """
    index = {}
    # `with` guarantees the handle is closed, also when a line fails to parse.
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            word, vector = get_coefs(*line.rstrip().split(' '))
            if vector.shape[0] != dimension:
                continue
            index[word] = vector
    return index


if embeddings_type == "GLOVE":
    embedding_dimension = 200
    glove_data = '../../data/glove.6B/glove.6B.200d.txt'
    embeddings_index = load_text_embeddings(glove_data, embedding_dimension)
elif embeddings_type == "WORD2VEC":
    embedding_dimension = 300
    google_word2vec = '../../data/GoogleNews-vectors-negative300.bin.gz'
    vocab_model = gensim.models.KeyedVectors.load_word2vec_format(google_word2vec, binary=True)
    # The vector table lives directly on the KeyedVectors object
    # (the previous `vocab_model.self.vectors` raised AttributeError).
    word2vec_vectors = vocab_model.vectors
    # NOTE(review): `.vocab[...]` / `.index` is the accessor this notebook was
    # written against; confirm it matches the installed gensim version.
    embeddings_index = {word: word2vec_vectors[entry.index]
                        for word, entry in vocab_model.vocab.items()}
elif embeddings_type == "FAST_TEXT1":
    embedding_dimension = 300
    fasttext_path = '../../data/crawl-300d-2M.vec'
    embeddings_index = load_text_embeddings(fasttext_path, embedding_dimension)
elif embeddings_type == "FAST_TEXT2":
    embedding_dimension = 300
    fasttext_path = '../../data/wiki.en.vec'
    embeddings_index = load_text_embeddings(fasttext_path, embedding_dimension)
else:
    # Fail here rather than continuing with an empty index and a zero dimension.
    raise ValueError("Unknown embeddings_type %r; expected one of "
                     "GLOVE, WORD2VEC, FAST_TEXT1, FAST_TEXT2" % (embeddings_type,))

print('Loaded %s word vectors using %s.' % (len(embeddings_index), embeddings_type))

Loaded 2000000 word vectors using FAST_TEXT1.


## Generate Train/Test Sequences

In [3]:
# Fixed seed for the shuffle below, so the row order is identical on every
# Restart & Run All. The training cell passes validation_split to fit(), so a
# stable order also keeps the held-out rows stable between runs.
SHUFFLE_SEED = 42

train = pd.read_csv("input/train_preprocessed_all.csv")
test = pd.read_csv("input/test_preprocessed_all.csv")
# Shuffle all training rows (frac=1 keeps every row, in a new order).
train = train.sample(frac=1, random_state=SHUFFLE_SEED)

# Missing comments are replaced by the literal placeholder token "fillna".
X_train_sentences = train["comment_text"].fillna("fillna").values
X_test_sentences = test["comment_text"].fillna("fillna").values


# One binary label column per class (multi-label target).
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[list_classes].values


# The vocabulary is fitted on the training comments only and then applied
# unchanged to the test comments.
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(X_train_sentences))

list_tokenized_train = tokenizer.texts_to_sequences(X_train_sentences)
list_tokenized_test = tokenizer.texts_to_sequences(X_test_sentences)

# Pad/truncate every sequence to exactly `maxlen` token ids.
X_train_sequences = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test_sequences = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

## Create the Embedding Matrix

In [4]:
word_index = tokenizer.word_index

## Generate embeddings matrix
# np.stack requires a real sequence of arrays; a dict view is not one and is
# rejected by newer NumPy releases, so materialise the values as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
print('mean: {}, std: {}'.format(str(emb_mean), str(emb_std)))

# Alternative initialisation: all-zero rows for words without a vector.
# embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))

# Initialise every row from a normal distribution with the mean and std of all
# pre-trained coefficients; rows of words that have a pre-trained vector are
# overwritten in the loop below, the remaining rows keep this random draw.
# Row 0 is never assigned by the loop (word_index ids start at 1).
# NOTE(review): the matrix has one row per word in word_index, although the
# tokenizer was created with num_words=max_features -- confirm whether rows
# beyond max_features are ever looked up; if not, the matrix could be capped.
embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index) + 1, embedding_dimension))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Word has a pre-trained vector: copy it into the row for its token id.
        embedding_matrix[i] = embedding_vector[:embedding_dimension]

print(str(embedding_matrix.shape))

# Frozen lookup layer: the pre-trained weights are not updated during training.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)


mean: 0.0055286596, std: 0.34703913
(166930, 300)


## Create the Model and Train

In [5]:

def get_model():
    """Build and compile the stacked LSTM multi-label classifier.

    Architecture, in order: the shared ``embedding_layer``; three stages of
    (bidirectional LSTM returning full sequences -> Dense/ReLU); a final
    unidirectional LSTM that returns only its last output; Dense/ReLU;
    Dropout; and a 6-unit sigmoid output layer.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, Adagrad (lr=0.01) and
        the accuracy metric.
    """
    drop_prob = 0.2
    units = 100

    token_ids = Input(shape=(maxlen, ))
    hidden = embedding_layer(token_ids)

    # Three identical sequence-to-sequence stages.
    for _ in range(3):
        recurrent = LSTM(units, return_sequences=True, dropout=drop_prob)
        hidden = Bidirectional(recurrent)(hidden)
        hidden = Dense(units, activation="relu")(hidden)

    # Collapse the sequence to a single vector, then classify.
    hidden = LSTM(units, dropout=drop_prob)(hidden)
    hidden = Dense(units, activation="relu")(hidden)
    hidden = Dropout(drop_prob)(hidden)
    scores = Dense(6, activation="sigmoid")(hidden)

    classifier = Model(inputs=token_ids, outputs=scores)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=optimizers.Adagrad(lr=0.01),
        metrics=['accuracy'],
    )
    return classifier


model = get_model()
batch_size = 32
epochs = 10


# Checkpoint file that always holds the epoch with the lowest validation loss.
file_path="weights_base.best.hdf5"

checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Stop once val_loss has failed to improve for 2 consecutive epochs.
early = EarlyStopping(monitor="val_loss", mode="min", patience=2)
tensorboard = TensorBoard(log_dir='logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)


# verbose=0 silences Keras' own progress output; TQDMNotebookCallback is
# passed as a callback instead.
callbacks_list = [checkpoint, early, TQDMNotebookCallback(), tensorboard]
model.fit(X_train_sequences, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list, verbose=0)

# After fit() the model holds the weights of the LAST epoch, which is not
# necessarily the best one (training can end on epochs that did not improve
# val_loss). Restore the best checkpoint before predicting.
model.load_weights(file_path)

y_test = model.predict(X_test_sequences)


# Fill the per-class probability columns of the submission template.
sample_submission = pd.read_csv("input/sample_submission.csv")

sample_submission[list_classes] = y_test

sample_submission.to_csv("baseline.csv", index=False)


Epoch 00001: val_loss improved from inf to 0.05016, saving model to weights_base.best.hdf5



Epoch 00002: val_loss improved from 0.05016 to 0.04865, saving model to weights_base.best.hdf5



Epoch 00003: val_loss improved from 0.04865 to 0.04814, saving model to weights_base.best.hdf5



Epoch 00004: val_loss improved from 0.04814 to 0.04725, saving model to weights_base.best.hdf5



Epoch 00005: val_loss improved from 0.04725 to 0.04722, saving model to weights_base.best.hdf5



Epoch 00006: val_loss improved from 0.04722 to 0.04698, saving model to weights_base.best.hdf5



Epoch 00007: val_loss improved from 0.04698 to 0.04650, saving model to weights_base.best.hdf5



Epoch 00008: val_loss did not improve



Epoch 00009: val_loss improved from 0.04650 to 0.04601, saving model to weights_base.best.hdf5



Epoch 00010: val_loss did not improve

