# Toxic Comments Classification

## General Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np

import gensim

from nltk import pos_tag, word_tokenize

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback

from nltk import WordNetLemmatizer

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load Word2Vec/Glove/FastText Words Embeddings

In [4]:
embedding_dimension = 0 # set this in each method
max_features = 30000    # number of unique tokens that the tokenizer save
maxlen = 25             # maximum sequence length

## Loading Wprd2Vec Data
embeddings_type = "FAST_TEXT"

embeddings_index = {}

if embeddings_type == "GLOVE":
    embedding_dimension = 200
    glove_data = '../../data/glove.6B/glove.6B.200d.txt'
    f = open(glove_data)
    for line in f:
        values = line.split()
        word = values[0]
        value = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = value
    f.close()
elif embeddings_type == "WORD2VEC":
    embedding_dimension = 300
    google_word2vec = '../../data/GoogleNews-vectors-negative300.bin.gz'
    vocab_model = gensim.models.KeyedVectors.load_word2vec_format(google_word2vec, binary=True)
    embedding_matrix = vocab_model.self.vectors
    vocab_dict = {word: embedding_matrix[vocab_model.vocab[word].index] for word in vocab_model.vocab.keys()}
    embeddings_index = vocab_dict
elif embeddings_type == "FAST_TEXT":
    embedding_dimension = 300
    file = '../../data/crawl-300d-2M.vec'
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(file))

print('Loaded %s word vectors using %s.' % (len(embeddings_index), embeddings_type)

Loaded 2000000 word vectors.


## Lemmatization helper method

In [5]:
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            yield wnl.lemmatize(word, pos='r')
            
        else:
            yield word

## Generate Train/Test Sequences

In [None]:
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")
train = train.sample(frac=1)

X_train_sentences = train["comment_text"].fillna("fillna").values
X_test_sentences = test["comment_text"].fillna("fillna").values

print("!!!!!!!!!!!!!!!!")
print(X_train_sentences[0])
print("???????????????")

X_train_sentences = np.asarray([" ".join(lemmatize_all(x)) for x in X_train_sentences])
X_test_sentences = np.asarray([" ".join(lemmatize_all(x)) for x in X_test_sentences])
print(X_train_sentences[0])
print("@@@@@@@@@@@@@@@")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[list_classes].values


tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(X_train_sentences))

list_tokenized_train = tokenizer.texts_to_sequences(X_train_sentences)
list_tokenized_test = tokenizer.texts_to_sequences(X_test_sentences)

X_train_sequences = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test_sequences = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

!!!!!!!!!!!!!!!!
De jure, there's no legitimate government. After abu Mazen dismissed the Hamas government, he was the legitimate government, but only for the short period allowed by the PA constitution after which new elections were necessary. He ignored that provision. Both governments are illegitimate, although the Hamas government in Gaza was at least elected, while Abbas is basically a Washington/Tel Aviv appointee, FWIW.
???????????????


# Create embeddings matrix

In [16]:
word_index = tokenizer.word_index

## Generate embeddings matrix
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector[:embedding_dimension]

print(str(embedding_matrix.shape))

embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)


(210338, 300)


## Create the Model and Train

In [17]:

def get_model():
    inp = Input(shape=(maxlen, ))
    x = embedding_layer(inp)X_te
    x = Dropout(0.05)(x)
    x = Bidirectional(LSTM(100, return_sequences=True))(x)
    x = Dropout(0.05)(x)
    x = Bidirectional(LSTM(100, return_sequences=True))(x)
    x = Dropout(0.05)(x)
    x = Bidirectional(LSTM(100))(x)
#     x = Dense(100, activation="tanh")(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    optimizer = optimizers.Adagrad(lr=0.01, clipvalue=0.5)
    model.compile(loss='binary_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    return model


model = get_model()
batch_size = 32
epochs = 2


file_path="weights_base.best.hdf5"

checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=2)
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)


callbacks_list = [checkpoint, early, TQDMNotebookCallback()]
model.fit(X_train_sequences, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list, verbose=0)

model.load_weights(file_path)

y_test = model.predict(X_test_sequences)



sample_submission = pd.read_csv("input/sample_submission.csv")

sample_submission[list_classes] = y_test



sample_submission.to_csv("baseline.csv", index=False)


Epoch 00001: val_loss improved from inf to 0.05446, saving model to weights_base.best.hdf5



Epoch 00002: val_loss improved from 0.05446 to 0.05223, saving model to weights_base.best.hdf5

