In [None]:
from tqdm import tqdm 
import re
import gc
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences, sequence
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate, Lambda, RepeatVector, Permute, multiply
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam, SGD
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import concatenate
from sklearn.model_selection import train_test_split, cross_val_score, KFold

#import os
#print(os.listdir("../input/fasttext-crawl-300d-2m"))


Constants

In [None]:
# Locations of the two pretrained word-vector text files read by build_matrix().
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'
# Width of the combined embedding matrix: two 300-column matrices concatenated side by side.
EMBED_SIZE = 600
# Passed to the Keras Tokenizer as num_words.
MAX_FEATURES = 100000
# maxlen used when padding the token sequences; also the length of the model's input.
MAX_LEN = 256

In [None]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in every entry with a space.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Parameters
    ----------
    data : pandas.Series
        Raw texts. Every entry is cast to ``str`` before cleaning, so
        non-string values are cleaned as their string representation.

    Returns
    -------
    pandas.Series
        Series with the same index as ``data``. Each listed character is
        replaced by exactly one space, so string lengths are unchanged.
    '''
    # Same character set as before. The backslash in front of the
    # multiplication sign is now spelled as an explicit escaped backslash,
    # because a backslash followed by that sign is not a valid escape sequence.
    punct = "/-'?!.,#$%'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # A single translation table cleans each string in one pass instead of
    # running one str.replace per punctuation character.
    to_space = str.maketrans({symbol: ' ' for symbol in punct})

    return data.astype(str).apply(lambda text: text.translate(to_space))

Preprocessing the training data and testing data

In [None]:
# Read the competition's training and test CSV files.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Clean the comment text and binarize the target: 1 where target >= 0.5, otherwise 0.
x_train = preprocess(train['comment_text'])
y_train = np.where(train['target'] >= 0.5, 1, 0)
x_test = preprocess(test['comment_text'])
print(x_test.shape)

Tokenize the testing and training data

In [None]:
# The vocabulary is fitted on the training text only; the test text is encoded with the same tokenizer.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(x_train))
# NOTE: x_train / x_test are rebound from text to integer sequences here, so re-running this
# cell without first re-running the preprocessing cell would hand sequences, not text, to the tokenizer.
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Pad the sequences with maxlen=MAX_LEN so both arrays share the model's input length
x_train = pad_sequences(x_train, maxlen=MAX_LEN)
x_test = pad_sequences(x_test, maxlen=MAX_LEN)

Logic to build the embedding matrix

Source: https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version

Modifications: Does not keep track of the unknown words

In [None]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` holds ``arr`` converted to a float32 array."""
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(path):
    """
    Read a word-vector text file into a ``{token: vector}`` dictionary.

    Every line is split on single spaces; the first field is the token and
    the remaining fields are converted to a float32 array by ``get_coefs``.
    If a token occurs more than once, the last occurrence wins.

    Parameters
    ----------
    path : str
        Path of the embedding text file.

    Returns
    -------
    dict
        Maps each token to its 1-D float32 vector.
    """
    embedding_index = {}
    # Decode as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(tqdm(f)):
            fields = line.strip().split(' ')
            # word2vec-style text files (such as fastText ``.vec``) begin with a
            # "<vocabulary size> <dimension>" header. Storing it would create a
            # bogus one-element "vector" keyed by the vocabulary size, so skip it.
            is_header = (
                line_number == 0
                and len(fields) == 2
                and all(field.isdigit() for field in fields)
            )
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            embedding_index[word] = vector
    return embedding_index

def build_matrix(word_index, path):
    """
    Build an embedding matrix whose rows follow ``word_index``.

    Parameters
    ----------
    word_index : dict
        Maps each token to its integer row index.
    path : str
        Embedding text file, loaded with ``load_embeddings``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, 300)``. Rows of tokens that are
        missing from the embedding file stay all-zero.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, 300))

    for token, row in word_index.items():
        # Tokens without a pretrained vector keep their zero row.
        if token in vectors:
            matrix[row] = vectors[token]

    return matrix

# One (len(word_index) + 1, 300) matrix per pretrained source, indexed by the tokenizer's word_index.
glove_matrix = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
crawl_matrix = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)

# Join the two matrices along the last axis: crawl columns first, then GloVe columns.
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)

# The separate matrices are not referenced again after the concatenation; drop them and collect.
del glove_matrix
del crawl_matrix
gc.collect()


LSTM and MLP Definition

Attention code source: https://github.com/keras-team/keras/issues/4962

In [None]:
# Number of units in the recurrent layer; also the repeat count that lets the
# attention weights line up with the recurrent output for the element-wise product.
units = 64
def LSTM(embedding_matrix):
    """
    Build and compile the attention-weighted LSTM binary classifier.

    Architecture, in order:
      1. Input of ``MAX_LEN`` token ids.
      2. Frozen Embedding layer initialised from ``embedding_matrix``.
      3. SpatialDropout1D (rate 0.3).
      4. CuDNNLSTM with ``units`` units returning the full sequence.
      5. Attention: a tanh Dense(1) score per timestep, flattened, passed
         through a softmax, then repeated ``units`` times and permuted so it
         can be multiplied element-wise with the LSTM output.
      6. Global average pooling and global max pooling of the weighted
         sequence, concatenated.
      7. Dense 256 -> 128 -> 16 (ReLU) followed by a single sigmoid output.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D array of shape (vocabulary rows, embedding width). Both Embedding
        dimensions are taken from this array, so the layer always matches the
        weights it is given.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, Adam (lr=1e-3, decay=0)
        and the accuracy metric.
    """
    # Read both dimensions from the matrix itself rather than from a global
    # constant, so the layer cannot disagree with the supplied weights.
    vocab_size, embed_dim = embedding_matrix.shape

    inp = Input(shape=(MAX_LEN,))
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix], trainable=False)(inp)

    # Spatial Dropout layer
    x = SpatialDropout1D(0.3)(x)

    # LSTM layer (return_sequences=True keeps one output per timestep for the attention block)
    x = CuDNNLSTM(units, return_sequences=True)(x)

    # Attention layer: one score per timestep, normalised with a softmax
    attention = Dense(1, activation='tanh')(x)
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    # Repeat and permute the weights so they match the LSTM output for the product below
    attention = RepeatVector(units)(attention)
    attention = Permute([2, 1])(attention)

    merged = multiply([x, attention])

    # Pooling layers
    average_pool = GlobalAveragePooling1D()(merged)
    max_pool = GlobalMaxPooling1D()(merged)

    # Combine both pooled outputs as the input of the MLP
    concat_avg_max = concatenate([average_pool, max_pool])

    # MLP definition
    concat_avg_max = Dense(256, activation='relu')(concat_avg_max)
    concat_avg_max = Dense(128, activation='relu')(concat_avg_max)
    concat_avg_max = Dense(16, activation='relu')(concat_avg_max)
    output = Dense(1, activation='sigmoid')(concat_avg_max)

    # Assemble and compile the model
    model = Model(inputs=inp, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3, decay=0), metrics=['accuracy'])

    return model

Training and validation using K-fold cross-validation

In [None]:
# Cross-validation settings: number of splits and the epochs passed to fit() for each split.
num_folds = 3
num_epochs = 4
# Shuffled K-fold splitter with a fixed random_state.
folds = KFold(n_splits=num_folds, shuffle=True, random_state=53)
# Instantiate the compiled network from the LSTM() builder with the combined embedding matrix.
model = LSTM(embedding_matrix)
def train(x_train, y_train, x_test):
    """
    Train one model per K-fold split and average their test predictions.

    For every split produced by the module-level ``folds`` splitter, a fresh
    network is built with ``LSTM(embedding_matrix)``, fitted for
    ``num_epochs`` epochs on the training part of the split (the held-out
    part is passed as validation data), and then used to predict ``x_test``.

    A new model is built for each fold on purpose: reusing a single model
    across folds would let it train on rows that a later fold uses for
    validation, which makes the reported validation scores optimistic.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training inputs; must support indexing with integer index arrays.
    y_train : numpy.ndarray
        Labels aligned with ``x_train``.
    x_test : numpy.ndarray
        Inputs to predict.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(x_test), 1)`` holding the mean of the per-fold
        predictions.
    """
    prediction = np.zeros((len(x_test), 1))
    n_folds = folds.get_n_splits()

    # KFold validation
    for train_index, valid_index in folds.split(x_train, y_train):
        # Fresh weights and optimizer state for every fold.
        fold_model = LSTM(embedding_matrix)

        fold_model.fit(
            x_train[train_index],
            y_train[train_index],
            batch_size=512,
            epochs=num_epochs,
            validation_data=(x_train[valid_index], y_train[valid_index]),
        )

        prediction += fold_model.predict(x_test, batch_size=512, verbose=1)

    # Average over the number of folds. The last zero-based loop index is one
    # less than that count and must not be used as the divisor.
    prediction /= n_folds

    return prediction
# Run the K-fold training loop; `prediction` holds the combined test-set predictions returned by train().
prediction = train(x_train, y_train, x_test)

In [None]:
# Load the sample submission file and reuse its layout for the output.
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv')
# Fill the 'prediction' column with the values computed above and write the CSV without the index column.
submission['prediction'] = prediction
submission.to_csv('submission.csv', index=False)