The novel part is the use of 7 300d embeddings together with pymagnitude, which calculate embeddings that are not in the dictionary with its nearest neighbours. 

The output can be found at https://www.kaggle.com/huikang/ndsc900

This tokeniser adapted from a kernel from the recent Quora competition with the following preface:

### Preface

Hello. This is basically cutting and pasting from the amazing kernels of this competition. Please notify me if I don't attribute something correctly.

* https://www.kaggle.com/gmhost/gru-capsule
* How to: Preprocessing when using embeddings
https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
* Improve your Score with some Text Preprocessing https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing
* Simple attention layer taken from https://github.com/mttk/rnn-classifier/blob/master/model.py
* https://www.kaggle.com/ziliwang/baseline-pytorch-bilstm
* https://www.kaggle.com/hengzheng/pytorch-starter

## IMPORTS 

In [1]:
import time
start_time_ = time.time()
import random
import pandas as pd
import numpy as np
import gc
import re
# import torch
# from torchtext import data
# import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter
# from textblob import TextBlob
from nltk import word_tokenize

from sklearn.metrics import f1_score
import os 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from unidecode import unidecode

Using TensorFlow backend.


### Basic Parameters

In [2]:
embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use
batch_size = 512 # how many samples to process at once
n_epochs = 5 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits

SEED = 1029

### Ensure determinism in the results

A common headache in this competition is the lack of determinism in the results due to cudnn. The following Kernel has a solution in Pytorch.

See https://www.kaggle.com/hengzheng/pytorch-starter. 

In [3]:
def seed_everything(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
seed_everything()

### Code for Loading Embeddings

Functions taken from the kernel:https://www.kaggle.com/gmhost/gru-capsule


In [4]:
from pymagnitude import Magnitude

vectors1 = Magnitude("../input/magnitudes/wiki-news-300d-1M.magnitude")
vectors2 = Magnitude("../input/magnitudes/GoogleNews-vectors-negative300.magnitude")
vectors3 = Magnitude("../input/magnitudes/cc.ms.300.magnitude")

# the other set of four embeddings:
# vectors1 = Magnitude("../MUSE/wiki.multi.en.magnitude")
# vectors2 = Magnitude("../MUSE/wiki.multi.id.magnitude")
# vectors3 = Magnitude("../input/magnitudes/glove.840B.300d.magnitude")
# vectors4 = Magnitude("../input/magnitudes/crawl-300d-2M.magnitude")

vectors = Magnitude(vectors1,vectors2,vectors3)

## FUNCTIONS TAKEN FROM https://www.kaggle.com/gmhost/gru-capsule

def load_pymag(word_index):
    
    emb_mean,emb_std = -0., 0.48782197
    embed_size = 900

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix_ = np.random.normal(emb_mean, emb_std, (nb_words + 1, embed_size))
    print(np.shape(embedding_matrix_))
    for word, i in tqdm(word_index.items()):
        if i >= max_features: continue
        embedding_matrix_[i] = vectors.query(word)
            
    return embedding_matrix_

## LOAD PROCESSED TRAINING DATA FROM DISK

In [None]:
df_train = pd.read_csv("../input/ndsc-beginner/train.csv")
df_test = pd.read_csv("../input/ndsc-beginner/test.csv")
df = pd.concat([df_train, df_test],sort=True)

In [None]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab
vocab = build_vocab(df['title'])

In [None]:
sin = len(df_train[df_train["Category"]==0])
insin = len(df_train[df_train["Category"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# Sincere questions: {:,}({:.2f}%) and # Insincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f} of train samples)".format(len(df_test),len(df_test)/len(df_train)))

## Normalization

Borrowed from:
* How to: Preprocessing when using embeddings
https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
* Improve your Score with some Text Preprocessing https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing

In [None]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def known_contractions(embed):
    known = []
    for contract in contraction_mapping:
        if contract in embed:
            known.append(contract)
    return known

def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, ' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print("Added {count} words to embedding")    

In [None]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, ' {punct} ')
    return x

def clean_numbers(x):
    x = re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", x).strip()
#     x = re.sub('[0-9]{5,}', '#####', x)
#     x = re.sub('[0-9]{4}', '####', x)
#     x = re.sub('[0-9]{3}', '###', x)
#     x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

Extra feature part taken from https://github.com/wongchunghang/toxic-comment-challenge-lstm/blob/master/toxic_comment_9872_model.ipynb

In [None]:
from sklearn.preprocessing import StandardScaler


def add_features(df):
    
    df['title'] = df['title'].progress_apply(lambda x:str(x))
    df['total_length'] = df['title'].progress_apply(len)
    df['capitals'] = df['title'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.progress_apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df.title.str.count('\S+')
    df['num_unique_words'] = df['title'].progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  

    return df

def load_and_prec():
    train_df = pd.read_csv("../input/ndsc-beginner/train.csv")
    test_df = pd.read_csv("../input/ndsc-beginner/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    # lower
    train_df["title"] = train_df["title"].progress_apply(lambda x: x.lower())
    test_df["title"] = test_df["title"].apply(lambda x: x.lower())

    # Clean the text
    train_df["title"] = train_df["title"].progress_apply(lambda x: clean_text(x))
    test_df["title"] = test_df["title"].apply(lambda x: clean_text(x))
    
    # Clean numbers
    train_df["title"] = train_df["title"].progress_apply(lambda x: clean_numbers(x))
    test_df["title"] = test_df["title"].apply(lambda x: clean_numbers(x))
    
    # Clean spellings
    train_df["title"] = train_df["title"].progress_apply(lambda x: replace_typical_misspell(x))
    test_df["title"] = test_df["title"].apply(lambda x: replace_typical_misspell(x))
    
    ## fill up the missing values
    train_X = train_df["title"].fillna("_##_").values
    test_X = test_df["title"].fillna("_##_").values


    
    ###################### Add Features ###############################
    #  https://github.com/wongchunghang/toxic-comment-challenge-lstm/blob/master/toxic_comment_9872_model.ipynb
    train = add_features(train_df)
    test = add_features(test_df)

    features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
    test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)
    ###########################################################################

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    special_request = train_X
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the Category values
    train_y = train_df['Category'].values
    
    return train_X, test_X, train_y, features, test_features, tokenizer.word_index, special_request

In [None]:
x_train, x_test, y_train, features, test_features, word_index, special_request = load_and_prec() 

### SAVE DATASET TO DISK

In [None]:
!mkdir ndsc-loaded

In [None]:
np.save("ndsc-loaded/special_request.npy",special_request)
np.save("ndsc-loaded/x_train.npy",x_train)
np.save("ndsc-loaded/x_test.npy",x_test)
np.save("ndsc-loaded/y_train.npy",y_train)

np.save("ndsc-loaded/features.npy",features)
np.save("ndsc-loaded/test_features.npy",test_features)
np.save("ndsc-loaded/word_index.npy",word_index)

### LOAD DATASET FROM DISK

In [5]:
x_train = np.load("ndsc-loaded/x_train.npy")
x_test = np.load("ndsc-loaded/x_test.npy")
y_train = np.load("ndsc-loaded/y_train.npy")
features = np.load("ndsc-loaded/features.npy")
test_features = np.load("ndsc-loaded/test_features.npy")
word_index = np.load("ndsc-loaded/word_index.npy").item()

In [None]:
word_index

### Load Embeddings

Two embedding matrices have been used. Glove, and paragram. The mean of the two is used as the final embedding matrix

In [7]:
# missing entries in the embedding are set using np.random.normal so we have to seed here too
# seed_everything()

# glove_embeddings = load_glove(word_index)
paragram_embeddings = load_pymag(word_index)

# embedding_matrix = np.mean([glove_embeddings, paragram_embeddings], axis=0)
embedding_matrix = paragram_embeddings

# vocab = build_vocab(df['title'])
# add_lower(embedding_matrix, vocab)
# del glove_embeddings, paragram_embeddings
del paragram_embeddings
gc.collect()

np.shape(embedding_matrix)

(58869, 900)


HBox(children=(IntProgress(value=0, max=58868), HTML(value='')))




(58869, 900)

In [8]:
np.save("ndsc-loaded/embedding_matrix.npy",embedding_matrix)

In [4]:
embedding_matrix = np.load("ndsc-loaded/embedding_matrix.npy")

# Viewing the data

In [6]:
print(np.shape(x_train))
x_train[:2] 

(666615, 100)


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,  176, 8498, 5533, 1056,   83,
         330],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [7]:
print(np.shape(y_train))
y_train[:2] 

(666615,)


array([0, 1])

In [8]:
print(np.shape(x_test))
x_test[:2]

(172402, 100)


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 2420,   54,   82,    6,   22,   43,  110,  357,
          68],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [9]:
print(np.shape(features))
features[:2]

(666615, 2)


array([[0.        , 0.24602183],
       [0.        , 0.24602183]])

In [10]:
print(np.shape(test_features))
test_features[:2]

(172402, 2)


array([[0.        , 0.24602183],
       [0.        , 0.24602183]])

In [11]:
print(type(word_index))
print(len(word_index))
print(list(word_index.items())[0])
print({k:v for k, v in word_index.items() if v == 0})
print({k:v for k, v in word_index.items() if v == -1})
print({k:v for k, v in word_index.items() if v == 1})
print({k:v for k, v in word_index.items() if v == 48861})
print({k:v for k, v in word_index.items() if v == 48862})
print(word_index["a"])
print(word_index["pikachu"])
print(word_index["1"])
# print(word_index["uberxxxLLL"])

<class 'dict'>
58868
('og', 11227)
{}
{}
{'punct': 1}
{'posti': 48861}
{'jumbobahan': 48862}
33
3364
11


In [12]:
print(np.shape(embedding_matrix))
print(len(word_index)) # i don't understand why different
embedding_matrix[:]

(58869, 900)
58868


array([[-4.86008615e-02, -2.47209959e-01,  4.25214564e-01, ...,
         4.09686211e-02, -2.67227033e-01,  7.13360913e-02],
       [-4.70000014e-05, -4.31959983e-03, -6.47940021e-03, ...,
         3.70776989e-02,  3.47048007e-02,  3.50014009e-02],
       [ 3.61792614e-02,  3.10000380e-02, -3.05049249e-02, ...,
        -3.33242007e-02,  4.21927013e-02, -6.58420995e-02],
       ...,
       [-7.39496072e-02, -2.07259700e-03,  1.51993219e-02, ...,
         4.11464438e-03, -6.94485750e-02,  7.99540208e-02],
       [-1.10526501e-02, -3.86570152e-02,  1.71388346e-02, ...,
         5.62560981e-02, -8.74006260e-02, -1.46982725e-02],
       [ 2.53226103e-02,  2.22280888e-02, -8.11862332e-02, ...,
        -7.55290388e-03,  3.23058905e-02,  6.12565262e-03]])