**Word2Vec**  
The main idea behind it is that you train a model on the context of each word, so that similar words end up with similar numerical representations.  
**GLOVE**  
GLOVE works similarly to Word2Vec. Whereas Word2Vec is a "predictive" model that predicts the context given a word, GLOVE learns by constructing a co-occurrence matrix (words × contexts) that counts how frequently each word appears in each context. Since this matrix is gigantic, it is factorized to obtain a lower-dimensional representation.   
**FastText**  
FastText is quite different from the two embeddings above. While Word2Vec and GLOVE treat each word as the smallest unit to train on, FastText uses character n-grams as the smallest unit. For example, the word vector for "apple" could be broken down into separate vectors for units such as "ap", "app", "ple". The biggest benefit of using FastText is that it generates better word embeddings for rare words, or even for words not seen during training, because the character n-gram vectors are shared with other words.   

- https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
- https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge
- https://www.kaggle.com/shujian/mix-of-nn-models-based-on-meta-embedding

In [None]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, Conv2D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D, concatenate, BatchNormalization
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
from keras.callbacks import *

from gensim.models import KeyedVectors
import gc
import nltk
from nltk import word_tokenize
import string
import pickle

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import re
tqdm.pandas()

In [None]:
## global configuration shared by the tokenizer, the embedding loaders and the models
embed_size = 300 # dimensionality of each word vector
max_features = 95000 # vocabulary cap: how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 72 # max number of words in a question to use; sequences are padded to this length
num_ext_features = 12 # how many engineered features to use; should match len(new_cols), which feeds the auxiliary input

In [None]:
# read the data
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
# print the (rows, columns) of each frame as a quick sanity check
for caption, loaded in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(caption, loaded.shape)

**Add new features**

In [None]:
# count the number of question marks
def count_question_mark(x):
    """Return how many items of ``x`` (characters, for a string) equal '?'."""
    return sum(1 for character in x if character == '?')

# count the number of words
def count_token(x):
    """Return the number of whitespace-separated tokens in ``x``."""
    tokens = x.split()
    return len(tokens)

# count the number of question leading words
def count_wh_word(x):
    """Return how many whitespace-separated tokens of ``x`` are question-leading words.

    Matching is exact and case-sensitive, so only capitalised forms such as
    "What" or "Can" are counted; a token with attached punctuation ("Why?")
    does not match.
    """
    leading_words = frozenset((
        'Who', 'Whose', 'What', 'Which', 'Why', 'When', 'How', 'Whom',
        "Who's", "What's", 'Can', 'Do', 'Does', 'Should', 'Would', 'Could', 'Will',
    ))
    return sum(1 for token in x.split() if token in leading_words)

# count the number of unique words
def count_unique_word(x):
    """Return the number of distinct whitespace-separated tokens in ``str(x)``."""
    distinct_tokens = {*str(x).split()}
    return len(distinct_tokens)

# count the number of characters
def count_character(x):
    """Return the length of ``str(x)``, whitespace included."""
    as_text = str(x)
    return len(as_text)

stopwords = list(STOP_WORDS)
# count the number of stopwords
def count_stop_word(x):
    """Return how many lower-cased, whitespace-separated tokens of ``x`` are in ``stopwords``."""
    return sum(1 for token in x.lower().split() if token in stopwords)

# count the number of punctuations
def count_punc(x):
    """Return how many characters of ``str(x)`` belong to ``string.punctuation``."""
    return sum(1 for c in str(x) if c in string.punctuation)

# average length of the word
def count_avg_length(x):
    """Return the mean length of the whitespace-separated tokens of ``str(x)``.

    The result is whatever ``np.mean`` yields for the list of token lengths.
    """
    token_lengths = list(map(len, str(x).split()))
    return np.mean(token_lengths)

# count the number of top 200 insincere_words (unigram)
top_200_insincere_words = ['liberals', 'jews', 'racist', 'christians', 'hindus', 'democrats', 'stupid', 'hillary', 'realize',
                           'rape', 'atheists', 'supporters', 'clinton', 'conservatives', 'liberal', 'jewish', 'immigrants', 
                           'europeans', 'republicans', 'majority', 'they', 'penis', 'guns', 'killing', 'feminists', 'blacks', 
                           'white', 'asians', 'terrorists', 'ugly', 'pakistani', 'terrorist', 'racism', 'fuck', 'western', 
                           'since', 'shit', 'black', 'palestinians', 'evil', 'dumb', 'islamic', 'transgender', 
                           'conservative', 'arabs', 'freedom', 'obsessed', 'superior', 'pakistanis', 'destroy', 'blame', 
                           'put', 'whites', 'lie', 'russians', 'males', 'crimes', 'ignorant', 'god', 'ban', 'sexually', 
                           'congress', 'africans', 'hell', 'terrorism', 'violence', 'raped', 'innocent', 'atheist', 
                           'shooting', 'admit', 'lies', 'simply', 'rude', 'worse', 'dick', 'christianity', 'quo', 'lgbt', 
                           'republican', 'western', 'females', 'party', 'democrats', 'himself', 'suck', 'lack', 'immigration',
                           'calling', 'politicians', 'murder', 'now', 'turks', 'peace', 'deny', 'propaganda', 'israeli', 
                           'arab', 'violent', 'castrated', 'gandhi', 'refugees', 'attracted', 'homosexuality', 'trump', 
                           'generally', 'leaders', 'abuse', 'educated', 'proud', 'kashmir', 'mostly', 'syria', 'tend', 
                           'genocide', 'nazi', 'feminism', 'nations', 'flat', 'slaves', 'hatred', 'britain', 'middle', 'angry',
                           'liberals', 'germans', 'responsible', 'abortion', 'fair', 'caste', 'commit', 'minorities', 'slavery',
                           'people', 'jealous', 'democratic', 'dislike', 'incest', 'supporting', 'religions', 'ian', 'feminist',
                           'turkey', 'aware', 'fbi', 'kim', 'ashamed', 'deserve', 'holocaust', 'mueller', 'minority', 'refuse', 
                           'finally', 'victims', 'crazy', 'ndra', 'muhammad', 'gays', 'complain', 'soldiers', 'blind', 'nazis',
                           'lazy', 'democratic', 'democrat', 'proof', 'lying', 'ethnic', 'jew', 'corrupt', 'brahmins', 'ignore', 
                           'bill', 'anti', 'defend', 'cousin', 'babies', 'south', 'millions', 'wives', 'homosexual', 'allah',
                           'privilege', 'wearing', 'mentally', 'canadians', 'banned', 'arrogant', 'barack', 'turkish', 'attacks',
                           'races', 'beat', 'fucking', 'shootings', 'voters', 'committed', 'voted']

# count the number of top 20 insincere_words (unigram)
# the first 20 entries of top_200_insincere_words
top_20_insincere_words = top_200_insincere_words[:20]
def count_insincere_word(x):
    """Return how many lower-cased tokens of ``x`` appear in ``top_20_insincere_words``."""
    return sum(1 for token in x.lower().split() if token in top_20_insincere_words)

# count the number of top 100 insincere_words (unigram)
# the first 100 entries of top_200_insincere_words
top_100_insincere_words = top_200_insincere_words[:100]
def count_100_insincere_word(x):
    """Return how many lower-cased tokens of ``x`` appear in ``top_100_insincere_words``."""
    return sum(1 for token in x.lower().split() if token in top_100_insincere_words)

# count the number of top 200 insincere_words (unigram)
def count_200_insincere_word(x):
    """Return how many lower-cased tokens of ``x`` appear in ``top_200_insincere_words``."""
    return sum(1 for token in x.lower().split() if token in top_200_insincere_words)

# count the number of another 100 insincere_words (unigram)
another_100_insincere_words = ['castrated','muslims','democrats','liberals','castrate','indians','trump','americans','women',
                               'blacks','jews','feminists','atheists','castration','obama','homosexuals','hillary','rape',
                               'hindus','fuck','shit','idiots','muslim','girls','christians','gay','whites','holocaust',
                               'asians','stupid','tamils','ass','gays','jew','chinese','incest','leftists','black','crap',
                               'men','homosexuality','white','conservatives','idiot','brahmins','modi','republicans','tamilians',
                               'bullshit','moron','losers','terrorists','raping','fucking','moderators','shithole','dick',
                               'palestinians','pakistanis','bhakts','europeans','liberal','jewish','penis','homosexual',
                               'turks','nonsense','africans','tennessee','asshole','hypocrisy','mexicans','cousin','assholes',
                               'israelis','realize','clinton','canadians','bengalis','indian','israel','uneducated','brits',
                               'alabamians','transgender','morons','bitch','leftist','sister','aunty','democrat','supporters',
                               'females','cock','castrating','folks','pussy','terrorist','racist','genocide']
def count_another_100_insincere_word(x):
    """Return how many lower-cased tokens of ``x`` appear in ``another_100_insincere_words``."""
    return sum(1 for token in x.lower().split() if token in another_100_insincere_words)

In [None]:
# Names of the engineered feature columns and, position by position, the
# function that computes each of them from the raw question text.
new_cols = ['num_question_mark', 'num_token', 'num_wh_word', 'num_unique_word', 'num_character',
            'num_stopword', 'num_punc', 'num_avg_length', 'num_20_insincere_word', 'num_100_insincere_word', 
            'num_200_insincere_word', 'num_another_100_insincere_word']
new_cols_func = [count_question_mark, count_token, count_wh_word, count_unique_word, count_character,
                 count_stop_word, count_punc, count_avg_length, count_insincere_word, count_100_insincere_word,
                 count_200_insincere_word, count_another_100_insincere_word]

# new_cols = ['num_question_mark', 'num_token', 'num_wh_word', 'num_insincere_words']
# new_cols_func = [count_question_mark, count_token, count_wh_word, count_insincere_word]

# Every feature is z-scored. The mean and standard deviation are estimated on
# the training set only and reused for the test set, so that a given raw value
# maps to the same standardized value in both frames (standardizing the test
# set with its own statistics would shift and rescale it differently).
for new_col, col_func in zip(new_cols, new_cols_func):
    train_raw = train_df['question_text'].progress_apply(col_func)
    test_raw = test_df['question_text'].progress_apply(col_func)
    col_mean, col_std = train_raw.mean(), train_raw.std()
    if not col_std > 0:
        # constant (or undefined) column: avoid dividing by zero / NaN
        col_std = 1.0
    train_df[new_col] = (train_raw - col_mean) / col_std
    test_df[new_col] = (test_raw - col_mean) / col_std

In [None]:
# preview the first rows labelled target == 1
train_df.loc[train_df['target'] == 1].head(5)

In [None]:
# Share of target == 1 rows whose (standardized) top-200 count lies above the
# column minimum, i.e. above the smallest count seen in the training set.
min_num_200_insincere_word = train_df['num_200_insincere_word'].min()
is_insincere = train_df['target'] == 1
above_min = train_df['num_200_insincere_word'] > min_num_200_insincere_word
len(train_df[is_insincere & above_min]) / len(train_df[is_insincere])

In [None]:
# Share of target == 1 rows whose (standardized) top-100 count lies above the
# column minimum, i.e. above the smallest count seen in the training set.
min_num_100_insincere_word = train_df['num_100_insincere_word'].min()
is_insincere = train_df['target'] == 1
above_min = train_df['num_100_insincere_word'] > min_num_100_insincere_word
len(train_df[is_insincere & above_min]) / len(train_df[is_insincere])

In [None]:
# Share of target == 1 rows whose (standardized) top-20 count lies above the
# column minimum, i.e. above the smallest count seen in the training set.
min_num_20_insincere_word = train_df['num_20_insincere_word'].min()
is_insincere = train_df['target'] == 1
above_min = train_df['num_20_insincere_word'] > min_num_20_insincere_word
len(train_df[is_insincere & above_min]) / len(train_df[is_insincere])

**Applying Latent Dirichlet Allocation(LDA) models**

In [None]:
# punctuations = string.punctuation
# stopwords = list(STOP_WORDS)
# parser = English()
# def spacy_tokenizer(sentence):
#     mytokens = parser(sentence)
#     mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
#     mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
#     mytokens = " ".join([i for i in mytokens])
#     return mytokens
# train_df['topictext'] = train_df["question_text"].progress_apply(spacy_tokenizer)
# test_df['topictext'] = test_df["question_text"].progress_apply(spacy_tokenizer)

In [None]:
# %%time
# vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
# vectorized = vectorizer.fit_transform(train_df['topictext'])

# # Latent Dirichlet Allocation Model
# lda_model = LatentDirichletAllocation(n_components=20, max_iter=5, learning_method='online',verbose=True)
# lda_model.fit(vectorized)

**Preprocessing when using embeddings**
- Don't use standard preprocessing steps like stemming or stopword removal when you have pre-trained embeddings
- Get your vocabulary as close to the embeddings as possible

- https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/

In [None]:
# glove text preprocessing
# Contraction -> expansion table, built once at import time instead of on
# every call. Keys are matched against whole space-separated tokens.
_CONTRACTION_MAPPING = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have",
    "there's": "there is", "here's": "here is", "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
    "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}

# Apostrophe look-alikes that are normalised to a plain "'" before the lookup.
_APOSTROPHE_VARIANTS = ("’", "‘", "´", "`")


def clean_contractions(text):
    """Expand English contractions in ``text``.

    Apostrophe variants are first replaced by a plain ``'``. The text is then
    split on single spaces and every token that exactly matches a key of the
    contraction table is replaced by its expansion; all other tokens, and the
    spacing between them, are left as they are.

    Args:
        text: the string to clean.

    Returns:
        The string with known contractions expanded.
    """
    for variant in _APOSTROPHE_VARIANTS:
        text = text.replace(variant, "'")
    # dict.get falls back to the token itself when it is not a known contraction
    return ' '.join(_CONTRACTION_MAPPING.get(t, t) for t in text.split(" "))

# Characters that get surrounded by spaces. The sequence is kept exactly as
# it was (including characters that occur more than once, which are therefore
# padded more than once) so the produced strings do not change; the backslash
# before '×' is now written as an explicit '\\' instead of the invalid escape
# sequence it used to be.
_PUNCT = "/-'?!.,#$%'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Symbol -> replacement table applied before the padding step.
_CHAR_MAPPING = {
    "‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x",
    "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"',
    "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a',
    '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi',
}

# Other special sequences, handled last so their replacements are not padded.
_SPECIALS = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}


def clean_special_chars(text):
    """Normalise special characters and isolate punctuation in ``text``.

    Three passes are applied in order:

    1. every key of the symbol table is replaced by its value;
    2. every character of the punctuation sequence is replaced by itself
       surrounded by one space on each side;
    3. the remaining special sequences (zero-width space, ellipsis, BOM, two
       Devanagari words) are replaced.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    for symbol, replacement in _CHAR_MAPPING.items():
        text = text.replace(symbol, replacement)
    for p in _PUNCT:
        text = text.replace(p, f' {p} ')
    for sequence, replacement in _SPECIALS.items():
        text = text.replace(sequence, replacement)
    return text

def correct_spelling(x):
    """Replace known misspellings and British spellings in ``x``.

    Each key of the table is replaced wherever it occurs as a substring, in
    the order the table lists them, and the resulting string is returned.
    """
    replacements = {
        'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
        'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
        'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize',
        'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
        'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
        'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I',
        'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation',
        'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis',
        'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017',
        '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess',
        "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
        'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon',
    }
    for wrong, right in replacements.items():
        x = x.replace(wrong, right)
    return x

# glove text preprocessing
# Build the 'glove_text' column of both frames by running the raw question
# text through the same four steps, in order: lower-casing, contraction
# expansion, special-character cleaning and spelling correction.
glove_steps = (lambda x: x.lower(), clean_contractions, clean_special_chars, correct_spelling)
for frame in (train_df, test_df):
    cleaned = frame['question_text']
    for step in glove_steps:
        cleaned = cleaned.progress_apply(step)
    frame['glove_text'] = cleaned

In [None]:
## fill up the missing values
## cleaned texts, with missing values replaced by a placeholder token
train_texts = train_df['glove_text'].fillna("_na_").values
test_texts = test_df['glove_text'].fillna("_na_").values

## the vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_texts))

## texts -> integer sequences -> arrays padded to maxlen columns
train_X = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
test_X = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)

## get the target values
train_y = train_df['target'].values

In [None]:
## get the engineered features
# engineered feature columns of each frame as plain arrays, in new_cols order
train_X_ext, test_X_ext = (frame[new_cols].values for frame in (train_df, test_df))

In [None]:
# # get the topics
# train_topic = lda_model.transform(vectorizer.transform(train_df['topictext']))
# val_topic = lda_model.transform(vectorizer.transform(val_df['topictext']))
# test_topic = lda_model.transform(vectorizer.transform(test_df['topictext']))

In [None]:
# # concatenate the topic features with engineered features
# train_X_ext = np.hstack((train_X_ext, train_topic))
# val_X_ext = np.hstack((val_X_ext, val_topic))
# test_X_ext = np.hstack((test_X_ext, test_topic))

**Embedding**

In [None]:
def _read_embeddings(path, min_line_len=0, errors='strict'):
    """Parse a space-separated embedding text file into a ``{word: vector}`` dict.

    Lines whose length (newline included) is not greater than ``min_line_len``
    are skipped. The file is read as UTF-8 and is closed before returning.
    """
    embeddings_index = {}
    with open(path, encoding="utf8", errors=errors) as handle:
        for line in handle:
            if len(line) <= min_line_len:
                continue
            word, *coefs = line.rstrip("\n").split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    return embeddings_index


def _build_embedding_matrix(word_index, embeddings_index, max_words):
    """Build the embedding matrix for ``word_index`` from pre-trained vectors.

    Rows without a pre-trained vector are drawn from a normal distribution
    with the mean and standard deviation of all pre-trained coefficients.
    """
    # np.stack needs a real sequence, not a dict view
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # The caller passes a Keras tokenizer's word_index, whose indices are
    # expected to start at 1 (row 0 is left for padding); the matrix therefore
    # needs len(word_index) + 1 rows to hold the highest index.
    nb_words = min(max_words, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def load_glove(word_index,
               embedding_file='../input/embeddings/glove.840B.300d/glove.840B.300d.txt',
               max_words=None):
    """Return the GloVe embedding matrix for ``word_index``.

    Args:
        word_index: mapping from word to integer row index.
        embedding_file: path of the GloVe text file.
        max_words: cap on the number of rows; defaults to the module-level
            ``max_features``.

    Returns:
        A ``(min(max_words, len(word_index) + 1), vector_size)`` array.
    """
    limit = max_features if max_words is None else max_words
    embeddings_index = _read_embeddings(embedding_file)
    return _build_embedding_matrix(word_index, embeddings_index, limit)


def load_para(word_index,
              embedding_file='../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt',
              max_words=None):
    """Return the Paragram embedding matrix for ``word_index``.

    Undecodable bytes are ignored and lines of at most 100 characters are
    skipped while reading the file.

    Args:
        word_index: mapping from word to integer row index.
        embedding_file: path of the Paragram text file.
        max_words: cap on the number of rows; defaults to the module-level
            ``max_features``.

    Returns:
        A ``(min(max_words, len(word_index) + 1), vector_size)`` array.
    """
    limit = max_features if max_words is None else max_words
    embeddings_index = _read_embeddings(embedding_file, min_line_len=100, errors='ignore')
    return _build_embedding_matrix(word_index, embeddings_index, limit)

* Take average of embeddings (Unweighted DME) instead of blending predictions: https://arxiv.org/pdf/1804.07983.pdf
* This idea originates from the paper: Frustratingly Easy Meta-Embedding – Computing Meta-Embeddings by Averaging Source Word Embeddings

In [None]:
# element-wise average of the GloVe and Paragram matrices
glove_matrix = load_glove(tokenizer.word_index)
para_matrix = load_para(tokenizer.word_index)
embedding_matrix = np.mean([glove_matrix, para_matrix], axis = 0)
# the two source matrices are no longer needed once averaged
del glove_matrix, para_matrix

**Save the preprocessed dataset onto local disk to reduce redundant work**

In [None]:
# def save_obj(x, filename):
#     with open(filename, 'wb') as handle:
#         pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)
# def load_obj(filename):
#     with open(filename, 'rb') as handle:
#         return pickle.load(handle)

In [None]:
# save_obj(train_X, 'train_X.pkl')
# save_obj(train_X_ext, 'train_X_ext.pkl')
# save_obj(train_y, 'train_y.pkl')
# save_obj(val_X, 'val_X.pkl')
# save_obj(val_X_ext, 'val_X_ext.pkl')
# save_obj(val_y, 'val_y.pkl')
# save_obj(test_X, 'test_X.pkl')
# save_obj(test_X_ext, 'test_X_ext.pkl')
# save_obj(embedding_matrix, 'embedding_matrix.pkl')

In [None]:
# TEMP_PATH = '../input/baseline-tweaks/'

# embedding_matrix = load_obj(TEMP_PATH+'embedding_matrix.pkl')
# train_X = load_obj(TEMP_PATH+'train_X.pkl')
# train_X_ext = load_obj(TEMP_PATH+'train_X_ext.pkl')
# train_y = load_obj(TEMP_PATH+'train_y.pkl')
# val_X = load_obj(TEMP_PATH+'val_X.pkl')
# val_X_ext = load_obj(TEMP_PATH+'val_X_ext.pkl')
# val_y = load_obj(TEMP_PATH+'val_y.pkl')
# test_X = load_obj(TEMP_PATH+'test_X.pkl')
# test_X_ext = load_obj(TEMP_PATH+'test_X_ext.pkl')

**Attention Layer**

In [None]:
from keras.engine.topology import Layer
class Attention(Layer):
    """Attention pooling over the step axis of a 3-D input.

    ``call`` scores every step with a learned vector ``W`` (plus an optional
    per-step bias ``b``), applies ``tanh`` and ``exp`` to the scores,
    normalises them over axis 1 and returns the sum over axis 1 of the input
    weighted by those normalised scores.

    Args:
        step_dim: number of steps (size of axis 1) of the inputs.
        W_regularizer: regularizer for ``W``.
        b_regularizer: regularizer for ``b``.
        W_constraint: constraint for ``W``.
        b_constraint: constraint for ``b``.
        bias: whether to add the per-step bias.
        **kwargs: forwarded to the base ``Layer`` initializer.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first so that it cannot overwrite the
        # attributes set below (the base initializer may reset
        # supports_masking).
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per step)."""
        assert len(input_shape) == 3
        # shape is passed by keyword: the first positional parameter of
        # add_weight is the weight name in current Keras signatures
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim
        # one score per step: flatten the leading axes, project onto W, fold back
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            # zero out the weights of masked steps before normalising
            a *= K.cast(mask, K.floatx())
        # epsilon keeps the division finite when every weight is zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], features_dim)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

**Ensemble models, e.g. CNN textClassifier, LSTM**

In [None]:
# https://www.kaggle.com/yekenot/2dcnn-textclassifier
def model_cnn():
    """Build and compile the 2-D CNN text classifier.

    The embedded sequence is reshaped to a one-channel image, convolved with
    one full-width filter bank per filter size, max-pooled over the whole
    remaining height, concatenated, flattened and fed through dropout to a
    sigmoid unit. The model is compiled with binary cross-entropy and Adam.
    """
    filter_sizes = [1,2,3,5]
    num_filters = 36

    inp = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    pooled_branches = []
    for width in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(width, embed_size),
                      kernel_initializer='he_normal', activation='elu')(embedded)
        # pool over every position the filter could be applied at
        pooled = MaxPool2D(pool_size=(maxlen - width + 1, 1))(conv)
        pooled_branches.append(pooled)

    merged = Concatenate(axis=1)(pooled_branches)
    merged = Flatten()(merged)
    merged = Dropout(0.1)(merged)

    outp = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def model_lstm_pl():
    """Build and compile the two-layer bidirectional LSTM with pooling.

    A frozen embedding feeds two stacked bidirectional CuDNN LSTMs; the
    global average and global max of the second one are concatenated and
    passed through a dense layer, dropout and a sigmoid unit. The model is
    compiled with binary cross-entropy and Adam.
    """
    inp = Input(shape=(maxlen,))
    seq = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    seq = Bidirectional(CuDNNLSTM(128, return_sequences=True))(seq)
    seq = Bidirectional(CuDNNLSTM(64, return_sequences=True))(seq)

    avg_pl = GlobalAveragePooling1D()(seq)
    max_pl = GlobalMaxPooling1D()(seq)
    pooled = concatenate([avg_pl, max_pl])

    hidden = Dense(64, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    outp = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def model_lstm_hybrid_multi():
    """Build and compile the two-input, two-output hybrid model.

    The text branch (frozen embedding -> bidirectional LSTM -> bidirectional
    GRU -> average/max pooling -> dense -> dropout) produces an auxiliary
    sigmoid prediction on its own. Its features are also concatenated with the
    engineered features (``aux_input``) and passed through two dense layers to
    the main sigmoid prediction.

    The model is compiled with binary cross-entropy on both outputs, weighted
    1.0 for the main output and 0.2 for the auxiliary one.

    Returns:
        The compiled model, with inputs ``[main_input, aux_input]`` and
        outputs ``[main_output, auxiliary_output]``.
    """
    main_input = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(main_input)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x_ = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    # NOTE: two Attention layers used to be built here on x and x_, but their
    # outputs were never connected to anything, so they were not part of the
    # model; they are no longer created.
    avg_pl = GlobalAveragePooling1D()(x_)
    max_pl = GlobalMaxPooling1D()(x_)
    lstm_out = concatenate([avg_pl, max_pl])
    lstm_out = Dense(64, activation="relu")(lstm_out)
    lstm_out = Dropout(0.1)(lstm_out)
    auxiliary_output = Dense(1, activation="sigmoid", name='auxiliary_output')(lstm_out)

    auxiliary_input = Input(shape=(num_ext_features,), name='aux_input')
    x = concatenate([lstm_out, auxiliary_input])
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    main_output = Dense(1, activation='sigmoid', name='main_output')(x)

    model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], loss_weights=[1., 0.2])
    return model

In [None]:
def model_lstm_hybrid_single():
    """Build and compile the two-input, single-output hybrid model.

    Text branch: frozen embedding -> spatial dropout -> bidirectional LSTM ->
    bidirectional GRU. Attention over both recurrent outputs plus the average
    and max pooling of the GRU output are concatenated and reduced by a dense
    layer. That vector is joined with the engineered features (``aux_input``)
    and goes through batch normalisation, two dense layers and another batch
    normalisation to the sigmoid ``main_output``. The model is compiled with
    binary cross-entropy and Adam.
    """
    main_input = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(main_input)
    embedded = SpatialDropout1D(0.1)(embedded)
    lstm_seq = Bidirectional(CuDNNLSTM(40, return_sequences=True))(embedded)
    gru_seq = Bidirectional(CuDNNGRU(40, return_sequences=True))(lstm_seq)

    atten_1 = Attention(maxlen)(lstm_seq)
    atten_2 = Attention(maxlen)(gru_seq)
    avg_pl = GlobalAveragePooling1D()(gru_seq)
    max_pl = GlobalMaxPooling1D()(gru_seq)
    text_features = concatenate([atten_1, atten_2, avg_pl, max_pl])
    # no dropout is applied after this dense layer
    text_features = Dense(20, activation="relu")(text_features)

    auxiliary_input = Input(shape=(num_ext_features,), name='aux_input')
    joint = concatenate([text_features, auxiliary_input])
    joint = BatchNormalization()(joint)
    joint = Dense(40, activation='relu')(joint)
    joint = Dense(40, activation='relu')(joint)
    joint = BatchNormalization()(joint)
    main_output = Dense(1, activation='sigmoid', name='main_output')(joint)

    model = Model(inputs=[main_input, auxiliary_input], outputs=main_output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# embedding_matrix = np.zeros((max_features, 300))
# model_lstm_hybrid_single().summary()

In [None]:
def train_pred(model, epochs=2):
    """Train a single-input model epoch by epoch and predict on the test set.

    After every epoch the validation predictions are thresholded at
    0.10, 0.11, ..., 0.50 and the best F1 score is printed.

    Returns:
        ``(pred_val_y, pred_test_y, best_score)`` where the validation
        predictions and the score come from the last epoch.
    """
    cutoffs = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
    for _ in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

        best_score = 0.0
        for cutoff in cutoffs:
            f1 = metrics.f1_score(val_y, (pred_val_y > cutoff).astype(int))
            if f1 > best_score:
                best_score = f1

        print("Val F1 Score: {:.4f}".format(best_score))

    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y, best_score

In [None]:
def train_pred_hybrid_multi(model, epochs=2, callbacks=None):
    """Train the two-output hybrid model epoch by epoch and predict on the test set.

    After every epoch both outputs are evaluated on the validation set: the
    predictions are thresholded at 0.10, 0.11, ..., 1.00 and the best F1 score
    and its threshold are printed for the auxiliary and the main output.

    The output with the better validation F1 after the last epoch (the main
    output on ties) is used for the test predictions, and the returned
    validation predictions and score belong to that same output, so the three
    returned values always describe one head.

    Args:
        model: compiled model with outputs ``[main_output, auxiliary_output]``.
        epochs: number of single-epoch ``fit`` calls.
        callbacks: forwarded to every ``fit`` call.

    Returns:
        ``(pred_val_y, pred_test_y, best_score)`` of the selected output.
    """
    def _best_f1(pred):
        # scan the thresholds and keep the first one reaching the best F1
        top_thresh, top_score = 0.5, 0.0
        for thresh in np.arange(0.1, 1.001, 0.01):
            thresh = np.round(thresh, 2)
            score = metrics.f1_score(val_y, (pred > thresh).astype(int))
            if score > top_score:
                top_thresh, top_score = thresh, score
        return top_thresh, top_score

    for _ in range(epochs):
        model.fit([train_X, train_X_ext],
                  [train_y, train_y],
                  batch_size=512,
                  epochs=1,
                  validation_data=([val_X, val_X_ext], [val_y, val_y]),
                  callbacks=callbacks)

        # one forward pass yields both heads: index 0 is main, index 1 is auxiliary
        pred_val_y, pred_val_y_aux = model.predict([val_X, val_X_ext], batch_size=1024, verbose=0)

        best_thresh, best_score_aux = _best_f1(pred_val_y_aux)
        print("Aux output, Val F1 Score: {:.4f}".format(best_score_aux), f'Threshold: {best_thresh}')

        best_thresh, best_score = _best_f1(pred_val_y)
        print("Mainoutput, Val F1 Score: {:.4f}".format(best_score), f'Threshold: {best_thresh}')

    pred_test_main, pred_test_aux = model.predict([test_X, test_X_ext], batch_size=1024, verbose=0)
    if best_score_aux > best_score:
        return pred_val_y_aux, pred_test_aux, best_score_aux
    return pred_val_y, pred_test_main, best_score

In [None]:
def train_pred_hybrid_single(model, epochs=2, callbacks=None):
    """Train a two-input, single-output model and predict on the test set.

    The data arrays (``train_X``, ``train_X_ext``, ``train_y``, ``val_X``,
    ``val_X_ext``, ``val_y``, ``test_X``, ``test_X_ext``) are read from module
    globals. The model is fitted one epoch at a time; after every epoch the
    best validation F1 over cut-offs 0.10..1.00 (step 0.01) is printed.

    Args:
        model: Model exposing ``fit`` and ``predict`` that takes two inputs.
        epochs: Number of single-epoch fit rounds; must be at least 1.
        callbacks: Optional callbacks forwarded to every ``fit`` call
            (mirrors ``train_pred_hybrid_multi``).

    Returns:
        Tuple ``(pred_val_y, pred_test_y, best_score)`` where the validation
        predictions and the best F1 come from the final epoch.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    for _ in range(epochs):
        model.fit([train_X, train_X_ext],
                  train_y,
                  batch_size=512,
                  epochs=1,
                  # Keras documents validation_data as a tuple (x_val, y_val).
                  validation_data=([val_X, val_X_ext], val_y),
                  callbacks=callbacks)
        pred_val_y = model.predict([val_X, val_X_ext], batch_size=1024, verbose=0)

        # Scan cut-offs; the first one reaching the maximum F1 is kept.
        best_thresh, best_score = 0.5, 0.0
        for thresh in np.round(np.arange(0.1, 1.001, 0.01), 2):
            score = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
            if score > best_score:
                best_thresh, best_score = thresh, score

        print("Mainoutput, Val F1 Score: {:.4f}".format(best_score), f'Threshold: {best_thresh}')

    pred_test_y = model.predict([test_X, test_X_ext], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y, best_score

In [None]:
def kfold(model, train_X, train_X_ext, train_y, val_X, val_X_ext, val_y, epochs=2, callbacks=None):
    """Fit ``model`` on one cross-validation fold and return it.

    Args:
        model: Model exposing ``fit`` that takes two inputs.
        train_X, train_X_ext: The two training inputs, passed together as a list.
        train_y: Training targets.
        val_X, val_X_ext: The two validation inputs, passed together as a list.
        val_y: Validation targets.
        epochs: Number of epochs handed to ``fit``.
        callbacks: Optional callbacks handed to ``fit``.

    Returns:
        The same ``model`` object after fitting.
    """
    model.fit(
        [train_X, train_X_ext],
        train_y,
        batch_size=512,
        epochs=epochs,
        # Keras documents validation_data as a tuple (x_val, y_val); a list
        # is not unpacked the same way by every Keras version.
        validation_data=([val_X, val_X_ext], val_y),
        callbacks=callbacks,
    )
    return model

In [None]:
# outputs = []
# pred_val_y, pred_test_y, best_score = train_pred(model_cnn(word2vec_res), word2vec_res, epochs = 2)
# outputs.append([pred_val_y, pred_test_y, best_score, '2d CNN'])

In [None]:
# outputs = []
# pred_val_y, pred_test_y, best_score = train_pred(model_lstm_atten(glove_res), glove_res, epochs = 4)
# outputs.append([pred_val_y, pred_test_y, best_score, '2 LSTM w/ attention'])

In [None]:
# outputs = []
# pred_val_y, pred_test_y, best_score = train_pred(model_lstm_pl(embedding_matrix), glove_res, epochs = 4)
# outputs.append([pred_val_y, pred_test_y, best_score, '2 LSTM w/ pooling'])

In [None]:
# outputs = []
# pred_val_y, pred_test_y, best_score = train_pred_hybrid_single(model_lstm_hybrid_single(embedding_matrix), epochs = 5)
# outputs.append([pred_val_y, pred_test_y, best_score, '2 LSTM multi input single output'])

In [None]:
DATA_SPLIT_SEED = 2018
N_FOLD = 5
N_EPOCH = 5

# train_meta collects out-of-fold predictions (each row is predicted by the
# model that did not train on it); test_meta is the mean test prediction
# over the N_FOLD models.
train_meta = np.zeros(train_y.shape)
test_meta = np.zeros(test_X.shape[0])
splits = list(StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=DATA_SPLIT_SEED).split(train_X, train_y))
filepath = "weights_best.h5"
for idx, (train_idx, valid_idx) in enumerate(splits):
    # The checkpoint path is shared by all folds: drop any file left by the
    # previous fold so only this fold's best epoch can be reloaded below.
    if os.path.exists(filepath):
        os.remove(filepath)
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
    # NOTE(review): earlystopping is built but not added to `callbacks`, so it
    # has no effect on training - confirm whether that is intended.
    earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose=2, mode='auto')
    callbacks = [checkpoint, reduce_lr]
    # callbacks = [clr,]
    X_train = train_X[train_idx]
    X_train_ext = train_X_ext[train_idx]
    y_train = train_y[train_idx]
    X_val = train_X[valid_idx]
    X_val_ext = train_X_ext[valid_idx]
    y_val = train_y[valid_idx]
    # A fresh model per fold so no weights leak between folds.
    model = model_lstm_hybrid_single()
    model = kfold(model, X_train, X_train_ext, y_train, X_val, X_val_ext, y_val, epochs = N_EPOCH, callbacks = callbacks)
    # Predict with the best-val_loss epoch saved by ModelCheckpoint rather
    # than with whatever weights the final epoch left behind.
    model.load_weights(filepath)
    train_meta[valid_idx] = model.predict([X_val, X_val_ext], batch_size=1024, verbose=0).reshape(-1)
    test_meta += model.predict([test_X, test_X_ext], batch_size=1024, verbose=0).reshape(-1) / N_FOLD

In [None]:
# https://www.kaggle.com/ryanzhang/tfidf-naivebayes-logreg-baseline
def threshold_search(y_true, y_proba):
    """Find the probability cut-off that maximises F1.

    Candidate cut-offs are ``i * 0.01`` for ``i`` in 0..99. A candidate only
    replaces the current best when its F1 is strictly higher, so the lowest
    cut-off wins ties.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores; must support ``y_proba > float``
            element-wise (e.g. a NumPy array).

    Returns:
        Dict with keys ``'threshold'`` and ``'f1'``; both stay ``0`` if no
        candidate scores above zero.
    """
    top_cut = 0
    top_f1 = 0
    for step in range(100):
        cut = step * 0.01
        f1 = metrics.f1_score(y_true=y_true, y_pred=y_proba > cut)
        if f1 > top_f1:
            top_cut, top_f1 = cut, f1
    return {'threshold': top_cut, 'f1': top_f1}

# Choose the F1-maximising cut-off on the training-set predictions in
# train_meta, then apply it to the fold-averaged test predictions.
search_result = threshold_search(train_y, train_meta)
print(search_result)

sub = pd.read_csv('../input/sample_submission.csv')
# Item assignment always writes a column; attribute assignment would silently
# create a plain attribute if the CSV had no 'prediction' column.
sub['prediction'] = (test_meta > search_result['threshold']).astype(int)
sub.to_csv("submission.csv", index=False)