# Quora Insincere Questions Classification
> Detect toxic content to improve online conversations

In [1]:
import os
import sys
import gc
import glob
import time
import re
import random
import threading
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
from tqdm import tqdm
tqdm.pandas()

import tensorflow as tf
import keras
from keras import Model
from keras.layers import *
from keras.layers.merge import _Merge
from keras.models import *
from keras.initializers import *
from keras.optimizers import *
from keras.callbacks import *
from keras.regularizers import *
from keras import backend as K
from keras.legacy import interfaces
from keras.engine.topology import Layer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.generic_utils import serialize_keras_object
from keras.utils.generic_utils import deserialize_keras_object
from keras.utils import multi_gpu_model

from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
SEED = 2018
# python
os.environ['PYTHONHASHSEED'] = str(SEED)
# random
np.random.seed(SEED)
tf.set_random_seed(SEED)
# tf
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
session_conf.gpu_options.allow_growth = True
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
# data
max_features = 95000
maxlen = 66
cv = True

In [3]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("Train shape: ", train.shape)
print("Test shape: ", test.shape)
sub = test[['qid']]

Train shape:  (1306122, 3)
Test shape:  (56370, 2)


## 整个文本中非字母和数字的字符

In [4]:
punct1 = [char for line in train.question_text for char in line if not char.isalnum()]
punct2 = [char for line in test.question_text for char in line if not char.isalnum()]

puncs = set(punct1 + punct2)
# 去掉空格字符
puncs = set(puncs) - set(' ')
# 去掉了string的停用词
unpunc = puncs - set(punctuation)

## 缩写词、无法识别字符、英美拼写不同的词

In [5]:
contraction = { "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                "haven't": "have not", "haven ' t""he'd": "he would","he'll": "he will", "he's": "he is",
                "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
                "i'm": "i am", "i've": "i have", "i'd": "i would", "i'd've": "i would have",
                "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have",
                "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
                "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
                "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
                "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
                "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
                "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is",
                "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will",
                "you'll've": "you will have", "you're": "you are", "you've": "you have" }

punc = {"ँ": "", "◦": "", "̆": "", "✏": "", "": "", "ี": "", "♡": "o", "△": "", "⇒": "", "": "", "＄": " dollar ",
        "→": "", "͚": "", "️": "", "⟩": "", "¡": "i", "್": "", "‬": "", "̘": "", "ា": "", "¿": "?", "⧼": "",
        "": "", "®": " r ", "ौ": "", "∼": "", "َ": "", "ూ": "", "”": "'", "̙": "", "⋅": "", "̷": "", "̓": "", "、": "",
        "⬇": "", "̔": "", "∗": "*", "͕": "", "͡": "", "̿": "", "‌": "", "͜": "", "̦": "", "": "", "♨": "", "̮": "",
        "ௌ": "", "»": " ", "➡": "", "̼": "", "̌": "", "̢": "", "？": "?", "": "", "ৃ": "", "ం": "", "⊥": "",
        "̧": "", "ਾ": "", "》": " ", "ਂ": "", "ិ": "", "∨": "", "ী": "", "े": "", "⧽": "", "⁡": "", "ु": "",
        "ٌ": "", "₦": " naira ", "̸": "", "़": "", "̃": "", "": "", "͎": "", "∧": "", "，": "", "÷": "/", "،": "",
        "↓": "", "✔": "", "⁠": "", "¶": "", "ೋ": "", "͖": "", "ে": "", "☝": "", "«": " ", "": "", "ं": "",
        "《": " ", "ॉ": "", "）": "", "͉": "", "⟨": "", "": "", "ْ": "", "‏": "", "₱": " peso ", "°": "",
        "͋": "", "✌": "", "্": "", "᠌": "", "♣": "", "×": "x", "ো": "", "؟": "?", "˜": "", "̩": "", "̱": "",
        "̺": "", "͔": "", "▾": "", "⎛": "", "ొ": "", "்": "", "̊": "", "̥": "", "ੁ": "", "่": "", "﻿": "", "˚": "",
        "ా": "", "ા": "", "™": " tm ", "ِ": "", "∈": "", "⃗": "", "≅": "=", "̵": "", "♭": "", "ಾ": "", "；": ".",
        "̒": "", "ி": "", "´": "'", "＞": ">", "̣": "", "ุ": "", "ّ": "", "▒": "", "।": "", "–": "-", "∖": "",
        "̰": "", "ॄ": "", "‘": "'", "̶": "-", "ो": "", "！": "!", "☺": "", "̎": "", "″": "", "＝": "=", "˂": "",
        "਼": "", "ः": "", "ֿ": "", "♏": "", "¦": "", "̝": "", "̈": "", "́": "", "‐": "-", "“": "'", "ാ": "",
        "≤": "<=", "ੀ": "", "": "", "\n": "", "◌": "", "ृ": "", "ு": "", "ा": "", "¥": " yen ", "‑": "-",
        "￼": "", "": "", "्": "", "̭": "", "": "", "¬": "", "͌": "", "̍": "", "„": "", "ី": "", "•": "", "↑": "",
        "͘": "", "": "", "͇": "", "̫": "", "ா": "", "͛": "", "︠": "", "⁻": "-", "᾽": "", "ি": "", "̟": "", "│": "|",
        "̕": "", "͊": "", "̑": "", "‎": "", "☁": "", "ಿ": "", "ी": "", "̀": "", "়": "", "̐": "", "☉": "", "": "",
        "⚧": "", "£": " pound ", "・": "", "⋯": "...", "−": "-", "∅": " ", "¸": ",", "̋": "", "̲": "", "⎝": "",
        "͆": "", "〗": "]", "／": "", "ั": "", "：": "", "ோ": "", "̽": "", "©": " c ", "": "", "്": "", "ು": "",
        "ు": "", "్": "", "ि": "", "⊨": "", "̈́": "", "̚": "", "̖": "", "̡": "", "·": ".", "✅": "", "ͅ": "",
        "ੰ": "", "̾": "", "…": "", "＾": "^", "≈": "=", "—": "-", "♀": "", "❤": "", "્": "", "ା": "", "¢": "",
        "⎞": "", "ె": "", "​": "", "̻": "", "（": "", "‪": "", "≠": "!=", "ॢ": "", "ં": "", "〖": "[", "­": "", "∂": "",
        "̬": "", "͐": "", "": "", "₊": "+", "℅": "%", "̛": "", "‰": "", "ਿ": "", "͈": "", "́": "", "͂": "", "̞": "",
        "ి": "", "้": "", "̗": "", "ു": "", "": "", "’": "'", "া": "", "ើ": "", "": "", "ះ": "", "」": "]", "︡": "",
        "ू": "", "̳": "", "ை": "", "⊂": "", "∇": "", "≥": ">=", "̄": "", "₹": " e ", "̜": "", "̴": "", "℃": "",
        "±": "+", "⌚": " time ", "≡": "", "̹": "", "̯": "", "′": "", "ీ": "", "ូ": "", "－": " ", "「": "[", "̀": "",
        "¨": "'", "ॣ": "", "⦁": "", "€": " euro ", "❓": "?", "ู": "", "͗": "", "̅": "", "̂": "", "͠": "", "̤": "",
        "្": "", "̉": "", "₩": "", "": "", "̪": "", "ै": "", "∘": "", "ៃ": "", "͑": "", "ំ": "", "͒": "", "☹": "",
        "͝": "", "‛": "'", "⎠": "", "¯": "", "。": ".", "∆": "", "ി": "", "̓": "", "∝": "", "†": "", "≱": "", "²": "2",
        "`": "'", 'à': 'a', '³': '3', 'π': 'pi', "₁": "1", "₃": "3", "₆": "6", "¼": "1/4", "⁷": "7", "¾": "3/4",
        "⁵": "5", "₅": "5", "½": "1/2", "₄": "4", "⅔": "2/3", "₂": "2", "¹": "1"}

mispell = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
           'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
           'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu': 'youtube ',
           'qoura': 'quora', 'quorans': 'quora users', 'quoran': 'quora user', 'sallary': 'salary', 'whta': 'what',
           'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much',
           'howmany': 'how many', 'whydo': 'why do', 'doi': 'do i', 'thebest': 'the best', 'howdoes': 'how does',
           'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
           'pennis': 'penis', 'etherium': 'ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data',
           '2k15': '2015', '2k16': '2016', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend',
           'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
           'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon',
           'nanodegree': 'nano degree', 'brexit': 'british exit', 'cryptocurrencies': 'crypto currencies',
           'coinbase': 'coin base', 'oneplus': 'one plus', 'redmi': 'red mi', 'GDPR': 'general data protection regulation',
           'DCEU': 'dc extended universe', 'litecoin': 'lite coin', 'unacademy': 'non academy', 'altcoin': 'bitcoin alternative',
           'altcoins': 'bitcoin alternative', 'sjw': 'social justice warriors', 'sjws': 'social justice warriors',
           'fiancé': 'fiance', 'microservices': 'micro services', 'bitconnect': 'bit connect', 'codeforces': 'code forces',
           'wannacry': 'wanna cry', 'onedrive': 'one drive', 'airpods': 'air pods', 'twinflame': 'twin flame',
           'undergraduation': 'under graduation', 'cos2x': 'cos 2 x', 'yourquote': 'your quote', 'xiomi': 'xiaomi',
           'undertale': 'under tale', 'genderfluid': 'gender fluid', 'são': 'sao', 'chapterwise': 'chapter wise',
           'deepmind': 'deep mind', '': '', 'arrowverse': 'arrow verse', 'overbrace': ' ', 'tensorflow': 'tensor flow',
           'hackerrank': 'hacker rank', 'microservice': 'micro service', 'reactjs': 'react js', 'hackerearth': 'hacker earth',
           'fiancée': 'fiance', 'blockchains': 'block chains', 'beyoncé': 'beyonce', 'neuralink': 'neura link',
           'openai': 'open ai', 'zoomcar': 'zoom car', 'hyperconjugation': 'hyper conjugation', 'autoencoder': 'auto encoder',
           'webassembly': 'web assembly', 'quoras': 'quora', 'digilocker': 'digi locker', 'oversmart': 'over smart',
           'cryptocoins': 'crypto coins', 'crytocurrencies': 'cryto currencies', 'cyrptocurrency': 'cyrpto currency',
           'café': 'cafe', 'whatapp': 'whatsapp', 'gaslighter': 'gas lighter', 'darkweb': 'dark web', 'webnovel': 'web novel'}

## 数据预处理：
* 添加特征：单词数、字母数、大写字母数、标点符号数、平均单词长度等特征
* 字符串全部变小写
* ‘’‘’这四个标点符号替换为'
* 对缩写和英美差别的词进行替换
* 将标点符号替换为 空格+标点符号+空格（以免有些词和标点符号无法划分开）
* 将数字替换为##...

In [6]:
def replace_quote(text):
    # 替换标点符号
    quote = ['´', '‘', '’', "`"]
    for s in quote:
        text = text.replace(s, "'")
    return text
                      
def re_mapping(mapping):
    res = re.compile('(%s)' % '|'.join(mapping.keys()))
    return res

# 两个字典取并集
mapping = dict(set(contraction.items()) | set(mispell.items()))
re_map = re_mapping(mapping)
re_punc = re_mapping(punc)

def replace_mapping(text):
    def replace(match):
        return mapping[match.group(0)]
    return re_map.sub(replace, text)

def replace_punc(text):
    def replace(match):
        return punc[match.group(0)]
    return re_punc.sub(replace, text)

def sep_punc(x):
    # 将“标点符号”用“ 标点符号 ”来代替
    for p in puncs:
        x = x.replace(p, f' {p} ')
    return x

def replace_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def add_features(df):
    df['question_text'] = df['question_text'].progress_apply(lambda x: str(x))
    # 字符串的长度，字母的数量
    df['num_chars'] = df['question_text'].progress_apply(len)
    # 单词的数量
    df['num_words'] = df.question_text.str.count('\S+')
    # 字符串中大写字母的数量
    df['num_capital'] = df['question_text'].progress_apply(lambda x: sum(1 for c in x if c.isupper()))
    # 大写字母数占总字母数的比率
    df['capital_rate'] = df['num_capital'] / df['num_chars']
    
    # 不重复单词的种数
    df['num_uniquewords'] = df['question_text'].progress_apply(lambda x: len(set(x.split())))
    df['unique_rate'] = df['num_uniquewords'] / df['num_words']
    
    # istitle()字符串中所有单词首字母大写则为真，也就是统计首字母大写的单次数
    df["num_titlewords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.istitle()]))
    # 词频
    df['title_rate'] = df['num_titlewords'] / df['num_words']
    
    # 字符串中所有字母大写则为真
    df["num_upperwords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.isupper()]))
    df['upper_rate'] = df['num_upperwords'] / df['num_words']
    
    # 统计“！”的数目
    df["num_exc"] = df["question_text"].progress_apply(lambda x: x.count("!")).astype('uint16')
    # 统计“？”的数目
    df["num_q"] = df['question_text'].progress_apply(lambda x: x.count("?")).astype('uint16')
    # 单词长度的平均值
    df["mean_word_len"] = df["question_text"].progress_apply(lambda x: np.mean([len(w) for w in x.split()]))
    # 单词长度的最大值
    df["max_word_len"] = df['question_text'].progress_apply(lambda x: max([len(w) for w in x.split()]))
    # 特殊字符的数目
    df["num_unpunc"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in unpunc)).astype('uint16')
    df["num_punc"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in punctuation)).astype('uint16')
    # 错拼词的数目
    df["num_mispell"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in mispell)).astype('uint16')

    return df

In [7]:
# 0.699 f10
# feature_cols = ['capital_rate', 'num_chars', 'num_words', "max_word_len", "mean_word_len",
#                 'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc"]

# 0.697 f13
# feature_cols = ['capital_rate', 'unique_rate', 'num_chars', 'num_words', "max_word_len", "mean_word_len",
#                 'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc", "num_mispell"]

# 0.697 f12
# feature_cols = ['capital_rate', 'num_chars', 'num_words', "max_word_len", "mean_word_len",
#                 'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc", "num_mispell"]

# f11
feature_cols = ['capital_rate', 'num_chars', 'num_words', "max_word_len", "mean_word_len", 'num_capital',
                "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc"]

In [8]:
def lower():
# Lower
    train["question_text"] = train["question_text"].str.lower()
    test["question_text"] = test["question_text"].str.lower()
    print("Lower done")

def Replace_quote():
# Replace quote
    train['question_text'] = train['question_text'].apply(lambda x: replace_quote(x))
    test['question_text'] = test['question_text'].apply(lambda x: replace_quote(x))
    print("Replace quote done")

def Replace_mapping():
# Replace mapping(contraction & mispell)
    train['question_text'] = train['question_text'].apply(lambda x: replace_mapping(x))
    test['question_text'] = test['question_text'].apply(lambda x: replace_mapping(x))
    print("Replace mapping done")

def Sep_punc():
# # Sep punc
    train['question_text'] = train['question_text'].apply(lambda x: sep_punc(x))
    test['question_text'] = test['question_text'].apply(lambda x: sep_punc(x))
    print("Sep punc done")

In [9]:
threads = []

t2 = threading.Thread(target=Replace_quote)
t3 = threading.Thread(target=Replace_mapping)
t4 = threading.Thread(target=Sep_punc)
threads = [t2, t3, t4]
for t in threads:
    t.setDaemon(True)
    t.start()

Replace quote done


In [None]:
# Add features
train = add_features(train)
test = add_features(test)
features = train[feature_cols].fillna(0)
test_features = test[feature_cols].fillna(0)
# 对添加的特征进行归一化
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)
print("Add features done")

def lower():
# Lower
    train["question_text"] = train["question_text"].str.lower()
    test["question_text"] = test["question_text"].str.lower()
    print("Lower done")

def Replace_quote():
# Replace quote
    train['question_text'] = train['question_text'].progress_apply(lambda x: replace_quote(x))
    test['question_text'] = test['question_text'].progress_apply(lambda x: replace_quote(x))
    print("Replace quote done")

def Replace_mapping():
# Replace mapping(contraction & mispell)
    train['question_text'] = train['question_text'].progress_apply(lambda x: replace_mapping(x))
    test['question_text'] = test['question_text'].progress_apply(lambda x: replace_mapping(x))
    print("Replace mapping done")

def Sep_punc():
# # Sep punc
    train['question_text'] = train['question_text'].progress_apply(lambda x: sep_punc(x))
    test['question_text'] = test['question_text'].progress_apply(lambda x: sep_punc(x))
    print("Sep punc done")

In [11]:
train_X = train['question_text']
test_X = test['question_text']
# filters注意设置，因为标点符号被保留
tokenizer = Tokenizer(num_words=max_features, filters='')
tokenizer.fit_on_texts(train_X.tolist())
train_X = tokenizer.texts_to_sequences(train_X)
train_X = pad_sequences(train_X, maxlen=maxlen)

test_X = tokenizer.texts_to_sequences(test_X)
test_X = pad_sequences(test_X, maxlen=maxlen)
train_y = train['target'].values
# 通过word_index与Embedding实现一一对应
word_index = tokenizer.word_index

Replace mapping done
Sep punc done


In [12]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_glove():
    
    vec_path = "../input/embeddings/glove.840B.300d/glove.840B.300d.txt"
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(vec_path, encoding='latin'))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    emb_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def load_para():
    vec_path =  "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt"
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(vec_path, encoding='latin'))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    emb_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [13]:
start_time = time.time()
t1 = load_glove()
t2 = load_para()
threads = [t1, t2]

for t in threads:
    t.setDaemon(True)
    t.start()

total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))

FileNotFoundError: [Errno 2] No such file or directory: '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

In [None]:
emb_glove = load_glove()
emb_para =load_para()

In [None]:
emb = np.mean([emb_glove, emb_para], axis=0)
print(np.shape(emb))

In [None]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim
    
# A Capsule Implement with Pure Keras
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [None]:
def f1_smart(y_true, y_pred):
    # y_pred的排序索引
    args = np.argsort(y_pred)
    # 为1的数量
    tp = y_true.sum()
    
    fs = (tp - np.cumsum(y_true[args[:-1]])) / np.arange(y_true.shape[0] + tp - 1, tp, -1)
    res_idx = np.argmax(fs)
    return 2 * fs[res_idx], (y_pred[args[res_idx]] + y_pred[args[res_idx + 1]]) / 2

def threshold_search(y_true, y_pred):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(25,45)]:
        score = f1_score(y_true=y_true, y_pred=(y_pred > threshold).astype(int))
        if score > best_score:
            best_threshold = threshold
            best_score = score
    return best_score, best_threshold

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

class CyclicLR(Callback):
    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [None]:
class LstmAtn():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.5)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        atn_1 = Attention(maxlen)(x)
        atn_2 = Attention(maxlen)(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)
        x = concatenate([atn_1, atn_2, avg_pool, max_pool])
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1])
        return model

class LstmFAtnCap():
    def model(self, embedding_matrix, maxlen, max_features):
        inp_seq = Input(shape=(maxlen,), name='seq')
        inp_feature = Input(shape=(len(feature_cols),), name='feature')
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp_seq)
        
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        # last = Lambda(lambda t: t[:, -1])(y)
        atn_1 = Attention(maxlen)(y)
        atn_2 = Attention(maxlen)(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)
        
        capsule = Capsule(num_capsule=10, dim_capsule=16, routings=5,
                          share_weights=True)(x)
        capsule = Flatten()(capsule)
        capsule = Dropout(0.25)(capsule)
        
        x = concatenate([atn_1, atn_2, avg_pool, max_pool, inp_feature, capsule])
        x = Dense(16, activation='relu', kernel_initializer=glorot_normal(seed=SEED))(x)
        x = Dropout(0.1)(x)

        output = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=[inp_seq, inp_feature], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy',f1])
        return model

    
class LstmFAtn():
    def model(self, embedding_matrix, maxlen, max_features):
        inp_seq = Input(shape=(maxlen,), name='seq')
        inp_feature = Input(shape=(len(feature_cols),), name='feature')
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp_seq)
        
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        # last = Lambda(lambda t: t[:, -1])(y)
        atn_1 = Attention(maxlen)(y)
        atn_2 = Attention(maxlen)(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)

        x = concatenate([atn_1, atn_2, avg_pool, max_pool, inp_feature])
        x = Dense(32, activation='relu', kernel_initializer=glorot_normal(seed=SEED))(x)
        x = Dropout(0.1)(x)

        output = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=[inp_seq, inp_feature], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy',f1])
        return model

In [None]:
from sklearn import metrics
def train_single():
    X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.2,random_state=2018, stratify=train_y)
    model = LstmAtn().model(embed_glove, maxlen, max_features)
    history = model.fit(X_train, y_train, batch_size=256, epochs=5, validation_data=(X_val, y_val),)
    pred_glove_val_y = model.predict([X_val], batch_size=1024, verbose=1)
    best_threshold = 0
    best_score = 0

    for threshold in [i * 0.01 for i in range(25,45)]:
        score = metrics.f1_score(y_val, (pred_glove_val_y>threshold).astype(int))
        if score > best_score:
            best_threshold = threshold
            best_score = score
    print('best score:%f,best threshold:%f'%(best_score, best_threshold))
    pred_test_y = model.predict(test_X, batch_size=1024, verbose=1)
    pred_test_y = (pred_test_y > best_threshold).astype(int)
    gc.collect()
    K.clear_session()
    tf.reset_default_graph()
    return pred_test_y

def train_cv():
    kfolds, epochs = 5, 5
    kf = StratifiedKFold(n_splits=kfolds, random_state=26, shuffle=True).split(train_X, train_y)
    loss = []
    thresh = []
    # 词向量与特征向量融合，注意区分
    train_meta = np.zeros((train_X.shape[0],1))
    test_meta = np.zeros((test_X.shape[0],1))
    x_test = [test_X, test_features]
    for i, (train_idx, valid_idx) in enumerate(kf):

        X_train, X_val, Y_train, Y_val = train_X[train_idx], train_X[valid_idx], train_y[train_idx], train_y[valid_idx]
        features_train = features[train_idx]
        features_val= features[valid_idx]
        
        x_train = [X_train, features_train]
        x_val = [X_val, features_val]
        # 模型定义
        model = LstmFAtn().model(emb, maxlen, max_features)
        if i == 0: print(model.summary())
        print('====================================Fold:%d========================================'%(i+1))
        # early stopping callbacks
        early_stop = EarlyStopping(monitor="val_f1", mode="max", patience=5, verbose=True,restore_best_weights=True)
#         clr = CyclicLR(base_lr=0.001, max_lr=0.003, step_size=300, mode='exp_range',gamma=0.99994)
        history = model.fit(x_train, Y_train, batch_size=512, epochs=epochs, 
                            validation_data=(x_val, Y_val), callbacks=[early_stop])
        
        pred_val_y = model.predict(x_val, batch_size=1024,)
        
        best_threshold = 0
        best_score = 0
        for threshold in [i * 0.01 for i in range(25,45)]:
            score = metrics.f1_score(Y_val, (pred_val_y>threshold).astype(int))
            if score > best_score:
                best_threshold = threshold
                best_score = score
        print('best score:%f,best threshold:%f'%(best_score, best_threshold))
        
        pred_test_y = model.predict(x_test, batch_size=1024,)
        
        train_meta[valid_idx] = pred_val_y
        test_meta += pred_test_y / kfolds

#     best_score, best_thresh = f1_smart(train_y, train_meta)
#     print(best_score, best_thresh)
    best_score1,best_thresh1 = threshold_search(train_y, train_meta)
    print(best_score1, best_thresh1)
    # best_score, best_thresh = threshold_search(np.squeeze(Y), train_meta)
#     print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(best_score, best_thresh))
#     print('mean_thresh: {:.4f} and mean_loss: {:.4f}'.format(np.mean(thresh), np.mean(loss)))
    test_meta = test_meta.reshape((-1, 1))
    pred_test_y = (test_meta > best_thresh1).astype(int)
    return pred_test_y

In [None]:
pred_test_y = train_cv()

In [None]:
submit = pd.read_csv("../input/sample_submission.csv")
submit['prediction'] = pred_test_y
submit.to_csv('submission.csv',index=False)

训练日志：
* Optimal F1: 0.6874 at threshold: 0.3162 - val_loss: 0.0965
* Optimal F1: 0.6879 at threshold: 0.3589- val_loss: 0.0965
* Optimal F1: 0.6945 at threshold: 0.3551 - val_loss: 0.0947
* Optimal F1: 0.6903 at threshold: 0.3207- val_loss: 0.0951
* Optimal F1: 0.6859 at threshold: 0.3639- val_loss: 0.0966
* 所有数据：Optimal F1: 0.6888 at threshold: 0.3612


### epoch = 7    batch_size = 512
best score:0.689091,best threshold:0.330000  
best score:0.694275,best threshold:0.430000  
best score:0.691881,best threshold:0.340000  
best score:0.691157,best threshold:0.410000  
best score:0.685592,best threshold:0.440000  
0.6893022238341134 0.37    

### epoch = 5 batch_size = 512
best score:0.692113,best threshold:0.440000  
best score:0.688417,best threshold:0.310000  
best score:0.693517,best threshold:0.320000  
best score:0.693974,best threshold:0.430000  
best score:0.687383,best threshold:0.430000  
0.6895025813324592 0.4  