In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Bidirectional, CuDNNGRU, Dense, Embedding, Input, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras.engine.topology import Layer

from learningrate import CyclicLR

DATADIR = Path('../input')
EMBEDDINGDIR = DATADIR / 'embeddings'

TRAIN_CSV = DATADIR / 'train.csv'
TEST_CSV = DATADIR / 'test.csv'

Using TensorFlow backend.


## 前処理
- NaNを`_##_`で埋める
- 文書をfitすることでベクトル化
- 長さが足りていない文書の単語を空白で前から埋める

In [3]:
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use

def load_and_prec(max_features: int, maxlen: int) -> pd.DataFrame:
    """Get preprocessed train and test data as pd.DataFrame and tokuenizer used in the preprocess"""
    train = pd.read_csv("../input/train.csv")
    test = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    ## fill up the missing values
    train_X = train["question_text"].fillna("_##_").values
    test_X = test["question_text"].fillna("_##_").values

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train['target'].values
    
    #shuffling the data
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    
    return train_X, test_X, train_y, tokenizer.word_index

In [4]:
# tokenizer.word_indexが引数(文書に出て来る単語の辞書みたいなイメージ)
def load_glove(word_index):
    EMBEDDING_FILE = EMBEDDINGDIR / 'glove.840B.300d' / 'glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
 
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
 
    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
             
    return embedding_matrix
     
def load_fasttext(word_index):   
    EMBEDDING_FILE = EMBEDDINGDIR / 'wiki-news-300d-1M' / 'wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)
 
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
 
    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
 
    return embedding_matrix
 
def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)
 
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
 
    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
     
    return embedding_matrix

## モデル構築
- kerasのf1scoreはrecallとprecisionの調和平均だが、kerasのmetricにf1score入れるとバッチ毎にf1scoreを計算してその算術平均を取ってしまうため新たに定義
- 参考:[KerasでF1スコアをmetircsに入れる際は要注意](https://qiita.com/koshian2/items/81abfc0a75ea99f726b9)
- トピックごとDropuotを使うための`SpatialDropout1D`
- テキストの前後から系列を学習する`Bidirectional`

In [5]:
class Attention(Layer):
    """
    https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb
    """
    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, 
                 W_constraint=None, b_constraint=None, bias=True, **kwargs):
        
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],), initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],), initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [None]:
def f1(y_true, y_pred):
    """f1 measure. Refer to this article -> https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras"""
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [4]:
def GRU_Atten(input_size: int, embedding_matrix: np.ndarray) -> keras.models.Model:
    """
    Args:
        input_size (int): input size
        embedding_matrix (np.ndarray): A ndarray.
    Returns:
        model (keras.models.Model): built and compiled keras model object.
    """
    
    embed_size = 300 # how big is each word vector
    
    def build_input_layer(input_size):
        """build input layer"""
        return Input(shape=input_size, name='input')
    
    def build_embedding_layer(input_dim, output_dim, weights, input_length, trainable=False):
        """build embedding layer"""
        return Embedding(input_dim=imput_dim, output_dim=output_dim, weights=[embeddings], input_length=input_length, trainable=False)
    
    
    inp = built_input_later(input_size)
    
    embedding_layer = build_embedding_later(len(embedding_matrix), embed_size, weights=[embedding_matrix], input_length=input_size, trainable=False)
    x = embedding_layer(x)
    
    x = SpatialDropout1D(0.1)(x)
    x = Bidirectional(CuDNNGRU(40, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(16, activation="relu")(x)
    x = Dropout(0.1)(x)
    outp = Dense(1, activation="sigmoid")(x)    

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    
    return model

NameError: name 'keras' is not defined

## 学習と予測
### 手順
- 1epochごとに学習する。出力はprob。
- 出力されたprobaに対して正解との比較で適切なthresholdを見つけてくる

### 工夫している点
- 1epochごとに結果を出力する`for e in range(epochs)`
- 複数のword2vecでベクトル化して平均を取ることでロバストなベクトルを得ることも可能
- CyclicLRの自作関数をcallbackに渡してる。上限学習率と下限学習率を決め、その間をバッチ毎に上昇または減少させる。

In [None]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def train_pred(model, train_X, train_y, val_X, val_y, epochs=2, callback=None):
    for e in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y), callbacks = callback, verbose=0)
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

        best_score = metrics.f1_score(val_y, (pred_val_y > 0.33).astype(int))
        print("Epoch: ", e, "-    Val F1 Score: {:.4f}".format(best_score))

    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    print('=' * 60)
    return pred_val_y, pred_test_y, best_score


def threshold_search(y_true, y_proba):
    """
    https://www.kaggle.com/ryanzhang/tfidf-naivebayes-logreg-baseline
    """
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result

## Mainパート
### データロードと埋め込み

In [None]:
train_X, test_X, train_y, word_index = load_and_prec()

embedding_matrix_1 = load_glove(word_index)
# embedding_matrix_2 = load_fasttext(word_index)
# embedding_matrix_3 = load_para(word_index)

embedding_matrix = np.mean([embedding_matrix_1], axis = 0)

### 学習

In [None]:
DATA_SPLIT_SEED = 2018
clr = CyclicLR(base_lr=0.001, max_lr=0.002,
               step_size=300., mode='exp_range',
               gamma=0.99994)

train_meta = np.zeros(train_y.shape)
test_meta = np.zeros(test_X.shape[0])
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=DATA_SPLIT_SEED).split(train_X, train_y))
for idx, (train_idx, valid_idx) in enumerate(splits):
        X_train = train_X[train_idx]
        y_train = train_y[train_idx]
        X_val = train_X[valid_idx]
        y_val = train_y[valid_idx]
        model = model_lstm_atten(embedding_matrix)
        pred_val_y, pred_test_y, best_score = train_pred(model, X_train, y_train, X_val, y_val, epochs = 8, callback = [clr,])
        train_meta[valid_idx] = pred_val_y.reshape(-1)
        test_meta += pred_test_y.reshape(-1) / len(splits)

f1_score(y_true=train_y, y_pred=train_meta > 0.33)

### 提出用ファイルの作成

In [None]:
sub = pd.read_csv('../input/sample_submission.csv')
sub.prediction = test_meta > 0.33
sub.to_csv("submission.csv", index=False)