### 1. What is the purpose of this experiment?
WordNetで各テキストのlemmaを取得し, そのlemmaのみで学習モデルを作ることで, 多様性を持ったモデルができるか確認する
### 2. Why do you do this?
WordNetで取得できる類語は, オリジナルテキストと違った特徴を持ち, モデルの多様性を作るのに有効なのではと思った.
### 3. Where are the points of technology and techniques?
nltk WordNet
### 4. How do you validate the effectiveness?
CV & LB  
lemmaのみのデータセットで精度はでない為、これによる各validationの予測値をstackして特徴量を作る。それをLightGBMとNNに追加し、CVを見る。まずはtrain test splitでOK.
### 5. What will you do next?
spacy entity

In [2]:
import numpy as np
import pandas as pd
import datetime
import sys
import re
import gc
import glob
import os
import time
from contextlib import contextmanager
from tqdm import tqdm

import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf
# Print whether the installed TensorFlow build was compiled with CUDA support.
print(tf.test.is_built_with_cuda())

# Widen the notebook display limits. The fully qualified "display." keys are
# used because the bare 'max_columns' / 'max_rows' patterns can match more
# than one option in newer pandas releases, which makes set_option raise.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

@contextmanager
def timer(name):
    """
    Context manager that reports the wall-clock duration of its ``with`` body.

    On normal exit it prints ``[<name>] done in <N> s`` with the elapsed
    seconds rounded to a whole number. Nothing is printed if the body raises.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

    
from multiprocessing import Pool
import multiprocessing
def pararell_process(func, arg_list, cpu_cnt=multiprocessing.cpu_count()):
    """
    Apply ``func`` to every element of ``arg_list`` in a pool of worker processes.

    Args:
        func: Picklable callable taking a single argument.
        arg_list: Iterable of arguments, one per call.
        cpu_cnt: Number of worker processes (defaults to the CPU count read
            when this function was defined).

    Returns:
        list: The results, in the same order as ``arg_list``.

    Raises:
        multiprocessing.TimeoutError: If the results are not ready within 600 s.
    """
    # Pool.map() returns a plain list, which has no .get(); map_async() returns
    # the AsyncResult whose .get(timeout) provides the intended 600 s limit.
    # The with-block terminates the workers even if a task raises or times out.
    with Pool(cpu_cnt) as pool:
        return pool.map_async(func, arg_list).get(600)
    

# Column names shared across the notebook, and the global random seed.
key = 'qid'
qt = 'question_text'
seed = 1208

with timer("Load Data"):
    # Bind the loaded frame so the timed read is actually usable downstream
    # instead of being thrown away.
    train = pd.read_csv('../input/train.csv')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


True


In [3]:
# Make the personal helper library under the home directory importable.
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
# NOTE: this import rebinds `pararell_process`, replacing the function of the
# same name defined earlier in this notebook with the one from utils.
from utils import logger_func, get_categorical_features, get_numeric_features, pararell_process
logger = logger_func()
# Load the training data from the pickle file(s) matching the glob pattern
# (presumably returns a DataFrame; verify against utils.read_df_pkl).
train = utils.read_df_pkl('../input/train*.p')

2018-11-17 22:24:52,443 utils 353 [INFO]    [logger_func] start 
100%|██████████| 3/3 [00:01<00:00,  2.45it/s]


### WordNet

In [9]:
import nltk
from nltk.corpus import wordnet as wn
from nltk import word_tokenize

def get_wordnet_similarity_path_wup(w1, w2):
    """
    Return the best WordNet path and Wu-Palmer similarity between two words.

    The synsets of ``w1`` and ``w2`` are paired positionally (first with
    first, second with second, ...) and the maximum score over those pairs is
    returned for each measure.
    NOTE(review): positional pairing skips most synset combinations; confirm
    whether an all-pairs comparison was intended.

    Args:
        w1: First word.
        w2: Second word.

    Returns:
        tuple: ``(max_path_similarity, max_wup_similarity)``. A component is
        ``np.nan`` when no synset pair yields a score for that measure.
    """
    path_val = []
    wup_val = []
    for syn1, syn2 in zip(wn.synsets(w1), wn.synsets(w2)):
        path_score = syn1.path_similarity(syn2)
        wup_score = syn1.wup_similarity(syn2)
        # The similarity methods may return None when no connecting path
        # exists; None cannot be compared by np.max, so it is skipped.
        if path_score is not None:
            path_val.append(path_score)
        if wup_score is not None:
            wup_val.append(wup_score)
    # np.max raises ValueError on an empty sequence, so fall back to NaN.
    best_path = np.max(path_val) if path_val else np.nan
    best_wup = np.max(wup_val) if wup_val else np.nan
    return best_path, best_wup


#========================================================================
# Get lemma List
#========================================================================
def get_wordnet_lemma(word):
    """
    Collect the WordNet lemma names of every synset of ``word``.

    Args:
        word: Word to look up.

    Returns:
        list: Unique lemma names, excluding ``word`` itself, in first-seen
        order. Empty when ``word`` has no synsets.
    """
    # Dict keys de-duplicate while keeping insertion order, so the result is
    # identical on every run. Iterating a set of str is not (string hash
    # randomization), which would make any seeded sampling of this list
    # irreproducible.
    lemmas = {}
    for synset in wn.synsets(word):
        lemmas.update(dict.fromkeys(synset.lemma_names()))
    lemmas.pop(word, None)
    return list(lemmas)


# First, collect every lemma for each question.
id_list = train[key]
qt_list = train[qt]

# Whether the question's own words are kept alongside their lemmas.
# It has to be bound before the workers run, otherwise the flag lookup inside
# pararell_get_lemma raises NameError.
original = False


def pararell_get_lemma(args):
    """
    Worker: gather WordNet lemmas for each whitespace-separated word of one question.

    Args:
        args: ``(uid, doc)`` pair - the question id and its text.

    Returns:
        dict: ``{uid: tokens}`` where ``tokens`` is the list of lemmas,
        preceded by the question's own words when the module-level
        ``original`` flag is truthy.
    """
    uid, doc = args[0], args[1]
    word_list = doc.split()
    lemma_list = []
    for word in word_list:
        lemma_list += get_wordnet_lemma(word)
    tokens = word_list + lemma_list if original else lemma_list
    return {uid: tokens}
        
with timer("Get lemma from Wordnet"):
    # One {qid: lemma list} dict per question, computed in worker processes.
    p_list = pararell_process(pararell_get_lemma, zip(id_list, qt_list))
with timer("Make lemma dataset"):
    # Merge the per-question dicts with a plain loop: a list comprehension
    # used only for its side effect would build a throwaway list of None.
    lemma_dict = {}
    for p in p_list:
        lemma_dict.update(p)

# Build a dataset that keeps only part of the collected lemmas.
# Sampling it with several seeds gives diverse predictions to stack.
def pararell_val_join(args):
    """
    Worker: sample half as many lemmas as were given and join them into one string.

    The draw uses ``np.random.choice`` after re-seeding NumPy's global RNG
    with the module-level ``seed``.
    NOTE(review): ``np.random.choice`` samples with replacement by default, so
    the result can contain duplicates; confirm that this is intended.

    Args:
        args: ``(uid, value)`` pair - the question id and its lemma list.

    Returns:
        dict: ``{uid: text}`` with the sampled lemmas joined by single spaces;
        ``text`` is empty when there are no lemmas.
    """
    uid = args[0]
    value = args[1]
    np.random.seed(seed)
    # Handle "no lemmas" explicitly rather than swallowing the ValueError that
    # np.random.choice can raise for an empty population.
    if len(value) == 0:
        return {uid: ""}
    sampled = np.random.choice(a=value, size=int(len(value) * 0.5))
    return {uid: " ".join(sampled)}


# Keep only the lemmas (not the question's own words) in the lemma dataset.
# The flag read by pararell_get_lemma is spelled `original`.
original = False

# lemma Sampling
with timer("lemma sampling"):
    # One {qid: sampled lemma text} dict per question, computed in worker processes.
    p_list = pararell_process(pararell_val_join, lemma_dict.items())
with timer("Make lemma dataset"):
    # Merge with a plain loop instead of a comprehension used for side effects.
    tmp_dict = {}
    for p in p_list:
        tmp_dict.update(p)

# Turn the sampled-lemma dict into a DataFrame.
with timer("Make lemma DF"):
    to_df_dict = tmp_dict
    # Name the sampled-text column directly instead of renaming column 0 later.
    tmp_train = pd.Series(to_df_dict, name=qt).to_frame()
    # Join the remaining training columns by question id, leaving out the
    # original question text: keeping it would give the frame two columns
    # named `qt`, and tmp_train[qt] would then be a 2-column DataFrame.
    tmp_train = tmp_train.join(train.set_index(key).drop(columns=[qt]))

[Make lemma DF] done in 5 s


In [4]:
#========================================================================
# Make Train Validation
# Tokenizer
#========================================================================

## some config values
embed_size = 300       # size of each word vector
# Current Best 30000
max_features = 50000   # number of unique words to use (rows of the embedding matrix)
maxlen = 100           # maximum number of words of a question to use

with timer("Make Train Vaidation Set & Tokenizer"):

    ## hold out 20% of the lemma dataset for validation
    train_df, val_df = train_test_split(tmp_train, test_size=0.2, random_state=seed)

    ## raw texts, with missing values replaced by a placeholder token
    train_X = train_df["question_text"].fillna("_na_").values
    val_X = val_df["question_text"].fillna("_na_").values

    ## fit the vocabulary on the training texts only
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))

    ## encode each text as word indices, then pad to maxlen
    ## (the test set is not encoded in this cell)
    train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
    val_X = pad_sequences(tokenizer.texts_to_sequences(val_X), maxlen=maxlen)

    ## targets aligned with the rows above
    train_y = train_df['target'].values
    val_y = val_df['target'].values

100%|██████████| 3/3 [00:00<00:00,  3.44it/s]


CPU times: user 2min 12s, sys: 1.12 s, total: 2min 13s
Wall time: 2min 13s


In [24]:
#========================================================================
# No PreTrain Model
#========================================================================
def no_pretrain_NN():
    """
    Train and evaluate a bidirectional LSTM classifier with a freshly initialised embedding.

    Reads the module-level ``train_X``/``train_y``/``val_X``/``val_y`` arrays
    and the ``maxlen``/``max_features``/``embed_size`` settings, fits for two
    epochs, then prints the validation F1 score for thresholds from 0.10 to
    0.50 in steps of 0.01. Returns nothing; the model is deleted afterwards.
    """
    with timer("Create No PreTrain Model"):
        tokens_in = Input(shape=(maxlen,))
        hidden = Embedding(max_features, embed_size)(tokens_in)
        # (a CuDNNGRU(64, return_sequences=True) layer was tried here as an alternative)
        hidden = Bidirectional(CuDNNLSTM(64, return_sequences=True))(hidden)
        hidden = GlobalMaxPool1D()(hidden)
        hidden = Dense(16, activation="relu")(hidden)
        hidden = Dropout(0.1)(hidden)
        hidden = Dense(1, activation="sigmoid")(hidden)
        model = Model(inputs=tokens_in, outputs=hidden)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        print(model.summary())

    with timer("Model Fitting"):
        ## Train the model
        model.fit(
            train_X,
            train_y,
            batch_size=512,
            epochs=2,
            validation_data=(val_X, val_y),
        )

    with timer("Prediction & Get F1 score"):
        pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
        for raw_thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(raw_thresh, 2)
            predicted = (pred_noemb_val_y > thresh).astype(int)
            score = metrics.f1_score(val_y, predicted)
            print("F1 score at threshold {0} is {1}".format(thresh, score))
        # Drop the model and its graph tensors, then pause before the next step.
        del model, tokens_in, hidden
        import gc
        gc.collect()
        time.sleep(10)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 100, 300)          6000000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100, 128)          187392    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                2064      
_________________________________________________________________
dropout_6 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 17        
Total para

In [5]:
#========================================================================
# Glove PreTrain Model
#========================================================================
def glove_pretrain_NN():
    """
    Train and evaluate a bidirectional GRU classifier initialised with GloVe vectors.

    Reads the module-level ``tokenizer``, the ``train_X``/``train_y``/
    ``val_X``/``val_y`` arrays and the ``maxlen``/``max_features`` settings.
    Words missing from the GloVe file keep a random vector drawn from a normal
    distribution with the mean and standard deviation of all GloVe values.
    Fits for two epochs, then prints the validation F1 score for thresholds
    from 0.10 to 0.50 in steps of 0.01. Returns nothing.
    """
    with timer("Get Glove PreTrain Grad"):
        EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

        def get_coefs(word, *arr):
            return word, np.asarray(arr, dtype='float32')

        # Close the file deterministically and decode it as UTF-8 regardless
        # of the platform's default encoding.
        with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

        # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean, emb_std = all_embs.mean(), all_embs.std()
        embed_size = all_embs.shape[1]

        word_index = tokenizer.word_index
        # Tokenizer word indices start at 1, so a vocabulary of N words needs
        # N + 1 rows; capping at max_features keeps the usual-case size unchanged.
        nb_words = min(max_features, len(word_index) + 1)
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        for word, i in word_index.items():
            # Compare against the actual row count so a small vocabulary
            # cannot index past the end of the matrix.
            if i >= nb_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    with timer("Create Glove PreTrain Model"):
        inp = Input(shape=(maxlen,))
        # The layer's input dimension must equal the number of rows of the weights.
        x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
        x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
        x = GlobalMaxPool1D()(x)
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        print(model.summary())

    with timer("Model Fitting"):
        model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

    with timer("Prediction & Get F1 score"):
        pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
        for thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          9000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [None]:
with timer("Make Train Vaidation Set & Tokenizer"):

    ## fill up the missing values
    train_X = tmp_train["question_text"].fillna("_na_").values

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    # test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)
    # test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    # Taken from tmp_train (not the earlier 80% split train_df) so the targets
    # line up row-for-row with train_X built above.
    train_y = tmp_train['target'].values

    # KFold
    # NOTE(review): fold_type and fold are not defined in the visible part of
    # this notebook; they must be set before this cell runs.
    from sklearn.model_selection import StratifiedKFold
    if fold_type == 'stratified':
        folds = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)  # 1
        kfold = folds.split(train_X, train_y)
    else:
        # Fail with a clear message instead of a NameError on `kfold` below.
        raise ValueError(f"unsupported fold_type: {fold_type!r}")

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        # Index with the loop's own variable names (trn_idx / val_idx).
        x_train, x_val = train_X[trn_idx], train_X[val_idx]
        y_train, y_val = train_y[trn_idx], train_y[val_idx]