In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import sys
import gc
import glob
import os
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from utils import logger_func, pararell_process
logger = logger_func()
# Use fully-qualified option names: the bare 'max_columns' / 'max_rows' patterns
# are matched as regexes and become ambiguous (OptionError) in pandas versions
# that also define styler.render.max_columns / styler.render.max_rows.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

import os
import time
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

2018-11-08 23:41:42,790 utils 353 [INFO]    [logger_func] start 
2018-11-08 23:41:42,790 utils 353 [INFO]    [logger_func] start 
Using TensorFlow backend.


In [3]:
# Load the pickled train / test frames through the project helper.
# NOTE(review): read_df_pkl presumably expands the glob and combines the matching
# pickles into one frame -- confirm in utils.
train, test = (
    utils.read_df_pkl(pattern)
    for pattern in ('../input/train*.p', '../input/test*.p')
)

100%|██████████| 3/3 [00:00<00:00,  7.32it/s]
100%|██████████| 3/3 [00:00<00:00, 182.63it/s]


In [4]:
## Hold out 10% of the training frame for validation.
## NOTE: this rebinds `train`, so running the cell twice splits the already
## reduced frame again -- restart the kernel before re-running it.
train, valid = train_test_split(train, test_size=0.1, random_state=2018)

## Shared configuration for every model below
embed_size = 300      # how big is each word vector
max_features = 50000  # how many unique words to use (i.e. num rows in embedding matrix)
maxlen = 100          # max number of words in a question to use


def _question_texts(frame):
    """Return the question_text column as an array with missing values set to "_na_"."""
    return frame["question_text"].fillna("_na_").values


raw_train, raw_val, raw_test = (_question_texts(f) for f in (train, valid, test))

## The tokenizer vocabulary is learned from the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(raw_train))


def _encode(texts):
    """Map texts to integer sequences and pad them to `maxlen` columns."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


x_train = _encode(raw_train)
x_val = _encode(raw_val)
x_test = _encode(raw_test)

## Target values for the two labelled splits
y_train = train["target"].values
y_val = valid["target"].values

In [5]:
# Baseline network: embeddings learned from scratch (no pretrained weights),
# a bidirectional LSTM, max-pooling over time, and a small dense head that
# emits one sigmoid probability per question.
inp = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          186880    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [6]:
## Fit the baseline for two epochs; the held-out split is passed as validation_data
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=512,
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcc9749f278>

In [10]:
# Score the validation split with the baseline model. This line had been
# commented out, which left y_pred_noemb_val undefined on a fresh kernel
# (Restart & Run All raised NameError in the loop below).
y_pred_noemb_val = model.predict([x_val], batch_size=1024, verbose=1)

# Report F1 for decision thresholds 0.10 .. 0.50 in steps of 0.01.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # tidy float noise such as 0.15000000000000002
    print(f"F1 score at threshold {thresh} is {metrics.f1_score(y_val, (y_pred_noemb_val>thresh).astype(int))}")

F1 score at threshold 0.1 is 0.5720461095100865
F1 score at threshold 0.11 is 0.5823106915657706
F1 score at threshold 0.12 is 0.5894745677354373
F1 score at threshold 0.13 is 0.5971300064253587
F1 score at threshold 0.14 is 0.6041939711664481
F1 score at threshold 0.15 is 0.6102418207681365
F1 score at threshold 0.16 is 0.6162596488060308
F1 score at threshold 0.17 is 0.6210405684162272
F1 score at threshold 0.18 is 0.6264498998462756
F1 score at threshold 0.19 is 0.6312355194099012
F1 score at threshold 0.2 is 0.6342140532261153
F1 score at threshold 0.21 is 0.6373275236020334
F1 score at threshold 0.22 is 0.6390086206896551
F1 score at threshold 0.23 is 0.6415374708999951
F1 score at threshold 0.24 is 0.6442408770085599
F1 score at threshold 0.25 is 0.6455171018012548
F1 score at threshold 0.26 is 0.6473385488571868
F1 score at threshold 0.27 is 0.6476239669421489
F1 score at threshold 0.28 is 0.6492720346501071
F1 score at threshold 0.29 is 0.6501867143533373
F1 score at threshold 

In [11]:
# Test-set probabilities from the baseline model (not part of the final blend).
y_pred_noemb_test = model.predict(
    [x_test],
    verbose=1,
    batch_size=1024,
)



In [12]:
# Drop the baseline model and its graph tensors before building the next model,
# then force a collection; the cell displays the value returned by gc.collect().
del model
del inp, x
gc.collect()

3558

In [26]:
EMBEDDING_FILE = "../model/glove.840B.300d/glove.840B.300d.txt"


def get_coefs(word, *arr):
    """Turn one split embedding line into a (token, vector) pair.

    Called as get_coefs(*fields): the first field binds to `word`, and `*arr`
    in the signature gathers every remaining field into a tuple (in a call,
    `*` unpacks; in a definition, it packs the extra positional arguments).

    Returns:
        (word, numpy float32 array built from the remaining fields).
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {token: float32 vector}. Each line is split on
# single spaces: the first field is the token, the rest are the coefficients.
# The context manager closes the file as soon as the dict is built.
with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in tqdm(emb_file))

# np.stack requires a real sequence; passing a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]  # length of one word vector

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V
# words needs V + 1 rows; the cap at max_features is unchanged.
nb_words = min(max_features, len(word_index) + 1)
# np.random.normal(loc, scale, size): words missing from the embedding file keep
# a random vector drawn with the same mean / std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue  # id falls outside the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Same architecture as the baseline, with the embedding layer initialised from
# the matrix above; input_dim follows the matrix so the two always agree.
inp = Input(shape=(maxlen, ))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table and returns None; print() around it only added "None".
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          186880    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [27]:
# Fit the GloVe-initialised model for two epochs with the held-out split as validation_data
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=512,
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcb43fe0d30>

In [30]:
# Validation probabilities from the GloVe-initialised model.
y_pred_glove_val = model.predict([x_val], batch_size=1024, verbose=1)

# Report F1 for decision thresholds 0.10 .. 0.50 in steps of 0.01.
# The message now reads "is" (it was the typo "i"), matching the other scan cells.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print(f"F1 score at threshold {thresh} is {metrics.f1_score(y_val, (y_pred_glove_val>thresh).astype(int))}")

F1 score at threshold 0.1 i 0.5925895864615884
F1 score at threshold 0.11 i 0.6024676997216567
F1 score at threshold 0.12 i 0.6095173524301877
F1 score at threshold 0.13 i 0.6171058315334773
F1 score at threshold 0.14 i 0.6242632180874461
F1 score at threshold 0.15 i 0.6307128163849387
F1 score at threshold 0.16 i 0.6356328625278168
F1 score at threshold 0.17 i 0.6399705422074933
F1 score at threshold 0.18 i 0.6434766390002798
F1 score at threshold 0.19 i 0.6477901205388797
F1 score at threshold 0.2 i 0.6515412598123683
F1 score at threshold 0.21 i 0.6544416489258758
F1 score at threshold 0.22 i 0.6576320939334638
F1 score at threshold 0.23 i 0.6602316602316602
F1 score at threshold 0.24 i 0.6623701039168665
F1 score at threshold 0.25 i 0.664276697803585
F1 score at threshold 0.26 i 0.6670067853680934
F1 score at threshold 0.27 i 0.6683484349258649
F1 score at threshold 0.28 i 0.6697500389752118
F1 score at threshold 0.29 i 0.6719127059070401
F1 score at threshold 0.3 i 0.6725214263040

In [29]:
# Test-set probabilities from the GloVe-initialised model (used in the final blend).
y_pred_glove_test = model.predict(
    x_test,
    verbose=1,
    batch_size=1024,
)



In [34]:
# Release the GloVe lookup tables and the model before loading the next embedding.
# The dict is named `embeddings_index`; the previous `embedding_index` spelling
# raised NameError on a fresh kernel and left the large dict alive.
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
gc.collect()

2559

In [None]:
EMBEDDING_FILE = "../model/wiki_news-300d-1M/wiki-news-300d-1M.vec"


def get_coefs(word, *arr):
    """Turn one split embedding line into a (token, vector) pair.

    Called as get_coefs(*fields): the first field binds to `word`, and `*arr`
    in the signature gathers every remaining field into a tuple (in a call,
    `*` unpacks; in a definition, it packs the extra positional arguments).

    Returns:
        (word, numpy float32 array built from the remaining fields).
    """
    coefficients = np.asarray(arr, dtype='float32')
    return word, coefficients
# Parse the embedding file into {token: float32 vector}. Lines of 100 characters
# or fewer are skipped -- presumably to drop the short header line of the .vec
# format (TODO confirm against the file). The context manager closes the file
# as soon as the dict is built.
with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in tqdm(emb_file) if len(o)>100)

# np.stack requires a real sequence; passing a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]  # number of columns, i.e. length of one word vector

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V
# words needs V + 1 rows; the cap at max_features is unchanged.
nb_words = min(max_features, len(word_index) + 1)
# np.random.normal(loc, scale, size): words missing from the embedding file keep
# a random vector drawn with the same mean / std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue  # id falls outside the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Same architecture as the other models, with the embedding layer initialised
# from the matrix above; input_dim follows the matrix so the two always agree.
inp = Input(shape=(maxlen, ))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Fit the fastText-initialised model for two epochs with the held-out split as validation_data
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=512,
)

In [None]:
# Validation probabilities from the fastText-initialised model.
y_pred_fasttext_val = model.predict([x_val], batch_size=1024, verbose=1)

# Report F1 for decision thresholds 0.10 .. 0.50 in steps of 0.01.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    hard_labels = (y_pred_fasttext_val>thresh).astype(int)
    f1_at_thresh = metrics.f1_score(y_val, hard_labels)
    print(f"F1 score at threshold {thresh} is {f1_at_thresh}")

In [None]:
# Test-set probabilities from the fastText-initialised model (used in the final blend).
y_pred_fasttext_test = model.predict(
    [x_test],
    verbose=1,
    batch_size=1024,
)

In [None]:
# Release the fastText lookup tables and the model before loading the next
# embedding. gc is already imported in the first cell, so the redundant
# mid-notebook `import gc` is dropped to match the other cleanup cells.
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
gc.collect()
time.sleep(10)  # short pause after the collection before the next large load

In [None]:
EMBEDDING_FILE = "../model/paragram_300_sl999/paragram_300_sl999.txt"


def get_coefs(word, *arr):
    """Turn one split embedding line into a (token, vector) pair.

    Called as get_coefs(*fields): the first field binds to `word`, and `*arr`
    in the signature gathers every remaining field into a tuple (in a call,
    `*` unpacks; in a definition, it packs the extra positional arguments).

    Returns:
        (word, numpy float32 array built from the remaining fields).
    """
    values = np.asarray(arr, dtype='float32')
    return word, values
# Parse the embedding file into {token: float32 vector}. Lines of 100 characters
# or fewer are skipped (too short to be a full embedding row). The context
# manager closes the file as soon as the dict is built.
# NOTE(review): this file reportedly contains bytes that are not valid UTF-8;
# errors="ignore" drops them instead of raising UnicodeDecodeError -- confirm
# against the actual file.
with open(EMBEDDING_FILE, encoding="utf8", errors="ignore") as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in tqdm(emb_file) if len(o)>100)

# np.stack requires a real sequence; passing a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]  # number of columns, i.e. length of one word vector

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V
# words needs V + 1 rows; the cap at max_features is unchanged.
nb_words = min(max_features, len(word_index) + 1)
# np.random.normal(loc, scale, size): words missing from the embedding file keep
# a random vector drawn with the same mean / std as the pretrained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue  # id falls outside the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# Same architecture as the other models, with the embedding layer initialised
# from the matrix above; input_dim follows the matrix so the two always agree.
inp = Input(shape=(maxlen, ))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Fit the paragram-initialised model for two epochs with the held-out split as validation_data
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=512,
)

In [None]:
# Validation probabilities from the paragram-initialised model.
# verbose=1 (was the typo 11) matches every other predict call in the notebook.
y_pred_paragram_val = model.predict([x_val], batch_size=1024, verbose=1)

# Report F1 for decision thresholds 0.10 .. 0.50 in steps of 0.01.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print(f"F1 score at threshold {thresh} is {metrics.f1_score(y_val, (y_pred_paragram_val>thresh).astype(int))}")

In [None]:
# Test-set probabilities from the paragram-initialised model (used in the final blend).
y_pred_paragram_test = model.predict(
    [x_test],
    verbose=1,
    batch_size=1024,
)

In [None]:
# Release the paragram lookup tables, then the model and its graph tensors,
# force a collection, and pause for ten seconds.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
gc.collect()
time.sleep(10)

In [None]:
# Weighted average of the three pretrained-embedding models on the validation
# split. The paragram term is now multiplied by its 0.34 weight; it was
# previously written `0.34+y_pred_paragram_val`, which added a constant and the
# unweighted prediction, so every blended score exceeded 0.34 and the scan
# below did not match the blend used for the submission.
y_pred_val = 0.33*y_pred_glove_val + 0.33*y_pred_fasttext_val + 0.34*y_pred_paragram_val

# Report F1 for decision thresholds 0.10 .. 0.50 in steps of 0.01.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print(f"F1 score at threshold {thresh} is {metrics.f1_score(y_val, (y_pred_val>thresh).astype(int))}")

In [None]:
# Blend the three test-set predictions with the 0.33 / 0.33 / 0.34 weights,
# cut the blended probability at 0.35 to obtain 0/1 labels, and write the
# qid / prediction pairs to submission.csv.
blended_test = (
    0.33*y_pred_glove_test
    + 0.33*y_pred_fasttext_test
    + 0.34*y_pred_paragram_test
)
y_pred_test = (blended_test>0.35).astype(int)

out_df = pd.DataFrame({"qid":test["qid"].values})
out_df["prediction"] = y_pred_test
out_df.to_csv("submission.csv", index=False)