In [1]:
import numpy as np
import pandas as pd
import datetime
import sys
import re
import gc
import os
import time
from contextlib import contextmanager
from tqdm import tqdm

import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf
# Report whether the installed TensorFlow build was compiled with CUDA support
# (the CuDNN* layers imported above are used by the model defined below).
print(tf.test.is_built_with_cuda())

# Raise pandas' display limits so wide/long frames are not truncated in output.
# The fully-qualified "display." keys are required: the bare 'max_columns' /
# 'max_rows' patterns also match the styler.render.* options in newer pandas
# releases, which makes set_option raise OptionError (ambiguous pattern).
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

@contextmanager
def timer(name):
    """
    Context manager that prints how long the wrapped block took,
    rounded to whole seconds, labelled with `name`.

    Credit: Konstantin Lopuhin https://www.kaggle.com/lopuhin
    from the script "Mercari Golf: 0.3875 CV in 75 LOC, 1900 s"
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started = time.time()
    yield
    # Reached only when the wrapped block finishes without raising.
    elapsed = time.time() - started
    print(f'[{name}] done in {elapsed:.0f} s')

    
# Parallel
from multiprocessing import Pool
import multiprocessing
def pararell_process(func, arg_list, cpu_cnt=multiprocessing.cpu_count()):
    """Apply ``func`` to every item of ``arg_list`` using a pool of worker processes.

    Args:
        func: Callable applied to each element. It is handed to ``Pool.map``,
            so it must be usable from worker processes.
        arg_list: Iterable of arguments, one per call.
        cpu_cnt: Number of worker processes. The default is evaluated once,
            when this function is defined.

    Returns:
        list: The results, in the same order as ``arg_list``.

    Raises:
        Any exception raised by ``func`` in a worker is re-raised here by
        ``Pool.map``.
    """
    # The context manager terminates the pool on exit, including when map()
    # raises, so worker processes are not left behind on failure.
    with Pool(cpu_cnt) as pool:
        return pool.map(func, arg_list)
    
# Global Variables
key = 'qid'            # presumably the id column name - not used in the visible cells, confirm
qt = 'question_text'   # name of the text column read by the cells below
seed = 1208            # random_state shared by the train/validation split and the K-fold splitter

# Load Data
with timer("Load Data"):
    # Keep the loaded frame: the following cells read it as `tmp_train`.
    # Previously the result of read_csv was discarded.
    tmp_train = pd.read_csv('../input/train.csv')

2018-11-17 11:06:45,902 utils 353 [INFO]    [logger_func] start 


In [4]:
#========================================================================
# Train / validation split and tokenization
#========================================================================

# Text-representation settings
embed_size = 300      # length of each word vector
# Current Best 30000
max_features = 50000  # vocabulary cap (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per question

with timer("Make Train Vaidation Set & Tokenizer"):

    # Hold out 20% of the rows for validation.
    train_df, val_df = train_test_split(tmp_train, test_size=0.2, random_state=seed)

    # Target values for both splits.
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    # Raw question strings; missing entries become the "_na_" placeholder.
    train_X = train_df["question_text"].fillna("_na_").values
    val_X = val_df["question_text"].fillna("_na_").values

    # The vocabulary is learned from the training split only.
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))

    # Strings -> integer sequences -> arrays padded/truncated to `maxlen`.
    # (No test set is processed in this cell.)
    train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
    val_X = pad_sequences(tokenizer.texts_to_sequences(val_X), maxlen=maxlen)

100%|██████████| 3/3 [00:00<00:00,  3.44it/s]


CPU times: user 2min 12s, sys: 1.12 s, total: 2min 13s
Wall time: 2min 13s


In [1]:
#========================================================================
# No PreTrain Model
#========================================================================
def no_pretrain_NN():
    """Train a bidirectional LSTM classifier without pretrained embeddings and print F1 scores.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size``,
    ``train_X``, ``train_y``, ``val_X`` and ``val_y``.

    The network is Embedding -> Bidirectional(CuDNNLSTM(64)) -> global max
    pooling -> Dense(16, relu) -> Dropout(0.1) -> Dense(1, sigmoid). No
    weights are passed to the Embedding layer, so it is learned during
    training. The model is fitted for 2 epochs with batch size 512, then the
    validation F1 score is printed for thresholds 0.10 to 0.50 in steps of 0.01.

    Returns:
        None. All results are printed.
    """
    with timer("Create No PreTrain Model"):
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embed_size)(inp)
        # Alternative recurrent layer: Bidirectional(CuDNNGRU(64, return_sequences=True))
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        x = GlobalMaxPool1D()(x)
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # summary() prints the table itself and returns None; wrapping it in
        # print() only appended a stray "None" line to the output.
        model.summary()

    with timer("Model Fitting"):
        ## Train the model
        model.fit(train_X, train_y, batch_size=512, epochs=2,
                  validation_data=(val_X, val_y)
                 )

    with timer("Prediction & Get F1 score"):
        pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
        # Sweep the decision threshold; np.round removes float noise from arange
        # so the printed threshold is exact to two decimals.
        for thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))
        # Drop the references to the model and its tensors, then force a collection.
        # `gc` is already imported at the top of the file.
        del model, inp, x
        gc.collect()
        # NOTE(review): presumably a pause to let resources be released before
        # the next model is built - confirm it is still needed.
        time.sleep(10)

In [25]:
#========================================================================
# Cross Validation
#========================================================================
with timer("Make Train Vaidation Set & Tokenizer"):

    ## fill up the missing values
    train_X = tmp_train["question_text"].fillna("_na_").values

    ## Tokenize the sentences
    # NOTE(review): the vocabulary is fitted on every row before the folds are
    # created, so each validation fold has contributed to the vocabulary.
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)

    ## Pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)

    ## Get the target values
    # Labels must come from the same frame as train_X (tmp_train). Taking them
    # from train_df (the 80% split made earlier) gave arrays of different length.
    train_y = tmp_train['target'].values

    # KFold
    from sklearn.model_selection import StratifiedKFold
    # `fold_type` and `fold` are not defined in this cell; they are expected to
    # have been set earlier in the notebook.
    if fold_type == 'stratified':
        folds = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)
        kfold = folds.split(train_X, train_y)
    else:
        # Fail with a clear message instead of a NameError on `kfold` below.
        raise ValueError(f"Unsupported fold_type: {fold_type!r}")

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        # Use the index arrays bound by the loop (previously the undefined
        # name `train_idx` was used here).
        x_train, x_val = train_X[trn_idx], train_X[val_idx]
        y_train, y_val = train_y[trn_idx], train_y[val_idx]

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2
Epoch 2/2
