# Imports

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import bert_tokenization as tokenization
import tensorflow.keras.backend as K
from tensorflow import keras
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from tensorflow.keras.models import load_model
import gc

np.set_printoptions(suppress=True)

# Setup

## Setup Bert

In [2]:
BERT_PATH = '../data/bert_en_uncased_L-12_H-768_A-12/'

In [3]:
tokenizer = tokenization.FullTokenizer(BERT_PATH+'assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

## Load Data

In [4]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
df_sub = pd.read_csv('../data/sample_submission.csv')

In [5]:
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

train shape = (6079, 41)
test shape = (476, 11)


In [6]:
output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1,2,5]])

In [7]:
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)


output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer']


# Modules

## Padding Tokens (input_masks)

In [8]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

## Segment Tokens by [SEP] (input_segments)

In [9]:
def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

## Convert Tokens to IDs by Tokenizer (input_ids)

In [10]:
def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

## Trim Tokens

In [11]:
def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

## Conver to Bert inputs

In [12]:
def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for BERT"""
    
    stoken = ["[CLS]"] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

## Compute input array

In [13]:
def compute_input_arays(df, columns, tokenizer, max_sequence_length):
    input_ids, input_masks, input_segments = [], [], []
    for _, instance in tqdm(df[columns].iterrows()):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        t, q, a = _trim_input(t, q, a, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
        
    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]

## Compute output array

In [14]:
def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

## Compute Spearman's rank correlation coefficient

In [15]:
def compute_spearmanr(trues, preds):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        rhos.append(
            spearmanr(col_trues, col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])).correlation)
    return np.mean(rhos)

# Modeling

## Callback

In [16]:
class CustomCallback(tf.keras.callbacks.Callback):
    
    def __init__(self, valid_data, test_data, batch_size=16, fold=None):

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data
        
        self.batch_size = batch_size
        self.fold = fold
        
    def on_train_begin(self, logs={}):
        self.valid_predictions = []
        self.test_predictions = []
        
    def on_epoch_end(self, epoch, logs={}):
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))
        
        rho_val = compute_spearmanr(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))
        
        print("\nvalidation rho: %.4f" % rho_val)
        
        if self.fold is not None:
            self.model.save_weights(f'bert-base-{fold}-{epoch}.h5py')
        
        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size)
        )

## Create Model

In [17]:
def bert_model():
    
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    _, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])
    
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments], outputs=out)
    
    return model    

In [31]:
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold, times):
        
    custom_callback = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]), 
        test_data=test_data,
        batch_size=batch_size,
        fold=None)
    save_callback = keras.callbacks.ModelCheckpoint(
        filepath='../saved_models/hist_'+str(times), monitor='loss',
        verbose=0, save_best_only=True,
        save_weights_only=False, mode='auto', period=1)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=optimizer)
    model.fit(train_data[0], train_data[1], epochs=epochs, 
              batch_size=batch_size, callbacks=[custom_callback, save_callback])
    
    return custom_callback, model

In [19]:
model = bert_model()

In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_masks[0][0]            

## Input & Output

In [21]:
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_arays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

6079it [00:24, 244.38it/s]
476it [00:02, 235.06it/s]


In [22]:
np.array(inputs).shape

(3, 6079, 512)

## Train

In [41]:
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

AttributeError: module 'tensorflow_core.keras.backend' has no attribute 'set_session'

In [23]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [24]:
gkf = GroupKFold(n_splits=10).split(X=df_train.question_body, groups=df_train.question_body)

In [25]:
# histories = []
models = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    
    # will actually only do 3 folds (out of 5) to manage < 2h
    if fold < 1:
        K.clear_session()
        
#         strategy = tf.distribute.MirroredStrategy()
#         with strategy.scope():
        print('''
        FOLD {}
        '''.format(fold+1))
        
        model = bert_model()

        train_inputs = [inputs[i][train_idx] for i in range(3)]
        train_outputs = outputs[train_idx]

        valid_inputs = [inputs[i][valid_idx] for i in range(3)]
        valid_outputs = outputs[valid_idx]

            # history contains two lists of valid and test preds respectively:
            #  [valid_predictions_{fold}, test_predictions_{fold}]


        history, model = train_and_predict(model, 
                                train_data=(train_inputs, train_outputs), 
                                valid_data=(valid_inputs, valid_outputs),
                                test_data=test_inputs, 
                                learning_rate=3e-5, epochs=5, batch_size=8,
                                loss_function='binary_crossentropy', fold=fold, times=fold+1)

#         histories.append(history)
        models.append(model)
        del history, model, train_inputs, train_outputs
        gc.collect()


        FOLD 1
        
Train on 5471 samples
Epoch 1/5

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)



validation rho: nan
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../saved_models/hist_1/assets
Epoch 2/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_1/assets
Epoch 3/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_1/assets
Epoch 4/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_1/assets
Epoch 5/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_1/assets

        FOLD 2
        
Train on 5471 samples
Epoch 1/5
validation rho: 0.3589
INFO:tensorflow:Assets written to: ../saved_models/hist_2/assets
Epoch 2/5
validation rho: 0.3835
INFO:tensorflow:Assets written to: ../saved_models/hist_2/assets
Epoch 3/5
validation rho: 0.3922
INFO:tensorflow:Assets written to: ../saved_models/hist_2/assets
Epoch 4/5
validation rho: 0.3937
INFO:tensorflow:Assets written to: ../saved_models/hist_2/assets
Epoch 5/5
vali

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)



validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_3/assets
Epoch 2/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_3/assets
Epoch 3/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_3/assets
Epoch 4/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_3/assets
Epoch 5/5
validation rho: nan
INFO:tensorflow:Assets written to: ../saved_models/hist_3/assets

        FOLD 4
        
Train on 5471 samples
Epoch 1/5
validation rho: 0.3415
INFO:tensorflow:Assets written to: ../saved_models/hist_4/assets
Epoch 2/5
validation rho: 0.3687
INFO:tensorflow:Assets written to: ../saved_models/hist_4/assets
Epoch 3/5
validation rho: 0.3768
INFO:tensorflow:Assets written to: ../saved_models/hist_4/assets
Epoch 4/5
validation rho: 0.3789
INFO:tensorflow:Assets written to: ../saved_models/hist_4/assets
Epoch 5/5
validation rho: 0.3796
INFO:tensorflow:Assets written to: ../saved_models/hist_4/asse

ResourceExhaustedError:  OOM when allocating tensor with shape[8,12,512,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert_model/StatefulPartitionedCall/encoder/layer_9/self_attention/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_distributed_function_498087]

Function call stack:
distributed_function


In [26]:
model  = load_model('../saved_models/hist_1/')

In [30]:
histories[0].

[array([[0.93885964, 0.7017418 , 0.18100294, ..., 0.06279597, 0.7723955 ,
         0.9296403 ],
        [0.8754618 , 0.47726217, 0.00758111, ..., 0.10243189, 0.2847627 ,
         0.8796195 ],
        [0.91833085, 0.69713837, 0.02619708, ..., 0.05620596, 0.8415642 ,
         0.92469203],
        ...,
        [0.8858343 , 0.47375554, 0.02602929, ..., 0.09295341, 0.6077235 ,
         0.90285444],
        [0.88769686, 0.7654049 , 0.02044135, ..., 0.08904704, 0.8563958 ,
         0.9194747 ],
        [0.8934441 , 0.6397373 , 0.0095886 , ..., 0.07139063, 0.16714466,
         0.85258466]], dtype=float32),
 array([[0.9632395 , 0.7328137 , 0.32531688, ..., 0.05174953, 0.6931189 ,
         0.92743355],
        [0.9090768 , 0.47751898, 0.00903359, ..., 0.14739463, 0.08779311,
         0.88616794],
        [0.9449729 , 0.6153854 , 0.04871514, ..., 0.03700495, 0.7955936 ,
         0.9197533 ],
        ...,
        [0.87779236, 0.35077924, 0.01508176, ..., 0.11516279, 0.48663783,
         0.89601076