# Acknowledgements

Oiginal kernel: https://www.kaggle.com/akensert/bert-base-tf2-0-minimalistic

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil

np.set_printoptions(suppress=True)

#### 1. Read data and tokenizer

Read tokenizer and data, as well as defining the maximum sequence length that will be used for the input to Bert (maximum is usually 512 tokens)

In [2]:
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1,2,5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

train shape = (6079, 41)
test shape = (476, 11)

output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer']


In [3]:
data = pd.concat([df_train[['question_title','question_body','answer','url','category']],df_test[['question_title','question_body','answer','url','category']]],axis=0)

import re
from urllib.parse import urlparse

find = re.compile(r"^[^.]*")

data['subcategory'] = data['url'].apply(lambda x: re.findall(find, urlparse(x.lower()).netloc)[0])
data['category'] = data['category'].str.lower()
data = pd.get_dummies(data, columns=['subcategory','category'])
data.head(5)

Unnamed: 0,question_title,question_body,answer,url,subcategory_academia,subcategory_android,subcategory_anime,subcategory_apple,subcategory_askubuntu,subcategory_bicycles,...,subcategory_unix,subcategory_ux,subcategory_webapps,subcategory_webmasters,subcategory_wordpress,category_culture,category_life_arts,category_science,category_stackoverflow,category_technology
0,What am I losing when using extension tubes in...,After playing around with macro photography on...,"I just got extension tubes, so here's the skin...",http://photo.stackexchange.com/questions/9169/...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,It might be helpful to look into the definitio...,http://rpg.stackexchange.com/questions/47820/w...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Do you even need grooves? We make several pro...,http://electronics.stackexchange.com/questions...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...","Sending an ""affidavit"" it is a dispute between...",http://judaism.stackexchange.com/questions/551...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,Check out Image Trace in Adobe Illustrator. \n...,http://graphicdesign.stackexchange.com/questio...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
import gensim
from nltk.corpus import stopwords

from scipy.stats import skew, kurtosis, spearmanr
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')

from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer

import string

In [5]:
data['len_q1'] = data.question_title.apply(lambda x: len(str(x)))
data['len_q2'] = data.question_body.apply(lambda x: len(str(x)))
data['len_a'] = data.answer.apply(lambda x: len(str(x)))
data['diff_len_q'] = data.len_q2 - data.len_q1
data['diff_len_q_frac'] = data['diff_len_q']/data.len_q2

data['diff_len_a1'] = data.len_a - data.len_q1
data['diff_len_a2'] = data.len_a - data.len_q2
data['diff_len_frac_a2'] = data['diff_len_a2']/data['len_a']

data['len_word_q1'] = data.question_title.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question_body.apply(lambda x: len(str(x).split()))
data['len_word_frac_q2'] = data['len_word_q1']/data['len_word_q2']
data['len_word_a'] = data.answer.apply(lambda x: len(str(x).split()))
data['len_word_frac_a'] = data['len_word_q2']/data['len_word_a']

data['common_words_q'] = data.apply(lambda x: len(set(str(x['question_title']).lower().split()).intersection(set(str(x['question_body']).lower().split()))), axis=1)
data['common_words_frac_q'] = data['common_words_q']/data.len_word_q1
data['common_words_frac2_q'] = data['common_words_q']/data.len_word_q2
data['common_words_a1'] = data.apply(lambda x: len(set(str(x['question_title']).lower().split()).intersection(set(str(x['answer']).lower().split()))), axis=1)
data['common_words_a2'] = data.apply(lambda x: len(set(str(x['answer']).lower().split()).intersection(set(str(x['question_body']).lower().split()))), axis=1)
data['common_words_frac_a2'] = data['common_words_a2']/data['len_word_a']
data['common_words_frac2_a2'] = data['common_words_a2']/data['len_word_q2']

In [6]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
st = PorterStemmer()
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

def clean(data):
    tokens = tokenizer.tokenize(data.lower())
    stop_free = " ".join([st.stem(i) for i in tokens if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

data['clean_question_title'] = data.apply(lambda row :clean(row['question_title']),axis=1)
data['clean_question_title'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_question_title']),axis=1)

data['clean_question_body'] = data.apply(lambda row :clean(row['question_body']),axis=1)
data['clean_question_body'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_question_body']),axis=1)

data['clean_answer'] = data.apply(lambda row :clean(row['answer']),axis=1)
data['clean_answer'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_answer']),axis=1)

data['question_title_wordlen'] = data.clean_question_title.apply(lambda x: len(x.split()))
data['question_body_wordlen'] = data.clean_question_body.apply(lambda x: len(x.split()))
data['answer_wordlen'] = data.clean_answer.apply(lambda x: len(x.split()))

In [7]:
def count_question_words(x):
    count = 0
    count += x.count("?")
    for word in x.lower().split():
        if word.startswith("wh") or word.startswith("how"):
            count += 1
    return count

def count_compare_words(x):
    count = 0
    count += x.lower().split().count("or")
    for word in x.lower().split():
        if word.startswith("distinct") or word.startswith("between") or "advantage" in word or word.startswith("vs") or word.startswith("differ") :
            count += 1
    return count

def count_consequence_words(x):
    if x.startswith("if") or x.startswith("is"):
        return 1
    else:
        return 0
    return count

data["q_count_title"] = data.question_title.apply(count_question_words)
data["q_count_body"] = data.question_body.apply(count_question_words)

data["compare_count_title"] = data.question_title.apply(count_compare_words)
data["compare_count_body"] = data.question_body.apply(count_compare_words)

data["consq_count_title"] = data.question_title.apply(count_consequence_words)
data["consq_count_body"] = data.question_body.apply(count_consequence_words)


In [8]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
import scipy

lda1 = LatentDirichletAllocation(n_components=20)
lda2 = LatentDirichletAllocation(n_components=20)

cv1 = CountVectorizer(max_df=.7,min_df=5,max_features=50000)

answer_vector = cv1.fit_transform(data.clean_answer)
title_vector = cv1.transform(data.clean_question_title)
body_vector = cv1.transform(data.clean_question_body)

cv2 = CountVectorizer(max_df=.7,min_df=5,max_features=50000)

body_vector2 = cv2.fit_transform(data.clean_question_body)
answer_vector2 = cv2.transform(data.clean_answer)
title_vector2 = cv2.transform(data.clean_question_title)

answer_topics = lda1.fit_transform(answer_vector)
title_topics = lda1.transform(title_vector)
body_topics = lda1.transform(body_vector)

body_topics2 = lda2.fit_transform(body_vector2)
answer_topics2 = lda2.transform(answer_vector2)
title_topics2 = lda2.transform(title_vector2)


title_topic_entropy = scipy.stats.entropy(title_topics.T)
body_topic_entropy = scipy.stats.entropy(body_topics.T)
answer_topic_entropy = scipy.stats.entropy(answer_topics.T)

title_topic_entropy2 = scipy.stats.entropy(title_topics2.T)
body_topic_entropy2 = scipy.stats.entropy(body_topics2.T)
answer_topic_entropy2 = scipy.stats.entropy(answer_topics2.T)

#document_topic_entropy_len_normalized = document_topic_entropy * np.sqrt(word_len/2)
def geometric_mean(x):
    x = [i for i in x if i!=0]
    if len(x) > 0:
        return scipy.stats.mstats.gmean(x)
    else:
        return 0

def popularity(matrix):
    matrix = matrix.toarray()
    idf_matrix = (matrix > 0).astype(int)
    word_freq = idf_matrix.sum(axis=0)
    idf_matrix = idf_matrix * word_freq
    idf_matrix = idf_matrix * 1.0/idf_matrix.shape[0]

    document_popularity = np.array([geometric_mean(x) for x in idf_matrix.tolist()])
    return document_popularity

title_popularity = popularity(title_vector)
body_popularity = popularity(body_vector)
answer_popularity = popularity(answer_vector)

title_popularity2 = popularity(title_vector2)
body_popularity2 = popularity(body_vector2)
answer_popularity2 = popularity(answer_vector2)

data['title_entropy'] = title_topic_entropy
data['body_entropy'] = body_topic_entropy
data['answer_entropy'] = answer_topic_entropy

data['title_entropy2'] = title_topic_entropy2
data['body_entropy2'] = body_topic_entropy2
data['answer_entropy2'] = answer_topic_entropy2

data['title_popularity'] = title_popularity
data['body_popularity'] = body_popularity
data['answer_popularity'] = answer_popularity

data['title_popularity2'] = title_popularity2
data['body_popularity2'] = body_popularity2
data['answer_popularity2'] = answer_popularity2

nmf1 = NMF(n_components=20)
nmf2 = NMF(n_components=20)

nmf_ans = nmf1.fit_transform(answer_vector)
nmf_title = nmf1.transform(title_vector)
nmf_body = nmf1.transform(body_vector)

nmf_body2 = nmf2.fit_transform(body_vector2)
nmf_title2 = nmf2.transform(title_vector2)
nmf_ans2 = nmf2.transform(answer_vector2)

In [9]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

for col in data:
    if data[col].isna().any():
        print (col)
        if 'popularity' in col or 'frac' in col:
            data[col] = data[col].fillna(0)
        elif 'distance' in col:
            data[col] = data[col].fillna(1)
        else:
            data[col] = data[col].fillna(-99)

len_word_frac_q2
common_words_frac2_q
common_words_frac2_a2


In [10]:
new_features = list(data.columns)[5:]
new_features.remove('clean_question_title')
new_features.remove('clean_question_body')
new_features.remove('clean_answer')

for col in new_features:
    if data[col].isna().any():
        print (col)

In [11]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

new_X = np.hstack([mm.fit_transform(data[new_features]), nmf_ans, nmf_ans2, nmf_body, nmf_body2, nmf_title, nmf_body2, body_topics, body_topics2, title_topics, title_topics2, answer_topics, answer_topics2])
print (new_X.shape)

(6555, 344)


In [12]:
train_Xnew = new_X[:df_train.shape[0]]
test_Xnew = new_X[df_train.shape[0]:]

print (train_Xnew.shape,test_Xnew.shape)

(6079, 344) (476, 344)


#### 2. Preprocessing functions

These are some functions that will be used to preprocess the raw text data into useable Bert inputs.

In [13]:
tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)

In [14]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for BERT"""
    
    stoken = ["[CLS]"] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

def compute_input_arays(df, columns, tokenizer, max_sequence_length):
    input_ids, input_masks, input_segments = [], [], []
    for _, instance in tqdm(df[columns].iterrows()):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        t, q, a = _trim_input(t, q, a, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
        
    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

#### 3. Create model

`compute_spearmanr()` is used to compute the competition metric for the validation set
<br><br>
`CustomCallback()` is a class which inherits from `tf.keras.callbacks.Callback` and will compute and append validation score and validation/test predictions respectively, after each epoch.
<br><br>
`bert_model()` contains the actual architecture that will be used to finetune BERT to our dataset. It's simple, just taking the sequence_output of the bert_layer and pass it to an AveragePooling layer and finally to an output layer of 30 units (30 classes that we have to predict)
<br><br>
`train_and_predict()` this function will be run to train and obtain predictions

In [15]:
def compute_spearmanr(trues, preds):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        rhos.append(
            spearmanr(col_trues, col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])).correlation)
    return np.mean(rhos)


class CustomCallback(tf.keras.callbacks.Callback):
    
    def __init__(self, valid_data, test_data, batch_size=16, fold=None):

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data
        self.best_value = 0
        self.batch_size = batch_size
        self.fold = fold
        
    def on_train_begin(self, logs={}):
        self.valid_predictions = []
        self.test_predictions = []
        
    def on_epoch_end(self, epoch, logs={}):
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))
        
        rho_val = compute_spearmanr(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))
        
        print("\nvalidation rho: %.4f" % rho_val)
        
        if rho_val > self.best_value:
            self.best_value = rho_val
            print ("Model saved")
            self.model.save_weights('bert-base-{}.h5'.format(self.fold))
        
        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size)
        )

def bert_model1():
    
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    input_additional_features = tf.keras.layers.Input(
        (new_X.shape[1],), dtype=tf.float32, name='additional_features')
    
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    _, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])
    
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Concatenate(-1)([x,input_additional_features])
    x = tf.keras.layers.Dense(100, activation="relu", name="dense_features")(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments, input_additional_features], outputs=out)
    
    return model 

def bert_model():
    
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    _, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])
    
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments], outputs=out)
    
    return model 
        
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold):
        
    custom_callback = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]), 
        test_data=test_data,
        batch_size=batch_size,
        fold=fold)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=optimizer)
    model.fit(train_data[0], train_data[1], epochs=epochs, 
              batch_size=batch_size, callbacks=[custom_callback])
    
    return custom_callback


#### 4. Obtain inputs and targets, as well as the indices of the train/validation splits

In [16]:
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_arays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

new_test_inputs = test_inputs + [test_Xnew]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### 5. Training, validation and testing

Loops over the folds in gkf and trains each fold for 5 epochs --- with a learning rate of 1e-5 and batch_size of 8. A simple binary crossentropy is used as the objective-/loss-function. 

In [17]:
model = bert_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_masks[0][0]            

In [18]:
uniq_numbers = np.unique(df_train[output_categories].values.flatten())
print (uniq_numbers)

def rounder(values):
    def f(x):
        idx = np.argmin(np.abs(values - x))
        return values[idx]
    return np.frompyfunc(f, 1, 1)

[0.         0.2        0.26666667 0.3        0.33333333 0.33333333
 0.4        0.44444444 0.46666667 0.5        0.53333333 0.55555556
 0.6        0.66666667 0.66666667 0.7        0.73333333 0.77777778
 0.8        0.83333333 0.86666667 0.88888889 0.9        0.93333333
 1.        ]


In [19]:
import keras.backend as K

def abs_KL_div(y_true, y_pred):
    y_true = K.clip(y_true, K.epsilon(), None)
    y_pred = K.clip(y_pred, K.epsilon(), None)
    return K.sum( K.abs( (y_true- y_pred) * (K.log(y_true / y_pred))), axis=-1)


Using TensorFlow backend.


In [20]:
def noisy_bce(y_true, y_pred):
    y_true = K.clip(y_true, K.epsilon(), None)
    y_pred = K.clip(y_pred, K.epsilon(), None)
    return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)

In [21]:
def focal_loss(gamma=2., alpha=.25):
    def focal_loss_fixed(y_true, y_pred):
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return -K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return focal_loss_fixed

In [22]:
gkf = GroupKFold(n_splits=10).split(X=df_train.question_body, groups=df_train.question_body) ############## originaln_splits=5

histories = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    
    # will actually only do 3 folds (out of 5) to manage < 2h
    if fold in [5,6,7]:
        K.clear_session()
        model = bert_model()
        
        train_inputs = [inputs[i][train_idx] for i in range(3)] #+ [train_Xnew[train_idx]]
        train_outputs = outputs[train_idx]

        valid_inputs = [inputs[i][valid_idx] for i in range(3)] #+ [train_Xnew[valid_idx]]
        valid_outputs = outputs[valid_idx]

        # history contains two lists of valid and test preds respectively:
        #  [valid_predictions_{fold}, test_predictions_{fold}]
        history = train_and_predict(model, 
                          train_data=(train_inputs, train_outputs), 
                          valid_data=(valid_inputs, valid_outputs),
                          test_data=test_inputs, 
                          learning_rate=3e-5, epochs=5, batch_size=8,
                          loss_function=noisy_bce, fold=fold) #binary_crossentropy

        model.load_weights('bert-base-{}.h5'.format(fold))
        val_pred = model.predict(valid_inputs)
        rounded_val_pred = np.array([rounder(uniq_numbers)(i) for i in val_pred])
        
        histories.append(history)

Train on 5471 samples
Epoch 1/5
validation rho: 0.3500
Model saved
Epoch 2/5
validation rho: 0.3687
Model saved
Epoch 3/5
validation rho: 0.3788
Model saved
Epoch 4/5
validation rho: 0.3787
Epoch 5/5
validation rho: 0.3768
Train on 5471 samples
Epoch 1/5
validation rho: 0.3547
Model saved
Epoch 2/5
validation rho: 0.3800
Model saved
Epoch 3/5
validation rho: 0.3870
Model saved
Epoch 4/5
validation rho: 0.3906
Model saved
Epoch 5/5
validation rho: 0.3924
Model saved
Train on 5471 samples
Epoch 1/5
validation rho: 0.3769
Model saved
Epoch 2/5
validation rho: 0.3944
Model saved
Epoch 3/5
validation rho: 0.4025
Model saved
Epoch 4/5
validation rho: 0.4027
Model saved
Epoch 5/5
validation rho: 0.4027
Model saved


#### 6. Process and submit test predictions

First the test predictions are read from the list of lists of `histories`. Then each test prediction list (in lists) is averaged. Then a mean of the averages is computed to get a single prediction for each data point. Finally, this is saved to `submission.csv`

In [23]:
val_pred2 = [histories[i].valid_predictions for i in range(len(histories))]
val_pred2 = [np.average(val_pred2[i], axis=0) for i in range(len(val_pred2))]
val_pred2 = np.mean(val_pred2, axis=0)

In [24]:
uniq_numbers_per_class = [np.unique(df_train[output_categories].values[:,i]) for i in range(len(output_categories))]
                                    
rounded_val_pred = np.array([rounder(uniq_numbers_per_class[i])(val_pred[:,i]) for i in range(len(output_categories))]).T
rounded_val_pred2 = np.array([rounder(uniq_numbers)(i) for i in val_pred])

for i in range(len(output_categories)):
    if len(np.unique(rounded_val_pred[:,i])) == 1:
        rounded_val_pred[:,i] = val_pred[:,i].copy()
        
for i in range(len(output_categories)):
    if len(np.unique(rounded_val_pred2[:,i])) == 1:
        rounded_val_pred2[:,i] = val_pred[:,i].copy()
        
score1 = 0
score2 = 0
score3 = 0

for col_ind, col in enumerate(output_categories):
    score1 += spearmanr(valid_outputs[:,col_ind],val_pred[:,col_ind]).correlation
    score2 += spearmanr(valid_outputs[:,col_ind],rounded_val_pred[:,col_ind]).correlation
    score3 += spearmanr(valid_outputs[:,col_ind],rounded_val_pred2[:,col_ind]).correlation
    print ("{} oof spearman correlation {} , {} and {}".format(col, spearmanr(valid_outputs[:,col_ind],val_pred[:,col_ind]).correlation, spearmanr(valid_outputs[:,col_ind],rounded_val_pred[:,col_ind]).correlation, spearmanr(valid_outputs[:,col_ind],rounded_val_pred2[:,col_ind]).correlation))

print ("overall scores {}, {} and {}".format(score1/30, score2/30, score3/30))
        

question_asker_intent_understanding oof spearman correlation 0.28238634651862016 , 0.2828239288696029 and 0.2551552540128391
question_body_critical oof spearman correlation 0.4965524775465909 , 0.4934486408373993 and 0.4970766545598862
question_conversational oof spearman correlation 0.40706743145466057 , 0.42206197149912067 and 0.4492477828361911
question_expect_short_answer oof spearman correlation 0.2754949071461538 , 0.2618787016338228 and 0.273281604711095
question_fact_seeking oof spearman correlation 0.3511553336139033 , 0.33544343988248837 and 0.3494184905160427
question_has_commonly_accepted_answer oof spearman correlation 0.4513086343581561 , 0.5268123153884202 and 0.4759619221317155
question_interestingness_others oof spearman correlation 0.2176276607299437 , 0.21574494063098856 and 0.20341272064425284
question_interestingness_self oof spearman correlation 0.4991028536014139 , 0.4909406483385501 and 0.500384807845734
question_multi_intent oof spearman correlation 0.522495960

In [25]:
uniq_numbers_per_class = [np.unique(df_train[output_categories].values[:,i]) for i in range(len(output_categories))]
                                    
rounded_val_pred = np.array([rounder(uniq_numbers_per_class[i])(val_pred2[:,i]) for i in range(len(output_categories))]).T
rounded_val_pred2 = np.array([rounder(uniq_numbers)(i) for i in val_pred2])

for i in range(len(output_categories)):
    if len(np.unique(rounded_val_pred[:,i])) == 1:
        rounded_val_pred[:,i] = val_pred2[:,i].copy()
        
for i in range(len(output_categories)):
    if len(np.unique(rounded_val_pred2[:,i])) == 1:
        rounded_val_pred2[:,i] = val_pred2[:,i].copy()
        
score1 = 0
score2 = 0
score3 = 0

for col_ind, col in enumerate(output_categories):
    score1 += spearmanr(valid_outputs[:,col_ind],val_pred2[:,col_ind]).correlation
    score2 += spearmanr(valid_outputs[:,col_ind],rounded_val_pred[:,col_ind]).correlation
    score3 += spearmanr(valid_outputs[:,col_ind],rounded_val_pred2[:,col_ind]).correlation
    print ("{} oof spearman correlation {} , {} and {}".format(col, spearmanr(valid_outputs[:,col_ind],val_pred[:,col_ind]).correlation, spearmanr(valid_outputs[:,col_ind],rounded_val_pred[:,col_ind]).correlation, spearmanr(valid_outputs[:,col_ind],rounded_val_pred2[:,col_ind]).correlation))

print ("overall scores {}, {} and {}".format(score1/30, score2/30, score3/30))
        

question_asker_intent_understanding oof spearman correlation 0.28238634651862016 , 0.15356777903665791 and 0.2085662773424169
question_body_critical oof spearman correlation 0.4965524775465909 , 0.2833624161715703 and 0.2822657729492392
question_conversational oof spearman correlation 0.40706743145466057 , 0.12818692811443738 and 0.23374002138877956
question_expect_short_answer oof spearman correlation 0.2754949071461538 , 0.10777937696187478 and 0.10984585635748093
question_fact_seeking oof spearman correlation 0.3511553336139033 , 0.18628374694985492 and 0.2691750472515131
question_has_commonly_accepted_answer oof spearman correlation 0.4513086343581561 , 0.2929005133155003 and 0.3057651661543264
question_interestingness_others oof spearman correlation 0.2176276607299437 , 0.13751429350993563 and 0.15391894997653444
question_interestingness_self oof spearman correlation 0.4991028536014139 , 0.2914183656099262 and 0.30067074190993565
question_multi_intent oof spearman correlation 0.52

In [26]:
test_predictions = [histories[i].test_predictions for i in range(len(histories))]
test_predictions = [np.average(test_predictions[i], axis=0) for i in range(len(test_predictions))]
test_predictions = np.mean(test_predictions, axis=0)

In [27]:
rounded_test_predictions = np.array([rounder(uniq_numbers)(i) for i in test_predictions])

In [28]:
rounded_test_predictions.shape

(476, 30)

In [29]:
for i in range(30):
    print (i, rounded_test_predictions[:,i].sum())

0 418.8222222222225
1 274.5777777777774
2 7.766666666666669
3 337.6888888888886
4 389.93333333333356
5 406.5444444444448
6 278.2999999999984
7 233.91111111111167
8 106.85555555555571
9 0.0
10 183.46666666666704
11 129.05555555555551
12 7.077777777777779
13 1.7999999999999998
14 5.933333333333334
15 11.966666666666665
16 251.44444444444437
17 80.56666666666693
18 188.6888888888888
19 0.0
20 372.34444444444466
21 439.53333333333416
22 317.5222222222216
23 455.53333333333427
24 468.2000000000004
25 411.65555555555596
26 251.4555555555554
27 70.53333333333372
28 244.28888888888872
29 432.1555555555558


In [30]:
for i, col in enumerate(output_categories):
    if len(np.unique(rounded_test_predictions[:,i])) == 1:  #or col in ['question_not_really_a_question','question_type_consequence','answer_helpful','answer_plausible','answer_relevance','answer_well_written']:
        rounded_test_predictions[:,i] = test_predictions[:,i].copy()
        
    print (i, rounded_test_predictions[:,i].sum())

0 418.8222222222225
1 274.5777777777774
2 7.766666666666669
3 337.6888888888886
4 389.93333333333356
5 406.5444444444448
6 278.2999999999984
7 233.91111111111167
8 106.85555555555571
9 3.609214697731659
10 183.46666666666704
11 129.05555555555551
12 7.077777777777779
13 1.7999999999999998
14 5.933333333333334
15 11.966666666666665
16 251.44444444444437
17 80.56666666666693
18 188.6888888888888
19 1.4992365323705599
20 372.34444444444466
21 439.53333333333416
22 317.5222222222216
23 455.53333333333427
24 468.2000000000004
25 411.65555555555596
26 251.4555555555554
27 70.53333333333372
28 244.28888888888872
29 432.1555555555558


In [31]:
df_sub.iloc[:, 1:] = rounded_test_predictions

In [32]:
df_sub.to_csv('submission.csv', index=False)

In [33]:
pd.options.display.max_columns=999

In [34]:
df_sub.head(5)

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.933333,0.666667,0.2,0.466667,0.6,0.6,0.666667,0.666667,0.6,0.008313,0.7,0.7,0.0,0.2,0.0,0.0,0.2,0.0,0.8,0.004053,0.9,0.888889,0.6,0.933333,0.933333,0.777778,0.0,0.0,0.833333,0.933333
1,46,0.888889,0.533333,0.0,0.8,0.777778,0.933333,0.555556,0.444444,0.0,0.005243,0.4,0.444444,0.0,0.0,0.0,0.0,0.866667,0.2,0.0,0.00175,0.7,0.933333,0.666667,1.0,1.0,0.9,0.933333,0.2,0.0,0.9
2,70,0.9,0.6,0.0,0.777778,0.866667,0.933333,0.6,0.466667,0.2,0.005132,0.3,0.666667,0.0,0.0,0.0,0.0,0.2,0.0,0.7,0.002477,0.833333,0.933333,0.6,0.933333,1.0,0.833333,0.2,0.0,0.833333,0.9
3,132,0.866667,0.444444,0.0,0.733333,0.733333,0.933333,0.555556,0.444444,0.0,0.011945,0.533333,0.0,0.0,0.0,0.0,0.0,0.833333,0.2,0.533333,0.002448,0.666667,0.933333,0.666667,1.0,1.0,0.9,0.866667,0.2,0.5,0.888889
4,200,0.933333,0.444444,0.0,0.8,0.733333,0.8,0.666667,0.6,0.0,0.010255,0.5,0.2,0.0,0.0,0.0,0.0,0.266667,0.2,0.5,0.002517,0.7,0.9,0.666667,0.933333,0.933333,0.833333,0.2,0.2,0.666667,0.9


In [35]:
df_sub.describe()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,5029.186975,0.879879,0.576844,0.016317,0.70943,0.819188,0.854085,0.584664,0.49141,0.224486,0.007582,0.385434,0.271125,0.014869,0.003782,0.012465,0.02514,0.528245,0.169258,0.396405,0.00315,0.782236,0.923389,0.667063,0.957003,0.983613,0.864823,0.528268,0.148179,0.513212,0.90789
std,2812.67006,0.047424,0.128354,0.061463,0.111886,0.09223,0.118733,0.050854,0.084471,0.205989,0.003374,0.162842,0.302404,0.077476,0.030199,0.066164,0.092803,0.338325,0.102713,0.2751,0.001169,0.089065,0.023109,0.046987,0.031935,0.028734,0.038993,0.32897,0.096483,0.283256,0.022081
min,39.0,0.733333,0.333333,0.0,0.266667,0.333333,0.3,0.5,0.333333,0.0,0.002687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001585,0.555556,0.833333,0.533333,0.933333,0.933333,0.733333,0.0,0.0,0.0,0.833333
25%,2572.0,0.833333,0.466667,0.0,0.666667,0.777778,0.833333,0.555556,0.444444,0.0,0.005407,0.266667,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.002395,0.7,0.9,0.666667,0.933333,1.0,0.833333,0.2,0.0,0.266667,0.9
50%,5093.0,0.888889,0.555556,0.0,0.7,0.833333,0.9,0.555556,0.466667,0.2,0.006726,0.4,0.2,0.0,0.0,0.0,0.0,0.666667,0.2,0.333333,0.002814,0.777778,0.933333,0.666667,0.933333,1.0,0.866667,0.666667,0.2,0.533333,0.9
75%,7482.0,0.933333,0.666667,0.0,0.777778,0.888889,0.933333,0.6,0.533333,0.333333,0.008982,0.5,0.5,0.0,0.0,0.0,0.0,0.833333,0.2,0.6,0.00353,0.866667,0.933333,0.7,1.0,1.0,0.888889,0.833333,0.2,0.777778,0.933333
max,9640.0,1.0,0.888889,0.533333,0.933333,1.0,1.0,0.733333,0.777778,0.8,0.025891,0.866667,0.933333,0.666667,0.4,0.733333,0.666667,0.933333,0.4,1.0,0.011654,0.933333,1.0,0.833333,1.0,1.0,0.933333,0.933333,0.333333,1.0,0.933333
