In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected in the "data" directory (relative to this notebook).
# Running this cell (by clicking run or pressing Shift+Enter) lists the files in that directory.

import os
print(os.listdir("data"))
import zipfile
import sys
import time

from keras import backend, models, layers, initializers, regularizers, constraints, optimizers
from keras import callbacks as kc
from keras import optimizers as ko

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import log_loss
import time


# Hyperparameters shared by the model builders and train_folds defined below.
dense_layer_sizes = [37]  # Dense layer widths; only element 0 is read in this notebook
dropout_rate = 0.6  # dropout applied after the 'dense0' / 'bn0' / relu block of every model
learning_rate = 0.001  # passed to the Adam optimizer in train_folds
n_fold = 5  # default fold count; the experiment cells below reassign it before calling train_folds
batch_size = 32  # mini-batch size passed to model.fit
epochs = 1000  # upper bound on training epochs; train_folds also installs an EarlyStopping callback
patience = 100  # NOTE(review): not read by train_folds, which hard-codes patience=15 for EarlyStopping
# n_test = 100
lambd = 0.1 # L2 regularization strength applied to the kernel of the 'output' layer

['sample_submission_stage_1.csv', 'test_stage_1.tsv', 'trees']


Using TensorFlow backend.


In [2]:
def build_mlp_model_for_base_bert(input_shape, split_size):
    '''
    Builds the multi-head interaction classifier used with the BERT-base embeddings.

    The flat input is sliced into three consecutive segments: A (first split_size
    features), B (next split_size features) and P (everything after 2 * split_size).
    Each of the 6 heads encodes A, B and P with one shared Dense layer, combines each
    of A and B with P (element-wise product and difference), and encodes the two
    combinations with a second shared Dense layer. The head outputs are summed
    element-wise and fed to a small Dense / BatchNorm / ReLU block followed by a
    3-way softmax.

    Arguments:
        input_shape: shape handed to layers.Input (train_folds passes [3 * embedding_size])
        split_size: number of features in each of the A and B segments

    Returns:
        An uncompiled keras Model named "classif_model".
    '''
    X_input = layers.Input(input_shape)

    # Slice the flat input vector into its three embedding segments
    A = layers.Lambda(lambda x: x[:, :split_size])(X_input)
    B = layers.Lambda(lambda x: x[:, split_size:split_size*2])(X_input)
    P = layers.Lambda(lambda x: x[:, split_size*2:])(X_input)

    head_num = 6
    d_ratio = 0.6  # dropout applied to each segment before it is encoded
    res = []
    for _ in range(head_num):
        # One encoder per head, shared by A, B and P.
        # (The original also created an 'ans_encoder' Dense layer here that was never applied.)
        query_encoder = layers.Dense(dense_layer_sizes[0], activation='selu', kernel_regularizer = regularizers.l2(1e-6))

        a = query_encoder(layers.Dropout(d_ratio)(A))
        b = query_encoder(layers.Dropout(d_ratio)(B))
        p = query_encoder(layers.Dropout(d_ratio)(P))

        # Interaction features between each candidate segment and P
        amp = layers.Multiply()([a, p])
        bmp = layers.Multiply()([b, p])

        asp = layers.Lambda(lambda v: v[0] - v[1])([p, a])
        bsp = layers.Lambda(lambda v: v[0] - v[1])([p, b])

        ia = layers.Concatenate()([a, p, amp, asp])
        ib = layers.Concatenate()([b, p, bmp, bsp])
        # The same Dense layer encodes both the A-side and the B-side features
        nli_encoder = layers.Dense(dense_layer_sizes[0], activation='selu')
        ia, ib = nli_encoder(ia), nli_encoder(ib)

        out = layers.Concatenate()([ia, ib])
        res.append(out)

    # Heads are merged by element-wise sum
    res = layers.Add()(res)
    res = layers.Dropout(0.8)(res)
    X = layers.Dense(dense_layer_sizes[0], name = 'dense0')(res)
    X = layers.BatchNormalization(name = 'bn0')(X)
    X = layers.Activation('relu')(X)
    X = layers.Dropout(dropout_rate, seed = 7)(X)

    # Output layer
    X = layers.Dense(3, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X)
    X = layers.Activation('softmax')(X)

    # Create model ('inputs'/'outputs' are the Keras 2 keyword names; 'input'/'output' are legacy)
    model = models.Model(inputs = X_input, outputs = X, name = "classif_model")
    return model

In [3]:
def build_mlp_model_for_larget_bert(input_shape, split_size):
    '''
    Builds the multi-head interaction classifier used with the BERT-large embeddings.

    The flat input is sliced into three consecutive segments: A (first split_size
    features), B (next split_size features) and P (everything after 2 * split_size).
    Each of the 6 heads encodes A, B and P with one shared Dense layer, combines each
    of A and B with P (element-wise product and difference), and encodes the two
    combinations with a second shared Dense layer. Unlike the BERT-base builder, the
    head outputs are concatenated (not summed) and the dropout rates are higher
    (0.7 per segment, 0.85 on the merged heads).

    Arguments:
        input_shape: shape handed to layers.Input (train_folds passes [3 * embedding_size])
        split_size: number of features in each of the A and B segments

    Returns:
        An uncompiled keras Model named "classif_model".
    '''
    X_input = layers.Input(input_shape)

    # Slice the flat input vector into its three embedding segments
    A = layers.Lambda(lambda x: x[:, :split_size])(X_input)
    B = layers.Lambda(lambda x: x[:, split_size:split_size*2])(X_input)
    P = layers.Lambda(lambda x: x[:, split_size*2:])(X_input)

    head_num = 6
    d_ratio = 0.7  # dropout applied to each segment before it is encoded
    res = []
    for _ in range(head_num):
        # One encoder per head, shared by A, B and P.
        # (The original also created an 'ans_encoder' Dense layer here that was never applied.)
        query_encoder = layers.Dense(dense_layer_sizes[0], activation='selu', kernel_regularizer = regularizers.l2(1e-6))

        a = query_encoder(layers.Dropout(d_ratio)(A))
        b = query_encoder(layers.Dropout(d_ratio)(B))
        p = query_encoder(layers.Dropout(d_ratio)(P))

        # Interaction features between each candidate segment and P
        amp = layers.Multiply()([a, p])
        bmp = layers.Multiply()([b, p])

        asp = layers.Lambda(lambda v: v[0] - v[1])([p, a])
        bsp = layers.Lambda(lambda v: v[0] - v[1])([p, b])

        ia = layers.Concatenate()([a, p, amp, asp])
        ib = layers.Concatenate()([b, p, bmp, bsp])
        # The same Dense layer encodes both the A-side and the B-side features
        nli_encoder = layers.Dense(dense_layer_sizes[0], activation='selu')
        ia, ib = nli_encoder(ia), nli_encoder(ib)

        out = layers.Concatenate()([ia, ib])
        res.append(out)

    # Heads are merged by concatenation
    res = layers.Concatenate()(res)
    res = layers.Dropout(0.85)(res)
    X = layers.Dense(dense_layer_sizes[0], name = 'dense0')(res)
    X = layers.BatchNormalization(name = 'bn0')(X)
    X = layers.Activation('relu')(X)
    X = layers.Dropout(dropout_rate, seed = 7)(X)

    # Output layer
    X = layers.Dense(3, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X)
    X = layers.Activation('softmax')(X)

    # Create model ('inputs'/'outputs' are the Keras 2 keyword names; 'input'/'output' are legacy)
    model = models.Model(inputs = X_input, outputs = X, name = "classif_model")
    return model

'''
------------------------------
For the model bert-large-uncased-seq256-19
CV mean score: 0.3441, std: 0.0215.
[0.3307294825242308, 0.3187784074490988, 0.34923320252765083, 0.3821268940718645, 0.3397000497226646]
Test score: 0.32621788223567494
------------------------------
'''

'\n------------------------------\nFor the model bert-large-uncased-seq256-19\nCV mean score: 0.3441, std: 0.0215.\n[0.3307294825242308, 0.3187784074490988, 0.34923320252765083, 0.3821268940718645, 0.3397000497226646]\nTest score: 0.32621788223567494\n------------------------------\n'

In [4]:
import keras.backend as K

def base_nli_model(input_shape, split_size):
    '''
    Builds the single-head interaction classifier.

    The flat input is sliced into three consecutive segments: A (first split_size
    features), B (next split_size features) and P (everything after 2 * split_size).
    All three go through dropout and one shared 512-unit Dense encoder. A and B are
    then each combined with P via [x, y, |x - y|, x * y], encoded by a shared 128-unit
    Dense layer, concatenated, and classified by a Dense / BatchNorm / ReLU block
    followed by a 3-way softmax.

    Arguments:
        input_shape: shape handed to layers.Input (train_folds passes [3 * embedding_size])
        split_size: number of features in each of the A and B segments

    Returns:
        An uncompiled keras Model named "classif_model".
    '''
    X_input = layers.Input(input_shape)

    # Slice the flat input vector into its three embedding segments
    A = layers.Lambda(lambda x: x[:, :split_size])(X_input)
    B = layers.Lambda(lambda x: x[:, split_size:split_size*2])(X_input)
    P = layers.Lambda(lambda x: x[:, split_size*2:])(X_input)

    X1 = layers.Dropout(0.6, seed = 7)(A)
    X2 = layers.Dropout(0.6, seed = 7)(B)
    Y = layers.Dropout(0.6, seed = 7)(P)

    def interaction(a, b):
        # Concatenate both tensors with their absolute difference and element-wise product.
        # The lambda parameter is named 'pair' so it no longer shadows the outer 'a'.
        sub = layers.Lambda(lambda pair: K.abs(pair[0] - pair[1]))([a, b])
        mult = layers.Lambda(lambda pair: pair[0] * pair[1])([a, b])
        return layers.Concatenate()([a, b, sub, mult,])

    # One encoder shared by all three segments
    word_encoder = layers.Dense(512, activation='selu')
    X1 = word_encoder(X1)
    X2 = word_encoder(X2)
    Y = word_encoder(Y)

    I_X1_Y = interaction(X1, Y)
    I_X2_Y = interaction(X2, Y)

    # One encoder shared by both interaction vectors
    dense_encoder = layers.Dense(128, activation='selu')

    I_X1_Y = layers.Dropout(0.75)(dense_encoder(I_X1_Y))
    I_X2_Y = layers.Dropout(0.75)(dense_encoder(I_X2_Y))
    features = layers.Concatenate()([I_X1_Y, I_X2_Y])

    X = layers.Dense(dense_layer_sizes[0], name = 'dense0')(features)
    X = layers.BatchNormalization(name = 'bn0')(X)
    X = layers.Activation('relu')(X)
    X = layers.Dropout(dropout_rate, seed = 7)(X)

    # Output layer
    X = layers.Dense(3, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X)
    X = layers.Activation('softmax')(X)

    # Create model ('inputs'/'outputs' are the Keras 2 keyword names; 'input'/'output' are legacy)
    model = models.Model(inputs = X_input, outputs = X, name = "classif_model")
    return model

In [5]:
def parse_json(embeddings, embedding_size):
    '''
    Parses the embeddings given by BERT, and suitably formats them to be passed to the MLP model

    Input: embeddings, a DataFrame containing contextual embeddings from BERT, as well as the labels for the classification problem
    columns: "emb_A": contextual embedding for the word A
             "emb_B": contextual embedding for the word B
             "emb_P": contextual embedding for the pronoun
             "label": the answer to the coreference problem: "A", "B" or "NEITHER"
           embedding_size, the length of each individual embedding vector

    Output: X, a numpy array of shape (rows, 3 * embedding_size) containing, for each row, the concatenation
               of emb_A, emb_B and emb_P
            Y, a numpy array of shape (rows, 3) containing the one-hot encoded label
               (column 0 = "A", column 1 = "B", column 2 = anything else)

    Rows are emitted in ascending index order. The caller's DataFrame is left untouched,
    and the index does not have to be exactly 0..n-1.
    '''
    # Reading from the json file can scramble the row order, so work on a sorted copy
    # instead of sorting the caller's frame in place.
    ordered = embeddings.sort_index()
    n_rows = len(ordered)

    X = np.zeros((n_rows, 3 * embedding_size))
    Y = np.zeros((n_rows, 3))

    # Concatenate features; iterate positionally so the index labels themselves are irrelevant
    triples = zip(ordered["emb_A"], ordered["emb_B"], ordered["emb_P"])
    for row, (emb_a, emb_b, emb_p) in enumerate(triples):
        X[row] = np.concatenate((np.array(emb_a), np.array(emb_b), np.array(emb_p)))

    # One-hot encoding for labels; every label other than "A" / "B" lands in column 2
    label_column = {"A": 0, "B": 1}
    for row, label in enumerate(ordered["label"]):
        Y[row, label_column.get(label, 2)] = 1

    return X, Y

In [6]:
# Base names of the three embedding files. train_folds uses them as default arguments
# and prefixes each one with the data tag and the BERT layer number before reading it.
dev_filename = "contextual_embeddings_gap_development.json"
val_filename = "contextual_embeddings_gap_validation.json"
test_filename = "contextual_embeddings_gap_test.json"

In [7]:
# Row positions that train_folds deletes (np.delete along axis 0) from each split.
# The lists are fixed here, rather than recomputed per model, so that every BERT
# variant is trained on the same set of examples.
remove_test = []
remove_validation = []
remove_development = [209, 1506, 1988]

In [8]:
def _load_embedding_split(folder_path, filename, embedding_size, rows_to_drop):
    '''Reads one embeddings json file, converts it with parse_json and deletes the given row positions.'''
    frame = pd.read_json(os.path.join(folder_path, filename))
    X, Y = parse_json(frame, embedding_size)
    return np.delete(X, rows_to_drop, 0), np.delete(Y, rows_to_drop, 0)


def train_folds(tag, dev_folder_path, val_folder_path, test_folder_path, embedding_size, layer, checkpoint_path, n_fold, model_tag, model_func,
                dev_filename=dev_filename, val_filename=val_filename, test_filename=test_filename,
                oof_folder='oof/', pred_folder='outputs/'):
    '''
    Cross-validates one classifier on the pooled GAP embeddings and stores the
    out-of-fold (OOF) predictions.

    The test, validation and development splits are loaded, the rows listed in the
    module-level remove_test / remove_validation / remove_development are deleted,
    and the three splits are concatenated (in that order) into one training set.
    A fresh model is trained for every fold; the best checkpoint of each fold is
    saved to checkpoint_path and the OOF probabilities for the whole training set
    are written to <oof_folder>/<model_tag><tag><layer>.csv. The per-fold log-loss
    values and their mean / std are printed.

    Arguments:
        tag (data_tag): the tag of model and vec; str(layer) is appended to it
        dev_folder_path, val_folder_path, test_folder_path: folders holding the three json files
        embedding_size: the size of bert embedding
        layer: to indicate which layer of bert is going to use
        checkpoint_path: the path to model_checkpoint folder (created if missing)
        n_fold: the number of CV folds
        model_tag: the prefix of the oof file and model checkpoint
        model_func: callable (input_shape, split_size=...) returning an uncompiled keras model
        dev_filename, val_filename, test_filename: base file names, prefixed with tag + layer
        oof_folder: folder receiving the OOF csv (created if missing)
        pred_folder: unused; kept so existing calls keep working. No test-set
            predictions are produced here, because all three splits are used for training.

    Returns:
        None
    '''

    tag = tag + str(layer) # follow the original naming style

    # NaN rows (target offset beyond BERT's max_seq_length) are dropped through the fixed
    # remove_* lists rather than detected per file, so every model sees the same rows.
    X_development, Y_development = _load_embedding_split(dev_folder_path, tag + dev_filename, embedding_size, remove_development)
    X_validation, Y_validation = _load_embedding_split(val_folder_path, tag + val_filename, embedding_size, remove_validation)
    X_test, Y_test = _load_embedding_split(test_folder_path, tag + test_filename, embedding_size, remove_test)

    # Train on the pooled data of all three GAP files
    X_train = np.concatenate((X_test, X_validation, X_development), axis = 0)
    Y_train = np.concatenate((Y_test, Y_validation, Y_development), axis = 0)

    # Make sure the output locations exist before the first fold starts writing to them
    for folder in (checkpoint_path, oof_folder):
        if folder:
            os.makedirs(folder, exist_ok=True)

    # Training and cross-validation
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=3)
    scores = []
    oof = np.zeros_like(Y_train)

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
        # split training and validation data
        print('Fold', fold_n, 'started at', time.ctime())
        X_tr, X_val = X_train[train_index], X_train[valid_index]
        Y_tr, Y_val = Y_train[train_index], Y_train[valid_index]

        # Define the model, re-initializing for each fold
        classif_model = model_func([X_train.shape[-1]], split_size=embedding_size)
        classif_model.compile(optimizer=optimizers.Adam(lr=learning_rate), loss="categorical_crossentropy")

        checkpoint_file = os.path.join(checkpoint_path, model_tag + tag + str(fold_n) + '.pt')
        callbacks = [kc.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
                     kc.ModelCheckpoint(checkpoint_file, monitor='val_loss', verbose=0, save_best_only=True, mode='min')]

        # train the model
        classif_model.fit(x=X_tr, y=Y_tr, epochs=epochs, batch_size=batch_size,
                          callbacks=callbacks, validation_data=(X_val, Y_val), verbose=0)

        # out-of-fold predictions for the rows held out in this fold
        pred_valid = classif_model.predict(x=X_val, verbose=0)
        oof[valid_index] = pred_valid
        scores.append(log_loss(Y_val, pred_valid))

        # Release this fold's model so models do not pile up in the backend session across folds
        del classif_model
        backend.clear_session()

    print("-" * 30)
    print("For the model", tag)
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    print(scores)

    # Write the out-of-fold predictions to file
    oof_df = pd.DataFrame(oof)
    oof_df.to_csv(os.path.join(oof_folder, model_tag + tag + ".csv"), index=False)

# BERT Base
## Uncased

In [9]:
# Settings for the uncased BERT-base run: all three splits are read from the same folder,
# embeddings are 768-dimensional and taken from layer 8, and 7 CV folds are used.
dev_folder_path = val_folder_path = test_folder_path = "vector/bert_base"
embedding_size = 768
layer = 8
checkpoint_path = "stage_1_checkpoints/"
n_fold = 7
tag = "bert-base-uncased-seq512-"  # train_folds appends str(layer) to this tag
pred_tag = "nli-mh-"  # passed as model_tag: prefix of the checkpoint and OOF file names
model_func = build_mlp_model_for_base_bert

In [10]:
# Cross-validate the model selected above with the settings of the preceding cell.
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 15:06:04 2019




Fold 1 started at Mon Apr 15 15:07:32 2019
Fold 2 started at Mon Apr 15 15:09:23 2019
Fold 3 started at Mon Apr 15 15:11:10 2019
Fold 4 started at Mon Apr 15 15:13:08 2019
Fold 5 started at Mon Apr 15 15:14:59 2019
Fold 6 started at Mon Apr 15 15:16:51 2019
------------------------------
For the model bert-base-uncased-seq512-8
CV mean score: 0.3897, std: 0.0303.
[0.41874208962182374, 0.3576011612451981, 0.42278881478925096, 0.35695556738233025, 0.35123012553249394, 0.4055829368912067, 0.414897930121091]


In [11]:
# Same data and settings as the previous run, now with base_nli_model;
# outputs are written under the "bnli-mh-" prefix.
pred_tag = "bnli-mh-"
model_func = base_nli_model
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 15:19:20 2019




Fold 1 started at Mon Apr 15 15:20:01 2019
Fold 2 started at Mon Apr 15 15:20:33 2019
Fold 3 started at Mon Apr 15 15:21:09 2019
Fold 4 started at Mon Apr 15 15:21:44 2019
Fold 5 started at Mon Apr 15 15:22:24 2019
Fold 6 started at Mon Apr 15 15:23:05 2019
------------------------------
For the model bert-base-uncased-seq512-8
CV mean score: 0.3894, std: 0.0345.
[0.43335572715629767, 0.36244312029276277, 0.4074310181190556, 0.35165307776648813, 0.3416434165458056, 0.400636516729902, 0.42857470172439066]


## Cased

In [12]:
# Settings for the cased BERT-base run: all three splits are read from the same folder,
# embeddings are 768-dimensional and taken from layer 8, and 7 CV folds are used.
dev_folder_path = val_folder_path = test_folder_path = "vector/bert_base_cased"
embedding_size = 768
layer = 8
checkpoint_path = "stage_1_checkpoints/"
n_fold = 7
tag = "bert-base-cased-seq512-"  # train_folds appends str(layer) to this tag
pred_tag = "nli-mh-"  # passed as model_tag: prefix of the checkpoint and OOF file names
model_func = build_mlp_model_for_base_bert

In [13]:
# Cross-validate the model selected above with the settings of the preceding cell.
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 15:23:53 2019




Fold 1 started at Mon Apr 15 15:25:59 2019
Fold 2 started at Mon Apr 15 15:28:36 2019
Fold 3 started at Mon Apr 15 15:30:33 2019
Fold 4 started at Mon Apr 15 15:32:45 2019
Fold 5 started at Mon Apr 15 15:35:03 2019
Fold 6 started at Mon Apr 15 15:37:19 2019
------------------------------
For the model bert-base-cased-seq512-8
CV mean score: 0.4394, std: 0.0275.
[0.4649052174103669, 0.4075077879509842, 0.4420038184371437, 0.4103142511106075, 0.41890867841885787, 0.4445338689644511, 0.48744606034038224]


In [14]:
# Same data and settings as the previous run, now with base_nli_model;
# outputs are written under the "bnli-mh-" prefix.
pred_tag = "bnli-mh-"
model_func = base_nli_model
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 15:39:28 2019




Fold 1 started at Mon Apr 15 15:40:18 2019
Fold 2 started at Mon Apr 15 15:41:02 2019
Fold 3 started at Mon Apr 15 15:41:52 2019
Fold 4 started at Mon Apr 15 15:42:47 2019
Fold 5 started at Mon Apr 15 15:43:39 2019
Fold 6 started at Mon Apr 15 15:44:31 2019
------------------------------
For the model bert-base-cased-seq512-8
CV mean score: 0.4412, std: 0.0335.
[0.4671664478725156, 0.40499301248086533, 0.44609084672334093, 0.40980316669044653, 0.4086470947931843, 0.44959538845483027, 0.5023763490928792]


# BERT Large
## Uncased

In [15]:
# Settings for the uncased BERT-large run: all three splits are read from the same folder,
# embeddings are 1024-dimensional and taken from layer 19, and 7 CV folds are used.
dev_folder_path = val_folder_path = test_folder_path = "vector/bert_big"
embedding_size = 1024
layer = 19
checkpoint_path = "stage_1_checkpoints/"
n_fold = 7
tag = "bert-large-uncased-seq300-"  # train_folds appends str(layer) to this tag
pred_tag = "nli-mh-"  # passed as model_tag: prefix of the checkpoint and OOF file names
model_func = build_mlp_model_for_larget_bert

In [16]:
# Cross-validate the model selected above with the settings of the preceding cell.
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 15:45:25 2019




Fold 1 started at Mon Apr 15 15:47:50 2019
Fold 2 started at Mon Apr 15 15:49:59 2019
Fold 3 started at Mon Apr 15 15:51:58 2019
Fold 4 started at Mon Apr 15 15:54:53 2019
Fold 5 started at Mon Apr 15 15:57:54 2019
Fold 6 started at Mon Apr 15 15:59:56 2019
------------------------------
For the model bert-large-uncased-seq300-19
CV mean score: 0.3196, std: 0.0333.
[0.3042907009121622, 0.31883628042389156, 0.34259295103589726, 0.28846384482261234, 0.2685273202696522, 0.34067832038491824, 0.3739988215781444]


In [17]:
# Same data and settings as the previous run, now with base_nli_model;
# outputs are written under the "bnli-mh-" prefix.
pred_tag = "bnli-mh-"
model_func = base_nli_model
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 16:02:23 2019




Fold 1 started at Mon Apr 15 16:03:16 2019
Fold 2 started at Mon Apr 15 16:04:08 2019
Fold 3 started at Mon Apr 15 16:05:11 2019
Fold 4 started at Mon Apr 15 16:06:09 2019
Fold 5 started at Mon Apr 15 16:07:20 2019
Fold 6 started at Mon Apr 15 16:08:26 2019
------------------------------
For the model bert-large-uncased-seq300-19
CV mean score: 0.3183, std: 0.0395.
[0.30168902294112826, 0.3380411308095498, 0.34800934412409756, 0.285086517611375, 0.2506359343848238, 0.32666263570260395, 0.37827246434799594]


## Cased

In [18]:
# Settings for the cased BERT-large run: all three splits are read from the same folder,
# embeddings are 1024-dimensional and taken from layer 18, and 7 CV folds are used.
dev_folder_path = val_folder_path = test_folder_path = "vector/bert_big_cased"
embedding_size = 1024
layer = 18
checkpoint_path = "stage_1_checkpoints/"
n_fold = 7
tag = "bert-large-cased-seq300-"  # train_folds appends str(layer) to this tag
pred_tag = "nli-mh-"  # passed as model_tag: prefix of the checkpoint and OOF file names
model_func = build_mlp_model_for_larget_bert

In [19]:
# Cross-validate the model selected above with the settings of the preceding cell.
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 16:09:28 2019




Fold 1 started at Mon Apr 15 16:12:38 2019
Fold 2 started at Mon Apr 15 16:16:02 2019
Fold 3 started at Mon Apr 15 16:19:01 2019
Fold 4 started at Mon Apr 15 16:22:36 2019
Fold 5 started at Mon Apr 15 16:26:58 2019
Fold 6 started at Mon Apr 15 16:29:58 2019
------------------------------
For the model bert-large-cased-seq300-18
CV mean score: 0.3592, std: 0.0317.
[0.3704772674046243, 0.3360922776462632, 0.3821525974518586, 0.3268818250513997, 0.31075084876940323, 0.38929441007353516, 0.398596004242633]


In [20]:
# Same data and settings as the previous run, now with base_nli_model;
# outputs are written under the "bnli-mh-" prefix.
pred_tag = "bnli-mh-"
model_func = base_nli_model
train_folds(
    tag,
    dev_folder_path,
    val_folder_path,
    test_folder_path,
    embedding_size,
    layer,
    checkpoint_path,
    n_fold,
    model_tag=pred_tag,
    model_func=model_func,
    dev_filename=dev_filename,
    val_filename=val_filename,
    test_filename=test_filename,
    oof_folder='oof/',
    pred_folder='outputs/',
)

Fold 0 started at Mon Apr 15 16:33:04 2019




Fold 1 started at Mon Apr 15 16:34:24 2019
Fold 2 started at Mon Apr 15 16:35:46 2019
Fold 3 started at Mon Apr 15 16:36:55 2019
Fold 4 started at Mon Apr 15 16:38:18 2019
Fold 5 started at Mon Apr 15 16:39:37 2019
Fold 6 started at Mon Apr 15 16:41:06 2019
------------------------------
For the model bert-large-cased-seq300-18
CV mean score: 0.3593, std: 0.0312.
[0.38562625229912195, 0.35086932665420956, 0.3764865739071894, 0.33176049842435495, 0.30063397475827996, 0.3739431194782421, 0.3960768230630664]
