In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("data"))
import zipfile
import sys
import time

# Any results you write to the current directory are saved as output.

['sample_submission_stage_1.csv', 'test_stage_1.tsv', 'trees']


In [2]:
import modeling
import extract_features
import tokenization
import tensorflow as tf

In [3]:
def compute_offset_no_spaces(text, offset):
	count = 0
	for pos in range(offset):
		if text[pos] != " ": count +=1
	return count

def count_chars_no_special(text):
	count = 0
	special_char_list = ["#"]
	for pos in range(len(text)):
		if text[pos] not in special_char_list: count +=1
	return count

def count_length_no_special(text):
	count = 0
	special_char_list = ["#", " "]
	for pos in range(len(text)):
		if text[pos] not in special_char_list: count +=1
	return count

In [4]:
def run_bert(data, output, embedding_size=1024, layer=-1):
	'''
	Runs a forward propagation of BERT on input text, extracting contextual word embeddings
	Input: data, a pandas DataFrame containing the information in one of the GAP files

	Output: emb, a pandas DataFrame containing contextual embeddings for the words A, B and Pronoun. Each embedding is a numpy array of shape (768)
	columns: "emb_A": the embedding for word A
	         "emb_B": the embedding for word B
	         "emb_P": the embedding for the pronoun
	         "label": the answer to the coreference problem: "A", "B" or "NEITHER"
	'''
    # From the current file, take the text only, and write it in a file which will be passed to BERT
	text = data["Text"]
	text.to_csv("input.txt", index = False, header = False)

    # The script extract_features.py runs forward propagation through BERT, and writes the output in the file output.jsonl
    # I'm lazy, so I'm only saving the output of the last layer. Feel free to change --layers = -1 to save the output of other layers.

	res = os.system("python extract_features.py \
	  --input_file=input.txt \
	  --output_file=output.jsonl \
	  --vocab_file=cased_L-24_H-1024_A-16/vocab.txt \
	  --bert_config_file=cased_L-24_H-1024_A-16/bert_config.json \
	  --init_checkpoint=cased_L-24_H-1024_A-16/bert_model.ckpt \
	  --layers={} \
	  --max_seq_length=300 \
	  --batch_size=1".format(layer))
    
    
	bert_output = pd.read_json("output.jsonl", lines = True)

	os.system("rm output.jsonl")
	os.system("rm input.txt")

	index = data.index
	columns = ["emb_A", "emb_B", "emb_P", "label"]
	emb = pd.DataFrame(index = index, columns = columns)
	emb.index.name = "ID"

	for i in range(len(data)): # For each line in the data file
		# get the words A, B, Pronoun. Convert them to lower case, since we're using the uncased version of BERT
		P = data.loc[i,"Pronoun"].lower()
		A = data.loc[i,"A"].lower()
		B = data.loc[i,"B"].lower()

		# For each word, find the offset not counting spaces. This is necessary for comparison with the output of BERT
		P_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"Pronoun-offset"])
		A_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"A-offset"])
		B_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"B-offset"])
		# Figure out the length of A, B, not counting spaces or special characters
		A_length = count_length_no_special(A)
		B_length = count_length_no_special(B)

		# Initialize embeddings with zeros
		emb_A = np.zeros(embedding_size)
		emb_B = np.zeros(embedding_size)
		emb_P = np.zeros(embedding_size)

		# Initialize counts
		count_chars = 0
		cnt_A, cnt_B, cnt_P = 0, 0, 0

		features = pd.DataFrame(bert_output.loc[i,"features"]) # Get the BERT embeddings for the current line in the data file
		for j in range(2,len(features)):  # Iterate over the BERT tokens for the current line; we skip over the first 2 tokens, which don't correspond to words
			token = features.loc[j,"token"]

			# See if the character count until the current token matches the offset of any of the 3 target words
			if count_chars  == P_offset: 
				# print(token)
				emb_P += np.array(features.loc[j,"layers"][0]['values'])
				cnt_P += 1
			if count_chars in range(A_offset, A_offset + A_length): 
				# print(token)
				emb_A += np.array(features.loc[j,"layers"][0]['values'])
				cnt_A +=1
			if count_chars in range(B_offset, B_offset + B_length): 
				# print(token)
				emb_B += np.array(features.loc[j,"layers"][0]['values'])
				cnt_B +=1								
			# Update the character count
			count_chars += count_length_no_special(token)
		# Taking the average between tokens in the span of A or B, so divide the current value by the count	
		emb_A /= cnt_A
		emb_B /= cnt_B

		# Work out the label of the current piece of text
		label = "Neither"
		if (data.loc[i,"A-coref"] == True):
			label = "A"
		if (data.loc[i,"B-coref"] == True):
			label = "B"

		# Put everything together in emb
		emb.iloc[i] = [emb_A, emb_B, emb_P, label]

	return emb

In [7]:
print("Started at ", time.ctime())

for i in range(17, 24):
    print("{} Epoch".format(i))
    tag = 'bert-large-cased-seq300-'
    embedding_size = 1024
    layer = i
    tag = tag + str(layer)

    test_data = pd.read_csv("gap-test.tsv", sep = '\t')
    test_emb = run_bert(test_data, embedding_size=embedding_size, output="{}contextual_embeddings_gap_test.json".format(tag), layer=layer)
    test_emb.to_json("vector/bert_big_cased/{}contextual_embeddings_gap_test.json".format(tag), orient = 'columns')

    validation_data = pd.read_csv("gap-validation.tsv", sep = '\t')
    validation_emb = run_bert(validation_data, embedding_size=embedding_size, output="{}contextual_embeddings_gap_validation.json".format(tag), layer=layer)
    validation_emb.to_json("vector/bert_big_cased/{}contextual_embeddings_gap_validation.json".format(tag), orient = 'columns')

    development_data = pd.read_csv("gap-development.tsv", sep = '\t')
    development_emb = run_bert(development_data, embedding_size=embedding_size, output="{}contextual_embeddings_gap_development.json".format(tag), layer=layer)
    development_emb.to_json("vector/bert_big_cased/{}contextual_embeddings_gap_development.json".format(tag), orient = 'columns')
    print("Finished at ", time.ctime())

Started at  Sat Apr 13 16:46:49 2019
17 Epoch




Finished at  Sat Apr 13 17:01:05 2019
18 Epoch
Finished at  Sat Apr 13 17:15:29 2019
19 Epoch
Finished at  Sat Apr 13 17:30:41 2019
20 Epoch
Finished at  Sat Apr 13 17:45:36 2019
21 Epoch
Finished at  Sat Apr 13 18:00:47 2019
22 Epoch
Finished at  Sat Apr 13 18:16:46 2019
23 Epoch
Finished at  Sat Apr 13 18:33:09 2019


In [8]:
from keras import backend, models, layers, initializers, regularizers, constraints, optimizers
from keras import callbacks as kc
from keras import optimizers as ko

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import log_loss
import time


dense_layer_sizes = [37]
dropout_rate = 0.6
learning_rate = 0.001
n_fold = 5
batch_size = 32
epochs = 1000
patience = 100
# n_test = 100
lambd = 0.1 # L2 regularization

Using TensorFlow backend.


In [12]:
def build_mlp_model(input_shape):
	X_input = layers.Input(input_shape)
	X = layers.Dropout(dropout_rate, seed = 7)(X_input)
	# First dense layer
	X = layers.Dense(dense_layer_sizes[0], name = 'dense0')(X)
	X = layers.BatchNormalization(name = 'bn0')(X)
	X = layers.Activation('relu')(X)
	X = layers.Dropout(dropout_rate, seed = 7)(X)

	# Output layer
	X = layers.Dense(3, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X)
	X = layers.Activation('softmax')(X)

	# Create model
	model = models.Model(input = X_input, output = X, name = "classif_model")
	return model

In [13]:
def parse_json(embeddings):
	'''
	Parses the embeddigns given by BERT, and suitably formats them to be passed to the MLP model

	Input: embeddings, a DataFrame containing contextual embeddings from BERT, as well as the labels for the classification problem
	columns: "emb_A": contextual embedding for the word A
	         "emb_B": contextual embedding for the word B
	         "emb_P": contextual embedding for the pronoun
	         "label": the answer to the coreference problem: "A", "B" or "NEITHER"

	Output: X, a numpy array containing, for each line in the GAP file, the concatenation of the embeddings of the target words
	        Y, a numpy array containing, for each line in the GAP file, the one-hot encoded answer to the coreference problem
	'''
	embeddings.sort_index(inplace = True) # Sorting the DataFrame, because reading from the json file messed with the order
	X = np.zeros((len(embeddings),3*1024))
	Y = np.zeros((len(embeddings), 3))

	# Concatenate features
	for i in range(len(embeddings)):
		A = np.array(embeddings.loc[i,"emb_A"])
		B = np.array(embeddings.loc[i,"emb_B"])
		P = np.array(embeddings.loc[i,"emb_P"])
		X[i] = np.concatenate((A,B,P))

	# One-hot encoding for labels
	for i in range(len(embeddings)):
		label = embeddings.loc[i,"label"]
		if label == "A":
			Y[i,0] = 1
		elif label == "B":
			Y[i,1] = 1
		else:
			Y[i,2] = 1

	return X, Y

In [14]:
# Read development embeddigns from json file - this is the output of Bert
for i in range(17, 24):
    tag = 'bert-large-cased-seq300-'
    embedding_size = 1024
    layer = i
    tag = tag + str(layer)

    development = pd.read_json("vector/bert_big_cased/{}contextual_embeddings_gap_development.json".format(tag))
    X_development, Y_development = parse_json(development)

    validation = pd.read_json("vector/bert_big_cased/{}contextual_embeddings_gap_validation.json".format(tag))
    X_validation, Y_validation = parse_json(validation)

    test = pd.read_json("vector/bert_big_cased/{}contextual_embeddings_gap_test.json".format(tag))
    X_test, Y_test = parse_json(test)

    # There may be a few NaN values, where the offset of a target word is greater than the max_seq_length of BERT.
    # They are very few, so I'm just dropping the rows.
    remove_test = [row for row in range(len(X_test)) if np.sum(np.isnan(X_test[row]))]
    X_test = np.delete(X_test, remove_test, 0)
    Y_test = np.delete(Y_test, remove_test, 0)

    remove_validation = [row for row in range(len(X_validation)) if np.sum(np.isnan(X_validation[row]))]
    X_validation = np.delete(X_validation, remove_validation, 0)
    Y_validation = np.delete(Y_validation, remove_validation, 0)

    # We want predictions for all development rows. So instead of removing rows, make them 0
    remove_development = [row for row in range(len(X_development)) if np.sum(np.isnan(X_development[row]))]
    X_development[remove_development] = np.zeros(3*1024)

    # Will train on data from the gap-test and gap-validation files, in total 2454 rows
    X_train = np.concatenate((X_test, X_validation), axis = 0)
    Y_train = np.concatenate((Y_test, Y_validation), axis = 0)

    # Will predict probabilities for data from the gap-development file; initializing the predictions
    prediction = np.zeros((len(X_development),3)) # testing predictions

    # Training and cross-validation
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=3)
    scores = []
    oof = np.zeros_like(Y_train)
    
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
        # split training and validation data
        print('Fold', fold_n, 'started at', time.ctime())
        X_tr, X_val = X_train[train_index], X_train[valid_index]
        Y_tr, Y_val = Y_train[train_index], Y_train[valid_index]

        # Define the model, re-initializing for each fold
        classif_model = build_mlp_model([X_train.shape[1]])
        classif_model.compile(optimizer = optimizers.Adam(lr = learning_rate), loss = "categorical_crossentropy")
        callbacks = [kc.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights = True)]

        # train the model
        classif_model.fit(x = X_tr, y = Y_tr, epochs = epochs, batch_size = batch_size, callbacks = callbacks, validation_data = (X_val, Y_val), verbose = 0)

        # make predictions on validation and test data
        pred_valid = classif_model.predict(x = X_val, verbose = 0)
        oof[valid_index] = pred_valid
        pred = classif_model.predict(x = X_development, verbose = 0)

        # oof[valid_index] = pred_valid.reshape(-1,)
        scores.append(log_loss(Y_val, pred_valid))
        prediction += pred
    prediction /= n_fold

    # Print CV scores, as well as score on the test data
    print("In the {} layer".format(i))
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    print(scores)
    print("Test score:", log_loss(Y_development,prediction))
    
    # Write the prediction to file for submission
    oof_df = pd.DataFrame(oof)
    oof_df.to_csv("oof/oof_bert_cased_large_1024_seqlen300-L{}.csv".format(layer), index=False)
    
    submission = pd.read_csv("data/sample_submission_stage_1.csv", index_col = "ID")
    submission["A"] = prediction[:,0]
    submission["B"] = prediction[:,1]
    submission["NEITHER"] = prediction[:,2]
    submission.to_csv("outputs/submission_bert_cased_large_1024_seqlen300-L{}.csv".format(layer))

Fold 0 started at Sat Apr 13 18:37:20 2019
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


  from ipykernel import kernelapp as app


Fold 1 started at Sat Apr 13 18:38:06 2019
Fold 2 started at Sat Apr 13 18:39:03 2019
Fold 3 started at Sat Apr 13 18:39:39 2019
Fold 4 started at Sat Apr 13 18:40:40 2019
In the 17 layer
CV mean score: 0.4829, std: 0.0397.
[0.4346496581889228, 0.44157181763493675, 0.5370932914761926, 0.5123143833008907, 0.4888957987282213]
Test score: 0.43846707336013385
Fold 0 started at Sat Apr 13 18:42:06 2019
Fold 1 started at Sat Apr 13 18:43:00 2019
Fold 2 started at Sat Apr 13 18:44:10 2019
Fold 3 started at Sat Apr 13 18:45:14 2019
Fold 4 started at Sat Apr 13 18:45:57 2019
In the 18 layer
CV mean score: 0.4581, std: 0.0343.
[0.4193595699975912, 0.4183773830971185, 0.49989506578700443, 0.4901620703687713, 0.46277756677692217]
Test score: 0.42972178614517226
Fold 0 started at Sat Apr 13 18:46:55 2019
Fold 1 started at Sat Apr 13 18:48:07 2019
Fold 2 started at Sat Apr 13 18:49:36 2019
Fold 3 started at Sat Apr 13 18:50:43 2019
Fold 4 started at Sat Apr 13 18:51:38 2019
In the 19 layer
CV mean s

# Records

* Layer 01 CV mean score: 0.8874, std: 0.0154. Test score: 0.8495820849324255
* Layer 02 CV mean score: 0.8700, std: 0.0127. Test score: 0.8438746537152055
* Layer 03 CV mean score: 0.8042, std: 0.0209. Test score: 0.7804143238337299
* Layer 04 CV mean score: 0.7707, std: 0.0307. Test score: 0.7545189016261488
* Layer 05 CV mean score: 0.7750, std: 0.0202. Test score: 0.7548036766969642
* Layer 06 CV mean score: 0.7625, std: 0.0147. Test score: 0.7379605187821101
* Layer 07 CV mean score: 0.7603, std: 0.0148. Test score: 0.7383950867017735
* Layer 08 CV mean score: 0.7489, std: 0.0168. Test score: 0.7151597505941151
* Layer 09 CV mean score: 0.6968, std: 0.0223. Test score: 0.6628560374140844
* Layer 10 CV mean score: 0.6388, std: 0.0300. Test score: 0.6216318814585869
* Layer 11 CV mean score: 0.6115, std: 0.0214. Test score: 0.5907616362240837
* Layer 12 CV mean score: 0.5796, std: 0.0232. Test score: 0.5618664425925821
* Layer 13 CV mean score: 0.5436, std: 0.0287. Test score: 0.5226838338351586
* Layer 14 CV mean score: 0.5089, std: 0.0334. Test score: 0.47300375639454456
* Layer 15 CV mean score: 0.4669, std: 0.0401. Test score: 0.4352659449041463
* Layer 16 CV mean score: 0.4499, std: 0.0456. Test score: 0.4087953151634497
* Layer 17 CV mean score: 0.4361, std: 0.0257. Test score: 0.39111295910973526
* Layer 18 CV mean score: 0.4297, std: 0.0270. Test score: 0.3832015453128307
* Layer 19 CV mean score: 0.4181, std: 0.0243. Test score: 0.37968471044825763
* Layer 20 CV mean score: 0.4457, std: 0.0240. Test score: 0.40375841880593566
* Layer 21 CV mean score: 0.4636, std: 0.0275. Test score: 0.42320501486081
* Layer 22 CV mean score: 0.4882, std: 0.0237. Test score: 0.44682373979905493
* Layer 23 CV mean score: 0.5017, std: 0.0190. Test score: 0.4601968420633977
* Layer 24 CV mean score: 0.5058, std: 0.0153. Test score: 0.46330062780985837