In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import gc
print(os.listdir("../input"))


['gap-validation.tsv', 'test_stage_1.tsv', 'sample_submission_stage_1.csv', 'gap-test.tsv', 'gap-development.tsv']


In [2]:
DATA_ROOT = '../input/'
GAP_DATA_FOLDER = os.path.join(DATA_ROOT, 'gap-coreference')
SUB_DATA_FOLDER = os.path.join(DATA_ROOT, 'gendered-pronoun-resolution')
# FAST_TEXT_DATA_FOLDER = os.path.join(DATA_ROOT, 'fasttext-crawl-300d-2m')

In [4]:
test_df_path = os.path.join(DATA_ROOT, 'gap-development.tsv')
train_df_path = os.path.join(DATA_ROOT, 'gap-test.tsv')
dev_df_path = os.path.join(DATA_ROOT, 'gap-validation.tsv')

train_df = pd.read_csv(train_df_path, sep='\t')
test_df = pd.read_csv(test_df_path, sep='\t')
dev_df = pd.read_csv(dev_df_path, sep='\t')

# pd.options.display.max_colwidth = 1000

In [5]:
test_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,False,De la Sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,False,Rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera


In [9]:
spacy_model = "en_core_web_sm"

In [7]:
from spacy.lang.en import English
from spacy.pipeline import DependencyParser
import spacy
from nltk import Tree

from keras.preprocessing import sequence
from keras.preprocessing import text as ktext

Using TensorFlow backend.


In [10]:
nlp = spacy.load(spacy_model)

def bs(list_, target_):
    lo, hi = 0, len(list_) -1
    
    while lo < hi:
        mid = lo + int((hi - lo) / 2)
        
        if target_ < list_[mid]:
            hi = mid
        elif target_ > list_[mid]:
            lo = mid + 1
        else:
            return mid + 1
    return lo

def bs_(list_, target_):
    lo, hi = 0, len(list_) -1
    
    while lo < hi:
        mid = lo + int((hi - lo) / 2)
        
        if target_ < list_[mid]:
            hi = mid
        elif target_ > list_[mid]:
            lo = mid + 1
        else:
            return mid
    return lo

def ohe_dist(dist, buckets):
    idx = bs_(buckets, dist)
    oh = np.zeros(shape=(len(buckets),), dtype=np.float32)
    oh[idx] = 1
    
    return oh

In [11]:
num_pos_features = 45

In [12]:
def extrac_positional_features(text, char_offset1, char_offset2):
    doc = nlp(text)
    max_len = 64
    
    # char offset to token offset
    lens = [token.idx for token in doc]
    mention_offset1 = bs(lens, char_offset1) - 1
    mention_offset2 = bs(lens, char_offset2) - 1
    
    # token offset to sentence offset
    lens = [len(sent) for sent in doc.sents]
    acc_lens = [len_ for len_ in lens]
    pre_len = 0
    for i in range(0, len(acc_lens)):
        pre_len += acc_lens[i]
        acc_lens[i] = pre_len
    sent_index1 = bs(acc_lens, mention_offset1)
    sent_index2 = bs(acc_lens, mention_offset2)
    
    sent1 = list(doc.sents)[sent_index1]
    sent2 = list(doc.sents)[sent_index2]
    
    # buckets
    bucket_dist = [1, 2, 3, 4, 5, 8, 16, 32, 64]
    
    # relative distance
    dist = mention_offset2 - mention_offset1
    dist_oh = ohe_dist(dist, bucket_dist)
    
    # buckets
    bucket_pos = [0, 1, 2, 3, 4, 5, 8, 16, 32]
    
    # absolute position in the sentence
    sent_pos1 = mention_offset1 + 1
    if sent_index1 > 0:
        sent_pos1 = mention_offset1 - acc_lens[sent_index1-1]
    sent_pos_oh1 = ohe_dist(sent_pos1, bucket_pos)
    sent_pos_inv1 = len(sent1) - sent_pos1
    assert sent_pos_inv1 >= 0
    sent_pos_inv_oh1 = ohe_dist(sent_pos_inv1, bucket_pos)
    
    sent_pos2 = mention_offset2 + 1
    if sent_index2 > 0:
        sent_pos2 = mention_offset2 - acc_lens[sent_index2-1]
    sent_pos_oh2 = ohe_dist(sent_pos2, bucket_pos)
    sent_pos_inv2 = len(sent2) - sent_pos2
    if sent_pos_inv2 < 0:
        print(sent_pos_inv2)
        print(len(sent2))
        print(sent_pos2)
        raise ValueError
    sent_pos_inv_oh2 = ohe_dist(sent_pos_inv2, bucket_pos)
    
    sent_pos_ratio1 = sent_pos1 / len(sent1)
    sent_pos_ratio2 = sent_pos2 / len(sent2)
    
    return dist_oh, sent_pos_oh1, sent_pos_oh2, sent_pos_inv_oh1, sent_pos_inv_oh2

In [13]:
def create_dist_features(df, text_column, pronoun_offset_column, name_offset_column):
    text_offset_list = df[[text_column, pronoun_offset_column, name_offset_column]].values.tolist()
    num_features = num_pos_features
    
    pos_feature_matrix = np.zeros(shape=(len(text_offset_list), num_features))
    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        dist_oh, sent_pos_oh1, sent_pos_oh2, sent_pos_inv_oh1, sent_pos_inv_oh2 = extrac_positional_features(text_offset[0], text_offset[1], text_offset[2])
        
        feature_index = 0
        pos_feature_matrix[text_offset_index, feature_index:feature_index+len(dist_oh)] = np.asarray(dist_oh)
        feature_index += len(dist_oh)
        pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_oh1)] = np.asarray(sent_pos_oh1)
        feature_index += len(sent_pos_oh1)
        pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_oh2)] = np.asarray(sent_pos_oh2)
        feature_index += len(sent_pos_oh2)
        pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_inv_oh1)] = np.asarray(sent_pos_inv_oh1)
        feature_index += len(sent_pos_inv_oh1)
        pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_inv_oh2)] = np.asarray(sent_pos_inv_oh2)
        feature_index += len(sent_pos_inv_oh2)
    
    return pos_feature_matrix

In [14]:
max_len = 50 # longer than 99% of the sentences

In [15]:
seq_list = list()
def extract_sents(text, char_offset_p, char_offset_a, char_offset_b, id):
    global max_len
    global seq_list
    
    seq_list.append(list())
    
    doc = nlp(text)
    token_lens = [token.idx for token in doc]
    
    char_offsets = [char_offset_p, char_offset_a, char_offset_b]
    sent_list = list()
    
    for char_offset in char_offsets:
        # char offset to token offset
        mention_offset = bs(token_lens, char_offset) - 1
        # mention_word
        mention = doc[mention_offset]
    
        # token offset to sentence offset
        lens = [len(sent) for sent in doc.sents]
        acc_lens = [len_ for len_ in lens]
        pre_len = 0
        for i in range(0, len(acc_lens)):
            pre_len += acc_lens[i]
            acc_lens[i] = pre_len
        sent_index = bs(acc_lens, mention_offset)
        # mention sentence
        sent = list(doc.sents)[sent_index]
        
        # absolute position in the sentence
        sent_pos = mention_offset + 1
        if sent_index > 0:
            sent_pos = mention_offset - acc_lens[sent_index-1]
        
        # clip the sentence if it is longer than max length
        if len(sent) > max_len:
            # make sure the mention is in the sentence span
            if sent_pos < max_len-1:
                sent_list.append(sent[0:max_len].text)
                sent_list.append(sent_pos)
                seq_list[-1].append(sent[0:max_len])
            else:
                sent_list.append(sent[sent_pos-max_len+2 : min(sent_pos+2, len(sent))].text)
                sent_list.append(max_len-2)
                seq_list[-1].append(sent[sent_pos-max_len+2 : min(sent_pos+2, len(sent))])
        else:
            sent_list.append(sent.text)
            sent_list.append(sent_pos)
            seq_list[-1].append(sent)
        
    return pd.Series([id] + sent_list, index=['ID', 'Pronoun-Sent', 'Pronoun-Sent-Offset', 'A-Sent', 'A-Sent-Offset', 'B-Sent', 'B-Sent-Offset'])

def add_sent_columns(df, text_column, pronoun_offset_column, a_offset_column, b_offset_column):
    global seq_list
    seq_list = list()
    sent_df = df.apply(lambda row: extract_sents(row.loc[text_column], row[pronoun_offset_column], row[a_offset_column], row[b_offset_column], row['ID']), axis=1)
    df = df.join(sent_df.set_index('ID'), on='ID')
    return df, seq_list

In [16]:
seq_list = list()
train_df, train_tokenized = add_sent_columns(train_df, 'Text', 'Pronoun-offset', 'A-offset', 'B-offset')
seq_list = list()
test_df, test_tokenized = add_sent_columns(test_df, 'Text', 'Pronoun-offset', 'A-offset', 'B-offset')
seq_list = list()
dev_df, dev_tokenized = add_sent_columns(dev_df, 'Text', 'Pronoun-offset', 'A-offset', 'B-offset')

# df apply will call the first row twice, remove the first one
train_tokenized = train_tokenized[1:]
test_tokenized = test_tokenized[1:]
dev_tokenized = dev_tokenized[1:]

KeyboardInterrupt: 

In [17]:
train_df

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL,Pronoun-Sent,Pronoun-Sent-Offset,A-Sent,A-Sent-Offset,B-Sent,B-Sent-Offset
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner,His cousin is Minnesota Wild's alternate capta...,0,Former NHLer Gary Suter and Olympic-medalist B...,8,Former NHLer Gary Suter and Olympic-medalist B...,11
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso,During the 1981 ``Nacional'' tournament (which...,28,During the 1981 ``Nacional'' tournament (which...,15,During the 1981 ``Nacional'' tournament (which...,21
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh,He was ambushed with his pregnant wife on his ...,0,"A contributor to Iraq's liberation, Ali Aladha...",7,"A contributor to Iraq's liberation, Ali Aladha...",15
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta,During his trial Pisciotta could not account f...,1,"However the MPs Mattarella, Alliata and Marche...",5,During his trial Pisciotta could not account f...,3
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers,"A day before his discharge, Eddie is assigned ...",3,"A day before his discharge, Eddie is assigned ...",6,"A day before his discharge, Eddie is assigned ...",31
5,test-6,The others were Adam Baldwin (Jayne Cobb in Fi...,her,349,Jewel Staite,281,True,Keller,310,False,http://en.wikipedia.org/wiki/Jennifer_Keller,The producers decided to cast Jewel Staite for...,17,The producers decided to cast Jewel Staite for...,5,The producers decided to cast Jewel Staite for...,11
6,test-7,"Allison Fischer (born October 19, 1988) is an ...",She,365,Allison,232,True,Grace Smythe,290,False,http://en.wikipedia.org/wiki/Allison_Fischer,She played Lady Jane in the Off-Broadway music...,0,"Also in 2000, Allison performed on Broadway at...",4,"Also in 2000, Allison performed on Broadway at...",13
7,test-8,"The monster arrives and bites Jeni's tongue, b...",her,307,Sophie,248,False,Jeni,277,True,http://en.wikipedia.org/wiki/Leprechaun:_Origins,The remaining trio head back to the cottage to...,32,The remaining trio head back to the cottage to...,21,The remaining trio head back to the cottage to...,26
8,test-9,"On June 4, 1973 at the Felt Forum, Madison Squ...",he,227,Malave,124,True,Greg Joiner,169,False,http://en.wikipedia.org/wiki/Edwin_Malave,Then he faced former World Lightweight Champio...,1,"Malave took a fight in Boston, Mass. against G...",0,"Malave took a fight in Boston, Mass. against G...",9
9,test-10,Go Away (Lorrie Morgan song) ``Go Away'' is a ...,her,223,Cathy Majeski,78,False,Lorrie Morgan,154,True,http://en.wikipedia.org/wiki/Go_Away_(Lorrie_M...,It was released in July 1997 as the first sing...,11,Go Away (Lorrie Morgan song) ``Go Away'' is a ...,20,Go Away (Lorrie Morgan song) ``Go Away'' is a ...,33
