In [1]:
import re
import nltk
import data_helpers
import spacy
from spacy import displacy
% matplotlib inline

## Load Data

In [None]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a space;
      2. the clitics 's, 've, n't, 're, 'd and 'll are split off the preceding word;
      3. ``,`` ``!`` ``(`` ``)`` ``?`` are padded with spaces so they become separate tokens;
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    :param string: raw sentence text.
    :return: the cleaned, lower-cased, space-separated string.
    """
    # Anything that is not whitelisted is replaced by a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach clitics from the word they are attached to.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Pad punctuation so it is tokenized on its own.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement text must be the bare character: a backslash in the
    # replacement (e.g. " \( ") is copied verbatim by re.sub and would end up
    # in the output as a literal "\(" token.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse the extra whitespace introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [None]:
def labels_mapping(relation):
    """Translate a relation label string into its integer class id.

    'Other' is class 0. Every directed relation then takes two consecutive
    ids: the (e1,e2) direction first, the (e2,e1) direction second.

    :param relation: relation label exactly as written in the data files.
    :return: integer id in the range 0..18.
    :raises KeyError: if the label is not one of the known relations.
    """
    # Position in this tuple is the class id.
    ordered_labels = (
        'Other',
        'Message-Topic(e1,e2)', 'Message-Topic(e2,e1)',
        'Product-Producer(e1,e2)', 'Product-Producer(e2,e1)',
        'Instrument-Agency(e1,e2)', 'Instrument-Agency(e2,e1)',
        'Entity-Destination(e1,e2)', 'Entity-Destination(e2,e1)',
        'Cause-Effect(e1,e2)', 'Cause-Effect(e2,e1)',
        'Component-Whole(e1,e2)', 'Component-Whole(e2,e1)',
        'Entity-Origin(e1,e2)', 'Entity-Origin(e2,e1)',
        'Member-Collection(e1,e2)', 'Member-Collection(e2,e1)',
        'Content-Container(e1,e2)', 'Content-Container(e2,e1)',
    )
    label_to_id = {label: class_id for class_id, label in enumerate(ordered_labels)}
    return label_to_id[relation]

In [None]:
def read_data(path):
    """Read a text file and return its lines, each stripped of surrounding whitespace.

    :param path: path of the file to read (opened in text mode).
    :return: list of stripped lines in file order; blank lines are kept as ''.
    """
    stripped_lines = []
    with open(path, "r") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def _extract_entities(tagged_sentence, record_id):
    """Split a tagged sentence into plain text and its two marked entities.

    The tags are matched on the whole sentence rather than per whitespace
    token, so multi-word entities and tags that touch punctuation are handled.

    :param tagged_sentence: sentence containing <e1>...</e1> and <e2>...</e2>.
    :param record_id: record identifier, used only in the error message.
    :return: (sentence_without_tags, entity_1_text, entity_2_text).
    :raises ValueError: if either entity markup is missing.
    """
    entities = []
    for tag in ("e1", "e2"):
        found = re.search(r"<{0}>(.*?)</{0}>".format(tag), tagged_sentence)
        if found is None:
            raise ValueError(
                "record {0}: missing <{1}>...</{1}> markup".format(record_id, tag))
        entities.append(found.group(1).strip())
    # Tags become a space so neighbouring text never fuses with the entity;
    # clean_str collapses the extra whitespace afterwards.
    plain_sentence = re.sub(r"</?e[12]>", " ", tagged_sentence)
    return plain_sentence, entities[0], entities[1]


def load_train_data(path):
    """Load the training file, which stores one record per block of four lines.

    Line 0 of a block is ``ID<TAB>"tagged sentence"``, line 1 is the relation
    label; the remaining two lines of the block are not used.

    :param path: path of the training file.
    :return: (data, data_label) where each ``data`` item is
        [ID, cleaned_sentence, entity_1, entity_2] and each ``data_label``
        item is [ID, relation, relation_label].
    :raises ValueError: if a sentence lacks the <e1> or <e2> markup.
    :raises KeyError: if a relation label is unknown to labels_mapping.
    """
    data = []
    data_label = []
    lines = read_data(path)
    for idx in range(0, len(lines), 4):
        # Tolerate blank lines where a record would start (e.g. trailing
        # newlines at the end of the file) instead of failing on the split.
        if not lines[idx]:
            continue
        fields = lines[idx].split("\t")
        ID = fields[0]

        # [1:-1] drops the first and last character of the sentence field
        # (the surrounding double quotes).
        sentence, entity_1, entity_2 = _extract_entities(fields[1][1:-1], ID)
        sentence = clean_str(sentence)

        relation = lines[idx + 1]
        relation_label = labels_mapping(relation)
        data.append([ID, sentence, entity_1, entity_2])
        data_label.append([ID, relation, relation_label])
    return data, data_label

def load_test_data(path):
    """Load the test file, which stores one ``ID<TAB>"tagged sentence"`` record per line.

    :param path: path of the test file.
    :return: list of [ID, cleaned_sentence, entity_1, entity_2] items.
    :raises ValueError: if a sentence lacks the <e1> or <e2> markup.
    """
    data = []
    lines = read_data(path)
    for line in lines:
        # Skip blank lines rather than failing on the split below.
        if not line:
            continue
        fields = line.split("\t")
        ID = fields[0]

        # [1:-1] drops the first and last character of the sentence field
        # (the surrounding double quotes).
        sentence, entity_1, entity_2 = _extract_entities(fields[1][1:-1], ID)
        sentence = clean_str(sentence)
        data.append([ID, sentence, entity_1, entity_2])
    return data

def load_test_answer(path):
    """Load the answer key, one ``ID<TAB>relation`` record per line.

    :param path: path of the answer-key file.
    :return: list of [ID, relation, relation_label] items in file order.
    """
    answers = []
    for line in read_data(path):
        fields = line.split("\t")
        record_id = fields[0]
        relation = fields[1]
        answers.append([record_id, relation, labels_mapping(relation)])
    return answers

In [None]:
# Parse the three data files with the loaders defined above.
# train / test items are [ID, sentence, entity_1, entity_2];
# train_label / test_label items are [ID, relation, relation_label].
train, train_label = load_train_data('data/TRAIN_FILE.txt')
test = load_test_data('data/TEST_FILE.txt')
test_label = load_test_answer('data/answer_key.txt')

# Sanity check: report how many instances each split contains.
print("number of training instances:", len(train))
print("number of testing instances:", len(test))

In [None]:
# Peek at the first training instance and its matching label record.
print(train[0])
print(train_label[0])

In [None]:
# Peek at the first test instance and its matching answer-key record.
print(test[0])
print(test_label[0])

### Features

In [2]:
# Load the data through the project's data_helpers module.
# NOTE(review): the meaning of the four returned values is inferred from the
# variable names only (texts, two position features, labels) — confirm against
# data_helpers.load_data_and_labels.
# Note that the test split is read from TEST_FILE_FULL.txt here, not TEST_FILE.txt.
x_train_text, pos1_train, pos2_train, labels_train = data_helpers.load_data_and_labels('data/TRAIN_FILE.txt')
x_test_text, pos1_test, pos2_test, labels_test = data_helpers.load_data_and_labels('data/TEST_FILE_FULL.txt')
# Train texts followed by test texts.
# NOTE(review): assumes both are lists so that `+` concatenates — confirm.
x_total = x_train_text + x_test_text

1. Dependency parse

In [23]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is reused by the cells below.
nlp = spacy.load('en_core_web_sm')

In [26]:
# Pick one training sentence to walk through; the bare name on the last line
# makes the notebook display it.
demo_sentence = x_train_text[4]
demo_sentence

'the student association is the voice of the undergraduate student population of the state university of new york at buffalo'

In [27]:

# Run the spaCy pipeline on the demo sentence; `doc` is reused by later cells.
doc = nlp(demo_sentence)

# One printed row per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, shape, is_alpha, is_stop, entity type and the chain of
# ancestors in the dependency tree.
# NOTE(review): token.ent_type is printed as an integer (0 in the recorded
# output); token.ent_type_ may be what was intended — confirm.
for token in doc:
    ancestors = list(token.ancestors)
    print(
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop, token.ent_type, ancestors,
    )

# Optional dependency-tree visualisation, left disabled:
# displacy.serve(doc, style='dep')

the the DET DT det xxx True True 0 [association, is]
student student NOUN NN compound xxxx True False 0 [association, is]
association association NOUN NN nsubj xxxx True False 0 [is]
is be VERB VBZ ROOT xx True True 0 []
the the DET DT det xxx True True 0 [voice, is]
voice voice NOUN NN attr xxxx True False 0 [is]
of of ADP IN prep xx True True 0 [voice, is]
the the DET DT det xxx True True 0 [population, of, voice, is]
undergraduate undergraduate ADJ JJ amod xxxx True False 0 [student, population, of, voice, is]
student student NOUN NN compound xxxx True False 0 [population, of, voice, is]
population population NOUN NN pobj xxxx True False 0 [of, voice, is]
of of ADP IN prep xx True True 0 [population, of, voice, is]
the the DET DT det xxx True True 0 [university, of, population, of, voice, is]
state state NOUN NN compound xxxx True False 0 [university, of, population, of, voice, is]
university university NOUN NN pobj xxxx True False 0 [of, population, of, voice, is]
of of ADP IN prep

In [31]:
# Inspect the direct syntactic children of one token of the parsed demo
# sentence (index 2 is 'association').
position_id = 2
print([token.text for token in doc[position_id].lefts])  # children to the left: ['the', 'student'] for this sentence
print([token.text for token in doc[position_id].rights])  # children to the right: [] for this sentence
print(doc[position_id].n_lefts)  # number of left children: 2 for this sentence
print(doc[position_id].n_rights)  # number of right children: 0 for this sentence

['the', 'student']
[]
2
0
