In [1]:
import re
import nltk

## Load Data

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than ASCII letters, digits and ``( ) , ! ? ' ` ``
         is replaced by a space;
      2. the clitics ``'s 've n't 're 'd 'll`` are split off with a leading space;
      3. ``, ! ( ) ?`` are surrounded by spaces so they become separate tokens;
      4. whitespace runs are collapsed, the result is stripped and lower-cased.

    Unlike the upstream snippet, the punctuation tokens are emitted as plain
    ``(``, ``)`` and ``?``. The upstream replacement strings (``" \\( "`` etc.)
    were invalid escape sequences that ``re.sub`` passed through verbatim, so
    the output contained literal backslashes.

    Args:
        string: raw text to normalise.

    Returns:
        The cleaned, lower-cased, single-space-separated string.
    """
    # Drop everything outside the kept character set first, so the rules
    # below only ever see letters, digits and the punctuation they handle.
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # (pattern, replacement) pairs; order matters and mirrors the original.
    # Replacements contain no backslashes, so re.sub inserts them literally.
    rules = (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),  # collapse the padding introduced above
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

In [3]:
def labels_mapping(relation):
    """Return the integer class id of a relation label string.

    'Other' is id 0. Every relation type then occupies two consecutive ids:
    the ``(e1,e2)`` direction first, the ``(e2,e1)`` direction second.

    Raises:
        KeyError: if ``relation`` is not one of the 19 known labels.
    """
    # Labels listed in id order: position in this tuple == class id.
    ordered_labels = (
        'Other',
        'Message-Topic(e1,e2)',
        'Message-Topic(e2,e1)',
        'Product-Producer(e1,e2)',
        'Product-Producer(e2,e1)',
        'Instrument-Agency(e1,e2)',
        'Instrument-Agency(e2,e1)',
        'Entity-Destination(e1,e2)',
        'Entity-Destination(e2,e1)',
        'Cause-Effect(e1,e2)',
        'Cause-Effect(e2,e1)',
        'Component-Whole(e1,e2)',
        'Component-Whole(e2,e1)',
        'Entity-Origin(e1,e2)',
        'Entity-Origin(e2,e1)',
        'Member-Collection(e1,e2)',
        'Member-Collection(e2,e1)',
        'Content-Container(e1,e2)',
        'Content-Container(e2,e1)',
    )
    label_to_id = {label: class_id for class_id, label in enumerate(ordered_labels)}
    return label_to_id[relation]

In [4]:
def read_data(path):
    """Read the text file at ``path`` and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(path, "r") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Inline entity markup found in the sentence field: <e1>...</e1> and <e2>...</e2>.
_E1_SPAN = re.compile(r"<e1>(.*?)</e1>")
_E2_SPAN = re.compile(r"<e2>(.*?)</e2>")
_ENTITY_TAG = re.compile(r"</?e[12]>")


def _parse_tagged_line(line):
    """Parse one ``ID<TAB>sentence`` line into ``[ID, sentence, entity_1, entity_2]``.

    The entity texts are taken from the ``<e1>``/``<e2>`` spans of the whole
    sentence rather than from whitespace tokens. This keeps multi-word
    entities intact and finds tags glued to punctuation such as
    ``(<e2>x</e2>)``. It also stops an entity from a previous record being
    reused when a tag is not at the start of a token.

    Args:
        line: a stripped input line; the second tab-separated field is the
            sentence, optionally wrapped in double quotes.

    Returns:
        ``[ID, sentence, entity_1, entity_2]`` where ``sentence`` has the tags
        removed and has been passed through ``clean_str``; the entity strings
        are returned as written in the file (not cleaned or lower-cased).

    Raises:
        ValueError: if the line has no tab-separated sentence field or the
            sentence lacks a complete ``<e1>`` or ``<e2>`` span.
    """
    fields = line.split("\t")
    if len(fields) < 2:
        raise ValueError("expected '<ID>\\t<sentence>', got: {!r}".format(line))
    ID, sentence = fields[0], fields[1]

    # Strip the surrounding quotes only when they are really there, instead of
    # blindly chopping the first and last character.
    if len(sentence) >= 2 and sentence[0] == '"' and sentence[-1] == '"':
        sentence = sentence[1:-1]

    first = _E1_SPAN.search(sentence)
    second = _E2_SPAN.search(sentence)
    if first is None or second is None:
        raise ValueError(
            "sentence {!r} is missing an <e1> or <e2> span".format(ID))

    # Remove the markup before cleaning so no stray 'e1'/'e2' tokens survive.
    sentence = clean_str(_ENTITY_TAG.sub("", sentence))
    return [ID, sentence, first.group(1), second.group(1)]


def load_train_data(path):
    """Load the labelled training file.

    The file is read in groups of four lines: the first line of each group is
    ``ID<TAB>sentence`` and the second is the relation label; the remaining
    two lines of the group are ignored. Groups whose first line is blank
    (e.g. trailing padding at the end of the file) are skipped.

    Args:
        path: path of the training file.

    Returns:
        A pair ``(data, data_label)`` of parallel lists:
        ``data`` holds ``[ID, sentence, entity_1, entity_2]`` rows and
        ``data_label`` holds ``[ID, relation, relation_label]`` rows.

    Raises:
        ValueError: for a malformed sentence line (see ``_parse_tagged_line``).
        KeyError: if a relation label is unknown to ``labels_mapping``.
    """
    data = []
    data_label = []
    lines = read_data(path)
    for idx in range(0, len(lines), 4):
        if not lines[idx]:
            continue
        record = _parse_tagged_line(lines[idx])
        relation = lines[idx + 1]
        data.append(record)
        data_label.append([record[0], relation, labels_mapping(relation)])
    return data, data_label


def load_test_data(path):
    """Load the unlabelled test file, one ``ID<TAB>sentence`` record per line.

    Blank lines are skipped.

    Args:
        path: path of the test file.

    Returns:
        A list of ``[ID, sentence, entity_1, entity_2]`` rows.

    Raises:
        ValueError: for a malformed sentence line (see ``_parse_tagged_line``).
    """
    return [_parse_tagged_line(line) for line in read_data(path) if line]

def load_test_answer(path):
    """Load the test answer key, one ``ID<TAB>relation`` record per line.

    Returns:
        A list of ``[ID, relation, relation_label]`` rows, where
        ``relation_label`` is the id given by ``labels_mapping``.
    """
    answers = []
    for row in read_data(path):
        fields = row.split("\t")
        sample_id, relation = fields[0], fields[1]
        answers.append([sample_id, relation, labels_mapping(relation)])
    return answers

In [5]:
# Load the training sentences with their labels, the test sentences,
# and the test answer key from the local data/ directory.
train, train_label = load_train_data('data/TRAIN_FILE.txt')
test = load_test_data('data/TEST_FILE.txt')
test_label = load_test_answer('data/answer_key.txt')

# Report how many records each split contains.
print("number of training instances:", len(train))
print("number of testing instances:", len(test))

number of training instances: 8000
number of testing instances: 2717


In [6]:
# Inspect the first training record ([ID, sentence, entity_1, entity_2])
# and its label row ([ID, relation, relation_label]).
print(train[0])
print(train_label[0])

['1', 'the system as described above has its greatest application in an arrayed configuration of antenna elements', 'configuration', 'elements']
['1', 'Component-Whole(e2,e1)', 12]


In [69]:
# Inspect the first test record ([ID, sentence, entity_1, entity_2])
# and the matching answer-key row ([ID, relation, relation_label]).
print(test[0])
print(test_label[0])

['8001', 'the most common audits were about waste and recycling', 'audits', 'waste']
['8001', 'Message-Topic(e1,e2)', 1]
