# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Preprocessing</a></div><div class="lev2 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev3 toc-item"><a href="#Load-Train-Data" data-toc-modified-id="Load-Train-Data-111"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Load Train Data</a></div><div class="lev3 toc-item"><a href="#Train-Data-Sample" data-toc-modified-id="Train-Data-Sample-112"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>Train Data Sample</a></div><div class="lev3 toc-item"><a href="#Load-Test-Data" data-toc-modified-id="Load-Test-Data-113"><span class="toc-item-num">1.1.3&nbsp;&nbsp;</span>Load Test Data</a></div><div class="lev3 toc-item"><a href="#Test-Data-Sample" data-toc-modified-id="Test-Data-Sample-114"><span class="toc-item-num">1.1.4&nbsp;&nbsp;</span>Test Data Sample</a></div><div class="lev3 toc-item"><a href="#Relation-Types" data-toc-modified-id="Relation-Types-115"><span class="toc-item-num">1.1.5&nbsp;&nbsp;</span>Relation Types</a></div><div class="lev2 toc-item"><a href="#Participle" data-toc-modified-id="Participle-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Participle</a></div><div class="lev2 toc-item"><a href="#Make-Adjacency-List" data-toc-modified-id="Make-Adjacency-List-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Make Adjacency List</a></div><div class="lev3 toc-item"><a href="#Make-Adjacency-List-of-Train-Data" data-toc-modified-id="Make-Adjacency-List-of-Train-Data-131"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Make Adjacency List of Train Data</a></div><div class="lev3 toc-item"><a href="#Make-Adjacency-List-of-Test-Data" data-toc-modified-id="Make-Adjacency-List-of-Test-Data-132"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Make Adjacency List of Test Data</a></div><div class="lev1 toc-item"><a href="#Build-Dateset" data-toc-modified-id="Build-Dateset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build Dateset</a></div><div class="lev1 toc-item"><a href="#Save-Dataset" data-toc-modified-id="Save-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Save Dataset</a></div><div class="lev1 toc-item"><a href="#Checkpoint" data-toc-modified-id="Checkpoint-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Checkpoint</a></div>

# Data Preprocessing

## Load Data

### Load Train Data

In [5]:
import json

In [6]:
sentText = []
relationMentions = []
relationLabels = []
entityMentions = []
entityLabels = []
em1Text = []
em2Text = []

with open("/Users/lizhn7/Downloads/DATA/nyt/train.json") as f:
    lines = f.readlines()

for line in lines:
    item = json.loads(line)
    # Exclude "None" label
    if not all(i['label'] == 'None' for i in item['relationMentions']):
        sentText.append(item['sentText'])
        relationMentions.append(item['relationMentions'])
        entityMentions.append(item['entityMentions'])
    
relationLabels = [[i['label'].split('/')[-1] for i in rM] for rM in relationMentions]
entityLabels = [[i['text'] for i in eM] for eM in entityMentions]
em1Text = [[i['em1Text'] for i in rM] for rM in relationMentions]
em2Text = [[i['em2Text'] for i in rM] for rM in relationMentions]

In [7]:
# Clean words
replaceDict = {
               'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
               'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
               'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e', 
               'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
               'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u', 'ü': 'u',
               'ñ': 'n',
               'É': 'E'
              }
em1Text = [[''.join([replaceDict.get(i, i) for i in e]) for e in eT] for eT in em1Text]
em2Text = [[''.join([replaceDict.get(i, i) for i in e]) for e in eT] for eT in em2Text]

### Train Data Sample

In [8]:
sentText[0]

'But that spasm of irritation by a master intimidator was minor compared with what Bobby Fischer , the erratic former world chess champion , dished out in March at a news conference in Reykjavik , Iceland .'

In [9]:
relationMentions[0]

[{'em1Text': 'Bobby Fischer',
  'em2Text': 'Iceland',
  'label': '/people/person/nationality'},
 {'em1Text': 'Iceland',
  'em2Text': 'Reykjavik',
  'label': '/location/country/capital'},
 {'em1Text': 'Iceland',
  'em2Text': 'Reykjavik',
  'label': '/location/location/contains'},
 {'em1Text': 'Bobby Fischer',
  'em2Text': 'Reykjavik',
  'label': '/people/deceased_person/place_of_death'}]

In [10]:
relationLabels[0]

['nationality', 'capital', 'contains', 'place_of_death']

In [11]:
entityMentions[0]

[{'label': 'PERSON', 'start': 0, 'text': 'Bobby Fischer'},
 {'label': 'LOCATION', 'start': 1, 'text': 'Reykjavik'},
 {'label': 'LOCATION', 'start': 2, 'text': 'Iceland'}]

In [12]:
entityLabels[0]

['Bobby Fischer', 'Reykjavik', 'Iceland']

In [13]:
em1Text[0]

['Bobby Fischer', 'Iceland', 'Iceland', 'Bobby Fischer']

In [14]:
em2Text[0]

['Iceland', 'Reykjavik', 'Reykjavik', 'Reykjavik']

### Load Test Data

In [15]:
t_sentText = []
t_relationMentions = []
t_relationLabels = []
t_entityMentions = []
t_entityLabels = []
t_em1Text = []
t_em2Text = []

with open("/Users/lizhn7/Downloads/DATA/nyt/test.json") as f:
    lines = f.readlines()

for line in lines:
    item = json.loads(line)
    t_sentText.append(item['sentText'])
    t_relationMentions.append(item['relationMentions'])
    t_entityMentions.append(item['entityMentions'])
    
t_relationLabels = [[i['label'].split('/')[-1] for i in rM] for rM in t_relationMentions]
t_entityLabels = [[i['text'] for i in eM] for eM in t_entityMentions]
t_em1Text = [[i['em1Text'] for i in rM] for rM in t_relationMentions]
t_em2Text = [[i['em2Text'] for i in rM] for rM in t_relationMentions]

In [16]:
# Clean words
t_replaceDict = {
               'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
               'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
               'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e', 
               'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
               'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u', 'ü': 'u',
               'ñ': 'n',
               'É': 'E'
              }
t_em1Text = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_em1Text]
t_em2Text = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_em2Text]
t_entityLabels = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_entityLabels]

### Test Data Sample

In [17]:
t_sentText[254]

'"His idea -- one that has rarely , if ever , been tried on a large scale by a major museum -- is to collect significant pieces of midcentury residential architecture , including houses by Rudolf M. Schindler , Richard Neutra , Frank Lloyd Wright and his son Lloyd Wright , and to treat them as both museum objects and as residences for curators ."\r\n'

In [18]:
t_relationMentions[254]

[{'em1Text': 'Richard Neutra', 'em2Text': 'Lloyd Wright', 'label': 'None'},
 {'em1Text': 'Lloyd Wright', 'em2Text': 'Richard Neutra', 'label': 'None'},
 {'em1Text': 'Frank Lloyd Wright',
  'em2Text': 'Lloyd Wright',
  'label': '/people/person/children'},
 {'em1Text': 'Lloyd Wright', 'em2Text': 'Frank Lloyd Wright', 'label': 'None'},
 {'em1Text': 'Richard Neutra',
  'em2Text': 'Frank Lloyd Wright',
  'label': 'None'},
 {'em1Text': 'Frank Lloyd Wright',
  'em2Text': 'Richard Neutra',
  'label': 'None'}]

In [19]:
t_relationLabels[254]

['None', 'None', 'children', 'None', 'None', 'None']

In [20]:
t_entityMentions[254]

[{'label': 'PERSON', 'start': 1, 'text': 'Richard Neutra'},
 {'label': 'PERSON', 'start': 2, 'text': 'Frank Lloyd Wright'},
 {'label': 'PERSON', 'start': 3, 'text': 'Lloyd Wright'}]

In [21]:
t_entityLabels[254]

['Richard Neutra', 'Frank Lloyd Wright', 'Lloyd Wright']

In [22]:
t_em1Text[254]

['Richard Neutra',
 'Lloyd Wright',
 'Frank Lloyd Wright',
 'Lloyd Wright',
 'Richard Neutra',
 'Frank Lloyd Wright']

In [23]:
t_em2Text[254]

['Lloyd Wright',
 'Richard Neutra',
 'Lloyd Wright',
 'Frank Lloyd Wright',
 'Frank Lloyd Wright',
 'Richard Neutra']

###  Relation Types

In [24]:
relationTypes = list(set([r for rl in relationLabels for r in rl]))
relationTypes

['administrative_divisions',
 'place_of_birth',
 'people',
 'capital',
 'place_founded',
 'advisors',
 'neighborhood_of',
 'major_shareholder_of',
 'location',
 'country',
 'place_lived',
 'nationality',
 'founders',
 'teams',
 'contains',
 'major_shareholders',
 'religion',
 'industry',
 'place_of_death',
 'geographic_distribution',
 'children',
 'ethnicity',
 'company',
 'profession']

In [25]:
t_relationTypes = list(set([r for rl in t_relationLabels for r in rl if r != 'None']))
t_relationTypes

['country',
 'place_lived',
 'place_of_death',
 'nationality',
 'founders',
 'administrative_divisions',
 'place_of_birth',
 'children',
 'contains',
 'capital',
 'neighborhood_of',
 'company']

## Participle

In [26]:
from nltk import regexp_tokenize

In [130]:
def cut(s):
    """
    Participle
    """
    pattern = r'''
              (?x)                   # set flag to allow verbose regexps 
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
              |\d+(?:\.\d+)?%?       # numbers, incl. currency and percentages 
              |\w+(?:[-&.']\w+)*       # words w/ optional internal hyphens/apostrophe  
           '''  
    return regexp_tokenize(s, pattern=pattern)

In [131]:
sentWords = [cut(s) for s in sentText]
entlabWords = [[cut(s) for s in eL] for eL in entityLabels]
em1Words = [[cut(s) for s in eL] for eL in em1Text]
em2Words = [[cut(s) for s in eL] for eL in em2Text]
t_sentWords = [cut(s) for s in t_sentText]
t_entlabWords = [[cut(s) for s in eL] for eL in t_entityLabels]
t_em1Words = [[cut(s) for s in eL] for eL in t_em1Text]
t_em2Words = [[cut(s) for s in eL] for eL in t_em2Text]

In [29]:
#MAX_SENT_LEN = 120
#AX_ADJL_LEN = 20

In [30]:
#len([i for i, j in enumerate(sentWords) if len(j) > 120])
#max(len(j) for i, j in enumerate(t_sentWords) if len(j) > 80)

## Make Adjacency List

### Make Adjacency List of Train Data

In [27]:
from collections import Counter

In [15]:
i_entityLabels = []
for i in range(len(sentWords)):
    eL = []
    sDict = list(enumerate(sentWords[i]))
    for item in entlabWords[i]:
        j = 0
        el = []
        for e in item:
            while j < len(sDict):
                if e == sDict[j][1]:
                    el.append(sDict[j][0])
                    j += 1
                    break
                j += 1
        eL.append(el)
    i_entityLabels.append(eL)

In [18]:
i_em1Text = []
for i in range(len(sentWords)):
    temp = list(zip(sum(i_entityLabels[i], []), sum(entlabWords[i], [])))
    eM1 = []
    for ee in em1Words[i]:
        j = 0
        em1 = []
        for e in ee:
            while j < len(temp):
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] == 1:
                    em1.append(temp[j][0])
                    break
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] > 1:
                    em1.append(temp[j][0])
                    j += 1
                    break
                j += 1
        eM1.append(em1)
    i_em1Text.append(eM1)

i_em2Text = []
for i in range(len(sentWords)):
    temp = list(zip(sum(i_entityLabels[i], []), sum(entlabWords[i], [])))
    eM2 = []
    for ee in em2Words[i]:
        j = 0
        em2 = []
        for e in ee:
            while j < len(temp):
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] == 1:
                    em2.append(temp[j][0])
                    break
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] > 1:
                    em2.append(temp[j][0])
                    j += 1
                    break
                j += 1
        eM2.append(em2)
    i_em2Text.append(eM2)

In [19]:
i_em1Text

[[[14, 15], [32], [32], [14, 15]],
 [[26, 27]],
 [[9]],
 [[13, 14, 15], [13, 14, 15], [13, 14, 15]],
 [[13], [13], [12], [13]],
 [[12]],
 [[1]],
 [[24, 25, 26]],
 [[30, 31, 32]],
 [[25]],
 [[8], [8]],
 [[40, 41], [25], [25], [25]],
 [[2], [10, 11], [2], [2]],
 [[5]],
 [[23, 24]],
 [[6, 7]],
 [[9], [9], [5, 6], [8]],
 [[9, 10], [30, 31]],
 [[32], [32]],
 [[2]],
 [[3]],
 [[6, 7]],
 [[19, 20]],
 [[3, 4]],
 [[2, 3]],
 [[36, 37]],
 [[12], [12]],
 [[32, 33], [42, 43]],
 [[16, 17, 18]],
 [[2, 3], [13, 14], [15]],
 [[21, 22], [21, 22]],
 [[15, 16]],
 [[40, 41, 42]],
 [[18, 37], [18, 37]],
 [[70], [70]],
 [[22], [3], [22], [22], [3], [45]],
 [[5, 6], [5, 6], [5, 6]],
 [[16, 17]],
 [[3, 4]],
 [[0], [0]],
 [[6, 7], [6, 7], [6, 7]],
 [[17, 18]],
 [[39]],
 [[7]],
 [[7, 8], [20]],
 [[41]],
 [[23, 24]],
 [[11, 12]],
 [[21, 22]],
 [[3, 4]],
 [[8, 9]],
 [[19], [19], [19], [18]],
 [[31]],
 [[20], [1]],
 [[4, 5]],
 [[9], [9], [9], [9]],
 [[86, 87], [86, 87]],
 [[21, 22]],
 [[14], [14]],
 [[3, 4]],
 [[0, 

In [165]:
adjacencyList = []
sentence = []
new_entityLabels = []
newi_entityLabels = []
padDict = ['POI', 'RE', 'EOP']
for i in range(len(sentWords)):
    # POI means pointer
    # RE means relation label
    # EOP means end of pointer   
    aL = ''
    for j in entityLabels[i]:
        dictEn = dict((b, a) for a, b in enumerate(entityLabels[i]))
        # Exclude further "None" label
        if j in em1Text[i] and j in [it['em1Text'] for it in relationMentions[i] if it['label'].split('/')[-1] in t_relationTypes]:
            aL += j
            in_em1Text = [dictEn[item] for item in em1Text[i]]
            in_em2Text = [dictEn[item] for item in em2Text[i]]
            new_em1 = [m[0] for m in sorted(list(zip(in_em1Text, in_em2Text)))]
            new_em2 = [n[1] for n in sorted(list(zip(in_em1Text, in_em2Text)))]
            new_em1 = [entityLabels[i][e1] for e1 in new_em1]
            new_em2 = [entityLabels[i][e2] for e2 in new_em2]
            listRe = list(zip(zip(in_em1Text, in_em2Text), relationLabels[i]))
            new_rel = [r[-1] for r in sorted(listRe, key=lambda z: z[0])]
            for item in enumerate(new_em1):
                if j == item[1] and new_rel[item[0]] in t_relationTypes:
                    aL = aL + ' POI ' + new_em2[item[0]] + ' RE ' + new_rel[item[0]]
            aL += ' EOP '
    if len(sentWords[i]) > MAX_SENT_LEN: #or len(aL.split()) > MAX_ADJL_LEN:
        aL = ''
    if aL != '':
        adjacencyList.append(aL)
        sentence.append(sentWords[i])
        new_entityLabels.append(entlabWords[i])
        newi_entityLabels.append(i_entityLabels[i])

In [166]:
i_adjacencyList = []
for i in range(len(adjacencyList)):
    aL = []
    replace = dict(zip(sum(new_entityLabels[i], []), sum(newi_entityLabels[i], [])))
    aL = [replace[j] if j not in padDict+t_relationTypes else j for j in cut(adjacencyList[i])]
    #aL = [s if type(s) == str else str(s) for s in aL]
    i_adjacencyList.append(aL)

In [168]:
#[i for i in i_adjacencyList if type(i[-2]) == int]

In [24]:
#all(type(al[-2]) == int for al in i_adjacencyList)

In [19]:
#[i for i in i_adjacencyList if i[1] == 'POI']

In [20]:
#[eT[0] for eT in enumerate(em1Text) for i in eT[1] if i == 'Édith Piaf']

### Make Adjacency List of Test Data

In [169]:
# Clean words
t_sentWords = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_sentWords]

In [175]:
ti_entityLabels = []
for i in range(len(t_sentWords)):
    eL = []
    t_sDict = list(enumerate(t_sentWords[i]))
    j = 0
    for item in t_entlabWords[i]:
        el = []
        for e in item:
            while j < len(t_sDict):
                if e == t_sDict[j][1]:
                    el.append(t_sDict[j][0])
                    j += 1
                    break
                j += 1
        eL.append(el)
    ti_entityLabels.append(eL)

In [176]:
ti_trueLables = [[i for i, j in enumerate(rl) if j != 'None'] for rl in t_relationLabels]
pad = ['POI', 'RE', 'EOP']
t_adjacencyList = []
for n in range(len(t_sentWords)): 
    e1 = []
    e2 = []
    aL = []
    for l in ti_trueLables[n]:
        for item in ti_entityLabels[n]:
            if ' '.join([t_sentWords[n][i] for i in item]) == ' '.join(t_em1Words[n][l]):
                e1.append(item[0])
            if ' '.join([t_sentWords[n][i] for i in item]) == ' '.join(t_em2Words[n][l]):
                e2.append(item[0])
    c = [(a, b) for a in e1 for b in e2]
    r = c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]
    aL = aL + [r[0], pad[0], r[1], pad[1], t_relationLabels[n][l], pad[2]]
    t_adjacencyList.append(aL)

# Modify 
t_adjacencyList[26] = [34, 'POI', 36, 'RE', 'founders', 'EOP']
t_adjacencyList[191] = [[22, 'POI', 36, 'RE', 'country', 'EOP'], [33, 'POI', 36, 'RE', 'country', 'EOP']]

In [269]:
' '.join(t_sentWords[38])

'The administration is now put in a position of playing defense as we are finally seeing the international consequences of the rather dramatic internal transformation inside Russia with the erosion of democracy a new ruling class a massive transfer of property rights from so-called oligarchs to basically friends of Putin most of whom are from the old K.G.B. Senior administration officials said the new effort to reach out to Russia already had been put into action and included Secretary of State Condoleezza Rice s consultation with her Russian counterpart Foreign Minister Sergey V. Lavrov during meetings in Berlin on Feb 21 and 22 about whether Russia had the appetite to pursue a second Security Council resolution for sanctions against Iran'

In [270]:
t_sentText[38]

'"The administration is now put in a position of playing defense , as we are finally seeing the international consequences of the rather dramatic internal transformation inside Russia with the erosion of democracy , a new ruling class , a massive transfer of property rights from so-called oligarchs to , basically , friends of Putin , most of whom are from the old K.G.B. \'\' Senior administration officials said the new effort to reach out to Russia already had been put into action , and included Secretary of State Condoleezza Rice \'s consultation with her Russian counterpart , Foreign Minister Sergey V. Lavrov , during meetings in Berlin on Feb. 21 and 22 about whether Russia had the appetite to pursue a second Security Council resolution for sanctions against Iran ."\r\n'

In [271]:
t_adjacencyList[38]

[105, 'POI', 119, 'RE', 'country', 'EOP']

In [272]:
t_entityLabels[38]

['Russia', 'Russia', 'Condoleezza Rice', 'Berlin', 'Russia', 'Iran']

In [273]:
ti_entityLabels[38]

[[26], [69], [81, 82], [97], [105], [119]]

In [274]:
t_relationMentions[38]

[{'em1Text': 'Condoleezza Rice', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Condoleezza Rice', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Iran', 'label': 'None'},
 {'em1Text': 'Iran', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Russia',
  'em2Text': 'Iran',
  'label': '/location/administrative_division/country'},
 {'em1Text': 'Iran', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Condoleezza Rice', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Condoleezza Rice', 'label': 'None'},
 {'em1Text': 'Condoleezza Rice', 'em2Text': 'Iran', 'label': 'None'},
 {'em1Text': 'Iran', 'em2Text': 'Condoleezza Rice', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Condolee

In [180]:
[i for i in enumerate(ti_trueLables) if len(i[-1]) > 1]

[(26, [2, 5]),
 (30, [7, 16]),
 (47, [2, 3]),
 (52, [5, 8]),
 (73, [13, 24]),
 (172, [3, 6]),
 (191, [15, 21]),
 (236, [0, 5]),
 (262, [1, 3]),
 (268, [4, 9]),
 (293, [2, 5]),
 (313, [5, 10]),
 (316, [13, 16]),
 (347, [3, 16]),
 (365, [3, 10])]

In [299]:
%xmode plain
ti_trueLables = [[i for i, j in enumerate(rl) if j != 'None'] for rl in t_relationLabels]
pad = ['POI', 'RE', 'EOP']
t_adjacencyList = []
for n in range(len(t_sentWords)): 
    e1 = []
    e2 = []
    aL = []
    for l in ti_trueLables[n]:
        for item in ti_entityLabels[n]:
            if ' '.join([t_sentWords[n][i] for i in item]) == t_em1Text[n][l]:
                e1.append(item[0])
            if ' '.join([t_sentWords[n][i] for i in item]) == t_em2Text[n][l]:
                e2.append(item[0])
        c = [(a, b) for a in e1 for b in e2]
        r = c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]
        aL = aL + [r[0], pad[0], r[1], pad[1], t_relationLabels[n][l], pad[2]]
    t_adjacencyList.append(aL)

Exception reporting mode: Plain


ValueError: min() arg is an empty sequence

In [302]:
ti_entityLabels[26]

[[0], [36, 37], [40], [45, 46]]

In [304]:
t_entityLabels[26]

['Sony', 'Akio Morita', 'Sony', 'United States']

In [305]:
t_relationMentions[26]

[{'em1Text': 'Sony', 'em2Text': 'United States', 'label': 'None'},
 {'em1Text': 'United States', 'em2Text': 'Sony', 'label': 'None'},
 {'em1Text': 'Sony',
  'em2Text': 'Akio Morita',
  'label': '/business/company/founders'},
 {'em1Text': 'Akio Morita', 'em2Text': 'Sony', 'label': 'None'},
 {'em1Text': 'Akio Morita', 'em2Text': 'Sony', 'label': 'None'},
 {'em1Text': 'Sony',
  'em2Text': 'Akio Morita',
  'label': '/business/company/founders'},
 {'em1Text': 'Akio Morita', 'em2Text': 'United States', 'label': 'None'},
 {'em1Text': 'United States', 'em2Text': 'Akio Morita', 'label': 'None'},
 {'em1Text': 'Sony', 'em2Text': 'Sony', 'label': 'None'}]

In [303]:
' '.join(t_sentWords[26])

'Sony Corporation wishes to express its deepest and most heartfelt sympathy to his family along with its utmost respect and gratitude for the instrumental role he played in assisting his middle school classmate and Sony founder Akio Morita in establishing Sony s presence in the United States in the early 1960 s'

In [289]:
l

7

In [291]:
[t_sentWords[n][i] for i in item]

['San', 'Antonio']

In [292]:
 ti_trueLables[n]

[7, 16]

In [290]:
t_relationMentions[30][7]

{'em1Text': 'Ralph de la Vega',
 'em2Text': 'AT&T',
 'label': '/business/person/company'}

In [285]:
e1

[20]

In [288]:
e2

[]

In [284]:
n

30

In [269]:
ti_entityLabels[n]

[[1], [4]]

In [278]:
t_sentWords[0][1]

'Tim'

In [273]:
item

[4]

In [274]:
e2

[4]

In [272]:
[t_sentWords[n][i] for i in item]

['Minnesota']

In [271]:
e1

[]

In [264]:
[t_sentWords[n][i] for i in item]

['Minnesota']

In [265]:
t_em1Text[n][l]

'Tim Pawlenty'

In [267]:
t_relationMentions[0]

[{'em1Text': 'Tim Pawlenty',
  'em2Text': 'Minnesota',
  'label': '/people/person/place_lived'},
 {'em1Text': 'Minnesota', 'em2Text': 'Tim Pawlenty', 'label': 'None'}]

In [262]:
[[(i, j) for i, j in enumerate(rl) if j != 'None'] for rl in t_relationLabels]

[[(0, 'place_lived')],
 [(0, 'capital')],
 [(0, 'place_lived')],
 [(16, 'contains')],
 [(9, 'country')],
 [(1, 'contains')],
 [(0, 'company')],
 [(5, 'contains')],
 [(1, 'contains')],
 [(2, 'company')],
 [(1, 'country')],
 [(1, 'contains')],
 [(5, 'country')],
 [(2, 'country')],
 [(3, 'contains')],
 [(4, 'nationality')],
 [(9, 'contains')],
 [(1, 'country')],
 [(1, 'contains')],
 [(5, 'country')],
 [(2, 'children')],
 [(5, 'country')],
 [(2, 'country')],
 [(0, 'country')],
 [(1, 'contains')],
 [(3, 'contains')],
 [(2, 'founders'), (5, 'founders')],
 [(0, 'nationality')],
 [(5, 'country')],
 [(1, 'country')],
 [(7, 'company'), (16, 'company')],
 [(11, 'contains')],
 [(1, 'contains')],
 [(0, 'place_lived')],
 [(17, 'contains')],
 [(17, 'contains')],
 [(0, 'country')],
 [(1, 'contains')],
 [(4, 'country')],
 [(12, 'place_lived')],
 [(0, 'company')],
 [(1, 'contains')],
 [(1, 'founders')],
 [(0, 'place_lived')],
 [(4, 'company')],
 [(0, 'country')],
 [(0, 'nationality')],
 [(2, 'country'),

In [251]:
[[i for i, j in enumerate(rl) if j != 'None'] for rl in t_relationLabels][38][0]

4

In [252]:
c

[(26, 119), (69, 119), (105, 119)]

In [238]:
aL

[105, 'POI', 119, 'RE', 'country']

In [242]:
t_trueLables = [[i for i in rl if i != 'None'] for rl in t_relationLabels]

In [245]:
t_trueLables[38][0]

'country'

In [236]:
t_relationLabels[38]

['None',
 'None',
 'None',
 'None',
 'country',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None']

In [235]:
a = []
b = [1, 2, 3]
a + b

[1, 2, 3]

In [231]:
c

[(26, 119), (69, 119), (105, 119)]

In [229]:
c

[(26, 119), (69, 119), (105, 119)]

In [227]:
e1

[26]

In [211]:
c = [(a, b) for a in e1 for b in e2]
r = [(i, abs(j[0]-j[1])) for i, j in enumerate(c)]

In [219]:
c = [(26, 119), (69, 119), (105, 119), (133, 119)]

In [223]:
c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]

(105, 119)

In [None]:
a = [1, 2, 3]
b = [1]
[i for i in map(a, b)]

In [241]:
[[i for i in rl if i != 'None'] for rl in t_relationLabels][38]

['country']

In [193]:
e1

[26, 69, 105]

In [194]:
e2

[119]

In [233]:
t_adjacencyList = []
padDict = ['POI', 'RE', 'EOP']
for i in range(len(t_sentWords)):
    # POI means pointer
    # RE means relation label
    # EOP means end of pointer   
    aL = ''
    for j in ti_entityLabels[i]:
        t_dictEn = dict((b, a) for a, b in enumerate(t_entityLabels[i]))
        # Exclude further "None" label
        if j in [it['em1Text'] for it in t_relationMentions[i] if it['label'] != 'None' and j in it['em1Text']]:
            aL += j
            tin_em1Text = [t_dictEn[item] for item in t_em1Text[i]]
            tin_em2Text = [t_dictEn[item] for item in t_em2Text[i]]
            new_tem1 = [m[0] for m in sorted(list(zip(tin_em1Text, tin_em2Text)))]
            new_tem2 = [n[1] for n in sorted(list(zip(tin_em1Text, tin_em2Text)))]
            new_tem1 = [t_entityLabels[i][e1] for e1 in new_tem1]
            new_tem2 = [t_entityLabels[i][e2] for e2 in new_tem2]
            t_listRe = list(zip(zip(tin_em1Text, tin_em2Text), t_relationLabels[i]))
            new_trel = [r[-1] for r in sorted(t_listRe, key=lambda z: z[0])]
            for item in enumerate(new_tem1):
                if j == item[1] and new_trel[item[0]] != 'None':
                    aL = aL + ' POI ' + new_tem2[item[0]] + ' RE ' + new_trel[item[0]]
            aL += ' EOP '
    if aL != '':
        t_adjacencyList.append(aL)      

TypeError: 'in <string>' requires string as left operand, not list

In [232]:
t_adjacencyList

[]

In [143]:
t_entityLabels[38]

['Russia', 'Russia', 'Condoleezza Rice', 'Berlin', 'Russia', 'Iran']

In [165]:
temp = list(zip(sum(ti_entityLabels[38], []), t_entityLabels[38]))
temp

[(26, 'Russia'),
 (69, 'Russia'),
 (81, 'Condoleezza Rice'),
 (97, 'Berlin'),
 (105, 'Russia'),
 (119, 'Iran')]

In [166]:
e1 = []
e2 = []
for i, w in temp:
    if w == 'Russia':
        e1.append(i)
    if w == 'Iran':
        e2.append(i)

In [170]:
e1 

[26, 69, 105]

In [171]:
e2

[119]

In [186]:
[i for item in t_relationMentions for i in item if i['label'] != 'None']

[{'em1Text': 'Tim Pawlenty',
  'em2Text': 'Minnesota',
  'label': '/people/person/place_lived'},
 {'em1Text': 'Guinea',
  'em2Text': 'Conakry',
  'label': '/location/country/capital'},
 {'em1Text': 'Ben Nelson',
  'em2Text': 'Nebraska',
  'label': '/people/person/place_lived'},
 {'em1Text': 'Westchester',
  'em2Text': 'Mamaroneck',
  'label': '/location/location/contains'},
 {'em1Text': 'Damascus',
  'em2Text': 'Syria',
  'label': '/location/administrative_division/country'},
 {'em1Text': 'Florida',
  'em2Text': 'Boca Raton',
  'label': '/location/location/contains'},
 {'em1Text': 'Peter Bragdon',
  'em2Text': 'Columbia Sportswear',
  'label': '/business/person/company'},
 {'em1Text': 'Cuba',
  'em2Text': 'Guantanamo Bay',
  'label': '/location/location/contains'},
 {'em1Text': 'Rochester',
  'em2Text': 'George Eastman House',
  'label': '/location/location/contains'},
 {'em1Text': 'George Stephanopoulos',
  'em2Text': 'ABC News',
  'label': '/business/person/company'},
 {'em1Text': 'S

In [190]:
sum(t_relationMentions, [])

[{'em1Text': 'Tim Pawlenty',
  'em2Text': 'Minnesota',
  'label': '/people/person/place_lived'},
 {'em1Text': 'Minnesota', 'em2Text': 'Tim Pawlenty', 'label': 'None'},
 {'em1Text': 'Guinea',
  'em2Text': 'Conakry',
  'label': '/location/country/capital'},
 {'em1Text': 'Conakry', 'em2Text': 'Guinea', 'label': 'None'},
 {'em1Text': 'Ben Nelson',
  'em2Text': 'Nebraska',
  'label': '/people/person/place_lived'},
 {'em1Text': 'Nebraska', 'em2Text': 'Ben Nelson', 'label': 'None'},
 {'em1Text': 'Westchester', 'em2Text': 'New Rochelle', 'label': 'None'},
 {'em1Text': 'New Rochelle', 'em2Text': 'Westchester', 'label': 'None'},
 {'em1Text': 'New Jersey', 'em2Text': 'New Brunswick', 'label': 'None'},
 {'em1Text': 'New Brunswick', 'em2Text': 'New Jersey', 'label': 'None'},
 {'em1Text': 'New Brunswick', 'em2Text': 'Westchester', 'label': 'None'},
 {'em1Text': 'Westchester', 'em2Text': 'New Brunswick', 'label': 'None'},
 {'em1Text': 'Raritan River', 'em2Text': 'New Jersey', 'label': 'None'},
 {'em1

In [177]:
t_relationMentions[0]

[{'em1Text': 'Tim Pawlenty',
  'em2Text': 'Minnesota',
  'label': '/people/person/place_lived'},
 {'em1Text': 'Minnesota', 'em2Text': 'Tim Pawlenty', 'label': 'None'}]

In [174]:
t_relationMentions

[[{'em1Text': 'Tim Pawlenty',
   'em2Text': 'Minnesota',
   'label': '/people/person/place_lived'},
  {'em1Text': 'Minnesota', 'em2Text': 'Tim Pawlenty', 'label': 'None'}],
 [{'em1Text': 'Guinea',
   'em2Text': 'Conakry',
   'label': '/location/country/capital'},
  {'em1Text': 'Conakry', 'em2Text': 'Guinea', 'label': 'None'}],
 [{'em1Text': 'Ben Nelson',
   'em2Text': 'Nebraska',
   'label': '/people/person/place_lived'},
  {'em1Text': 'Nebraska', 'em2Text': 'Ben Nelson', 'label': 'None'}],
 [{'em1Text': 'Westchester', 'em2Text': 'New Rochelle', 'label': 'None'},
  {'em1Text': 'New Rochelle', 'em2Text': 'Westchester', 'label': 'None'},
  {'em1Text': 'New Jersey', 'em2Text': 'New Brunswick', 'label': 'None'},
  {'em1Text': 'New Brunswick', 'em2Text': 'New Jersey', 'label': 'None'},
  {'em1Text': 'New Brunswick', 'em2Text': 'Westchester', 'label': 'None'},
  {'em1Text': 'Westchester', 'em2Text': 'New Brunswick', 'label': 'None'},
  {'em1Text': 'Raritan River', 'em2Text': 'New Jersey', 'l

In [163]:
t_em2Text[38][4]

'Iran'

In [159]:
pmet = {i: w for w, i in temp.items()}
pmet

{'Berlin': 97, 'Condoleezza Rice': 81, 'Iran': 119, 'Russia': 105}

In [156]:
t_dictEn = dict((b, a) for a, b in enumerate(t_entityLabels[38]))
t_dictEn

{'Berlin': 3, 'Condoleezza Rice': 2, 'Iran': 5, 'Russia': 4}

In [55]:
t_adjacencyList = []
padDict = ['POI', 'RE', 'EOP']
for i in range(len(t_sentWords)):
    # POI means pointer
    # RE means relation label
    # EOP means end of pointer   
    aL = ''
    for j in t_entityLabels[i]:
        t_dictEn = dict((b, a) for a, b in enumerate(t_entityLabels[i]))
        # Exclude further "None" label
        if j in [it['em1Text'] for it in t_relationMentions[i] if it['label'] != 'None' and j in it['em1Text']]:
            aL += j
            tin_em1Text = [t_dictEn[item] for item in t_em1Text[i]]
            tin_em2Text = [t_dictEn[item] for item in t_em2Text[i]]
            new_tem1 = [m[0] for m in sorted(list(zip(tin_em1Text, tin_em2Text)))]
            new_tem2 = [n[1] for n in sorted(list(zip(tin_em1Text, tin_em2Text)))]
            new_tem1 = [t_entityLabels[i][e1] for e1 in new_tem1]
            new_tem2 = [t_entityLabels[i][e2] for e2 in new_tem2]
            t_listRe = list(zip(zip(tin_em1Text, tin_em2Text), t_relationLabels[i]))
            new_trel = [r[-1] for r in sorted(t_listRe, key=lambda z: z[0])]
            for item in enumerate(new_tem1):
                if j == item[1] and new_trel[item[0]] != 'None':
                    aL = aL + ' POI ' + new_tem2[item[0]] + ' RE ' + new_trel[item[0]]
            aL += ' EOP '
    if aL != '':
        t_adjacencyList.append(aL)      

In [56]:
t_adjacencyList[38]

'Russia POI Iran RE country EOP Russia POI Iran RE country EOP Russia POI Iran RE country EOP '

In [57]:
ti_adjacencyList = []
for i in range(len(t_adjacencyList)):
    aL = []
    replace = dict(zip(sum(t_entlabWords[i], []), sum(ti_entityLabels[i], [])))
    aL = [replace[j] if j not in padDict+t_relationTypes else j for j in cut(t_adjacencyList[i])]
    #aL = [s if type(s) == str else str(s) for s in aL]
    ti_adjacencyList.append(aL)

In [59]:
ti_adjacencyList[254]

[35, 41, 42, 'POI', 41, 42, 'RE', 'children', 'EOP']

In [175]:
t_adjacencyList[254]

'Frank Lloyd Wright POI Lloyd Wright RE children EOP '

In [32]:
def prt(w, r):
    s = [' '.join(i) for i in w]
    for i in range(len(s)):
        print('Sentence: %s' % s[i])
        print(r[i])
        print('---')

In [176]:
t_sentText[254]

'"His idea -- one that has rarely , if ever , been tried on a large scale by a major museum -- is to collect significant pieces of midcentury residential architecture , including houses by Rudolf M. Schindler , Richard Neutra , Frank Lloyd Wright and his son Lloyd Wright , and to treat them as both museum objects and as residences for curators ."\r\n'

In [177]:
set([len(i) for i in ti_adjacencyList])

{6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 20, 24, 28}

In [None]:
[8, 38, 227, 263, 287, 341, 373]

In [180]:
ti_adjacencyList[287]

[5, 6, 'POI', 5, 'RE', 'country', 'EOP']

In [181]:
t_relationMentions[287]

[{'em1Text': 'Mexico City', 'em2Text': 'Basel', 'label': 'None'},
 {'em1Text': 'Basel', 'em2Text': 'Mexico City', 'label': 'None'},
 {'em1Text': 'Basel', 'em2Text': 'Mexico', 'label': 'None'},
 {'em1Text': 'Mexico', 'em2Text': 'Basel', 'label': 'None'},
 {'em1Text': 'Mexico City',
  'em2Text': 'Mexico',
  'label': '/location/administrative_division/country'},
 {'em1Text': 'Mexico', 'em2Text': 'Mexico City', 'label': 'None'}]

In [184]:
ti_adjacencyList[8]

[29, 'POI', 25, 26, 27, 'RE', 'contains', 'EOP']

In [182]:
' '.join(t_sentWords[287])

'Curators have been snapping up Mexico City artwork in places like Basel now they will descend on the city this April for Mexico Arte Contemporaneo its fledging contemporary art fair'

In [183]:
ti_adjacencyList[38]

[26,
 'POI',
 119,
 'RE',
 'country',
 'EOP',
 26,
 'POI',
 119,
 'RE',
 'country',
 'EOP',
 26,
 'POI',
 119,
 'RE',
 'country',
 'EOP']

In [25]:
#import numpy as np
#np.max([len(i) for i in ti_adjacencyList])

In [26]:
#len([i for i in i_adjacencyList if len(i) > 20])

In [27]:
#all(al[1] == 'POI' for al in ti_adjacencyList)

# Build Dateset

In [39]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [40]:
new_sentText = [' '.join(i) for i in sentWords]
sentSeq = tokenizer.texts_to_sequences(new_sentText)
sentData = pad_sequences(sentSeq, maxlen=MAX_SENT_LEN, padding='post', truncating='post')

t_new_sentText = [' '.join(i) for i in t_sentWords]
t_sentSeq = tokenizer.texts_to_sequences(t_new_sentText)
t_sentData = pad_sequences(t_sentSeq, maxlen=MAX_SENT_LEN, padding='post', truncating='post')

token2index = dict((j, i+120) for i, j in enumerate(['PAD']+padDict+t_relationTypes))
token2index['PAD'] = 0
index2token = {i: w for w, i in token2index.items()}
newi_adjacencyList = [[token2index[i] if i in token2index else i+1 for i in aL] for aL in i_adjacencyList]
newi_adjacencyList = pad_sequences(newi_adjacencyList, maxlen=MAX_ADJL_LEN, padding='post', truncating='pre')
newti_adjacencyList = [[token2index[i] if i in token2index else i+1 for i in aL] for aL in ti_adjacencyList]
newti_adjacencyList = pad_sequences(newti_adjacencyList, maxlen=MAX_ADJL_LEN, padding='post', truncating='post')

In [41]:
#np.max([int(i) for aL in i_adjacencyList for i in aL if i not in padDict+t_relationTypes])

In [42]:
# Split the data into a training set, a validation set and a test set
x_train_all = sentData
y_train_all = newi_adjacencyList

x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.15, random_state=SEED)
x_train_all, _, y_train_all, _  = train_test_split(x_train_all, y_train_all, test_size=0., random_state=SEED)
 
x_test = t_sentData
y_test = newti_adjacencyList

x_test, _, y_test, _ = train_test_split(x_test, y_test, test_size=0., random_state=SEED)

# Save Dataset

In [43]:
import h5py
import pickle

In [44]:
fh = h5py.File('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/allData_gla.h5', 'w')
fh['x_train'] = x_train
fh['y_train'] = y_train
fh['x_val'] = x_val
fh['y_val'] = y_val
fh['x_train_all'] = x_train_all
fh['y_train_all'] = y_train_all
fh['x_test'] = x_test
fh['y_test'] = y_test
fh['embedding'] = embedding
fh.close()

In [45]:
with open('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/index_gla.pkl', 'wb') as fp:
    pickle.dump((word2index, index2word, token2index, index2token), fp, -1)

# Checkpoint

In [48]:
import h5py
import pickle

In [49]:
with h5py.File('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/allData_w2v.h5', 'r') as fh:
    x_train = fh['x_train'][:]
    y_train = fh['y_train'][:]
    x_val = fh['x_val'][:]
    y_val = fh['y_val'][:]
    x_train_all = fh['x_train_all'][:]
    y_train_all = fh['y_train_all'][:]
    x_test = fh['x_test'][:]
    y_test = fh['y_test'][:]
    embedding = fh['embedding'][:]

In [50]:
with open('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/index_w2v.pkl', 'rb') as fp:
    word2index, index2word, token2index, index2token = pickle.load(fp)