# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Preprocessing</a></div><div class="lev2 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev3 toc-item"><a href="#Load-Train-Data" data-toc-modified-id="Load-Train-Data-111"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Load Train Data</a></div><div class="lev3 toc-item"><a href="#Train-Data-Sample" data-toc-modified-id="Train-Data-Sample-112"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>Train Data Sample</a></div><div class="lev3 toc-item"><a href="#Load-Test-Data" data-toc-modified-id="Load-Test-Data-113"><span class="toc-item-num">1.1.3&nbsp;&nbsp;</span>Load Test Data</a></div><div class="lev3 toc-item"><a href="#Relation-Types" data-toc-modified-id="Relation-Types-114"><span class="toc-item-num">1.1.4&nbsp;&nbsp;</span>Relation Types</a></div><div class="lev2 toc-item"><a href="#Participle" data-toc-modified-id="Participle-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Participle</a></div><div class="lev2 toc-item"><a href="#Make-Adjacency-List" data-toc-modified-id="Make-Adjacency-List-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Make Adjacency List</a></div><div class="lev3 toc-item"><a href="#Make-Adjacency-List-of-Train-Data" data-toc-modified-id="Make-Adjacency-List-of-Train-Data-131"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Make Adjacency List of Train Data</a></div><div class="lev3 toc-item"><a href="#Make-Adjacency-List-of-Test-Data" data-toc-modified-id="Make-Adjacency-List-of-Test-Data-132"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Make Adjacency List of Test Data</a></div><div class="lev1 toc-item"><a href="#Build-Dateset" data-toc-modified-id="Build-Dateset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build Dateset</a></div><div class="lev1 toc-item"><a href="#Save-Dataset" data-toc-modified-id="Save-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Save Dataset</a></div><div class="lev1 toc-item"><a href="#Checkpoint" data-toc-modified-id="Checkpoint-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Checkpoint</a></div>

# Data Preprocessing

## Load Data

### Load Train Data

In [1]:
import json

In [2]:
sentText = []
relationMentions = []
relationLabels = []
entityMentions = []
entityLabels = []
em1Text = []
em2Text = []

with open("/Users/lizhn7/Downloads/DATA/nyt/train.json") as f:
    lines = f.readlines()

for line in lines:
    item = json.loads(line)
    # Exclude "None" label
    if not all(i['label'] == 'None' for i in item['relationMentions']):
        sentText.append(item['sentText'])
        relationMentions.append(item['relationMentions'])
        entityMentions.append(item['entityMentions'])
    
relationLabels = [[i['label'].split('/')[-1] for i in rM] for rM in relationMentions]
entityLabels = [[i['text'] for i in eM] for eM in entityMentions]
em1Text = [[i['em1Text'] for i in rM] for rM in relationMentions]
em2Text = [[i['em2Text'] for i in rM] for rM in relationMentions]

In [3]:
# Clean words
replaceDict = {
               'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
               'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
               'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e', 
               'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
               'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u', 'ü': 'u',
               'ñ': 'n',
               'É': 'E'
              }
em1Text = [[''.join([replaceDict.get(i, i) for i in e]) for e in eT] for eT in em1Text]
em2Text = [[''.join([replaceDict.get(i, i) for i in e]) for e in eT] for eT in em2Text]

### Train Data Sample

In [4]:
sentText[0]

'But that spasm of irritation by a master intimidator was minor compared with what Bobby Fischer , the erratic former world chess champion , dished out in March at a news conference in Reykjavik , Iceland .'

In [5]:
relationMentions[0]

[{'em1Text': 'Bobby Fischer',
  'em2Text': 'Iceland',
  'label': '/people/person/nationality'},
 {'em1Text': 'Iceland',
  'em2Text': 'Reykjavik',
  'label': '/location/country/capital'},
 {'em1Text': 'Iceland',
  'em2Text': 'Reykjavik',
  'label': '/location/location/contains'},
 {'em1Text': 'Bobby Fischer',
  'em2Text': 'Reykjavik',
  'label': '/people/deceased_person/place_of_death'}]

In [6]:
relationLabels[0]

['nationality', 'capital', 'contains', 'place_of_death']

In [7]:
entityMentions[0]

[{'label': 'PERSON', 'start': 0, 'text': 'Bobby Fischer'},
 {'label': 'LOCATION', 'start': 1, 'text': 'Reykjavik'},
 {'label': 'LOCATION', 'start': 2, 'text': 'Iceland'}]

In [8]:
entityLabels[0]

['Bobby Fischer', 'Reykjavik', 'Iceland']

In [9]:
em1Text[0]

['Bobby Fischer', 'Iceland', 'Iceland', 'Bobby Fischer']

In [10]:
em2Text[0]

['Iceland', 'Reykjavik', 'Reykjavik', 'Reykjavik']

### Load Test Data

In [11]:
t_sentText = []
t_relationMentions = []
t_relationLabels = []
t_entityMentions = []
t_entityLabels = []
t_em1Text = []
t_em2Text = []

with open("/Users/lizhn7/Downloads/DATA/nyt/test.json") as f:
    lines = f.readlines()

for line in lines:
    item = json.loads(line)
    t_sentText.append(item['sentText'])
    t_relationMentions.append(item['relationMentions'])
    t_entityMentions.append(item['entityMentions'])
    
t_relationLabels = [[i['label'].split('/')[-1] for i in rM] for rM in t_relationMentions]
t_entityLabels = [[i['text'] for i in eM] for eM in t_entityMentions]
t_em1Text = [[i['em1Text'] for i in rM] for rM in t_relationMentions]
t_em2Text = [[i['em2Text'] for i in rM] for rM in t_relationMentions]

In [12]:
# Clean words
t_replaceDict = {
               'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
               'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
               'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e', 
               'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
               'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u', 'ü': 'u',
               'ñ': 'n',
               'É': 'E'
              }
t_em1Text = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_em1Text]
t_em2Text = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_em2Text]
t_entityLabels = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_entityLabels]

###  Relation Types

In [13]:
relationTypes = list(set([r for rl in relationLabels for r in rl]))
relationTypes

['religion',
 'children',
 'major_shareholders',
 'place_lived',
 'major_shareholder_of',
 'advisors',
 'industry',
 'location',
 'founders',
 'ethnicity',
 'company',
 'place_founded',
 'administrative_divisions',
 'contains',
 'profession',
 'capital',
 'neighborhood_of',
 'place_of_death',
 'people',
 'geographic_distribution',
 'nationality',
 'teams',
 'country',
 'place_of_birth']

In [14]:
t_relationTypes = list(set([r for rl in t_relationLabels for r in rl if r != 'None']))
t_relationTypes

['neighborhood_of',
 'place_of_death',
 'nationality',
 'children',
 'contains',
 'founders',
 'place_lived',
 'company',
 'capital',
 'country',
 'administrative_divisions',
 'place_of_birth']

## Participle

In [15]:
from nltk import regexp_tokenize

In [16]:
def cut(s):
    """
    Participle
    """
    pattern = r'''
              (?x)                   # set flag to allow verbose regexps 
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
              |\d+(?:\.\d+)?%?       # numbers, incl. currency and percentages 
              |\w+(?:[-&']\w+)*       # words w/ optional internal hyphens/apostrophe  
           '''  
    return regexp_tokenize(s, pattern=pattern)

In [17]:
sentWords = [cut(s) for s in sentText]
entlabWords = [[cut(s) for s in eL] for eL in entityLabels]
em1Words = [[cut(s) for s in eL] for eL in em1Text]
em2Words = [[cut(s) for s in eL] for eL in em2Text]
t_sentWords = [cut(s) for s in t_sentText]
t_entlabWords = [[cut(s) for s in eL] for eL in t_entityLabels]
t_em1Words = [[cut(s) for s in eL] for eL in t_em1Text]
t_em2Words = [[cut(s) for s in eL] for eL in t_em2Text]

In [18]:
#MAX_SENT_LEN = 120
#AX_ADJL_LEN = 20

In [19]:
#len([i for i, j in enumerate(sentWords) if len(j) > 120])
#max(len(j) for i, j in enumerate(t_sentWords) if len(j) > 80)

## Make Adjacency List

### Make Adjacency List of Train Data

In [29]:
from collections import Counter

In [33]:
i_entityLabels = []
for i in range(len(sentWords)):
    eL = []
    sDict = list(enumerate(sentWords[i]))
    j = 0
    for item in entlabWords[i]:
        el = []
        for e in item:
            while j < len(sDict):
                if e == sDict[j][1]:
                    el.append(sDict[j][0])
                    j += 1
                    break
                j += 1
        eL.append(el)
    i_entityLabels.append(eL)
    
i_em1Text = []
for i in range(len(sentWords)):
    temp = list(zip(sum(i_entityLabels[i], []), sum(entlabWords[i], [])))
    eM1 = []
    j = 0
    for ee in em1Words[i]:
        #j = 0
        em1 = []
        for e in ee:
            while j < len(temp):
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] == 1:
                    em1.append(temp[j][0])
                    break
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] > 1:
                    em1.append(temp[j][0])
                    j += 1
                    break
                j += 1
        eM1.append(em1)
    i_em1Text.append(eM1)

i_em2Text = []
for i in range(len(sentWords)):
    temp = list(zip(sum(i_entityLabels[i], []), sum(entlabWords[i], [])))
    eM2 = []
    for ee in em2Words[i]:
        j = 0
        em2 = []
        for e in ee:
            while j < len(temp):
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] == 1:
                    em2.append(temp[j][0])
                    break
                if e == temp[j][1] and Counter(sum(entlabWords[i], []))[e] > 1:
                    em2.append(temp[j][0])
                    j += 1
                    break
                j += 1
        eM2.append(em2)
    i_em2Text.append(eM2)

In [352]:
pad = ['POI', 'RE', 'EOP']
adjacencyList = []
for n in range(len(sentWords)): 
    e1 = []
    e2 = []
    aL = []
    for l in i_trueLables[n]:
        for item in i_entityLabels[n]:
            if ' '.join([sentWords[n][i] for i in item]) == ' '.join(em1Words[n][l]):
                e1.append(item[0])
            if ' '.join([sentWords[n][i] for i in item]) == ' '.join(em2Words[n][l]):
                e2.append(item[0])
        c = [(a, b) for a in e1 for b in e2]
        r = c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]
        aL = aL + [r[0], pad[0], r[1], pad[1], relationLabels[n][l], pad[2]]
    adjacencyList.append(aL)

# Modify 
#adjacencyList[26] = [34, 'POI', 36, 'RE', 'founders', 'EOP']
#adjacencyList[191] = [22, 'POI', 36, 'RE', 'country', 'EOP', 33, 'POI', 36, 'RE', 'country', 'EOP']

In [346]:
adjacencyList = []
sentence = []
new_entityLabels = []
newi_entityLabels = []
padDict = ['POI', 'RE', 'EOP']
for i in range(len(sentWords)):
    # POI means pointer
    # RE means relation label
    # EOP means end of pointer   
    aL = ''
    for j in entityLabels[i]:
        dictEn = dict((b, a) for a, b in enumerate(entityLabels[i]))
        # Exclude further "None" label
        if j in em1Text[i]:#and j in [it['em1Text'] for it in relationMentions[i] if it['label'].split('/')[-1] in t_relationTypes]:
            aL += j
            in_em1Text = [dictEn[item] for item in em1Text[i]]
            in_em2Text = [dictEn[item] for item in em2Text[i]]
            new_em1 = [m[0] for m in sorted(list(zip(in_em1Text, in_em2Text)))]
            new_em2 = [n[1] for n in sorted(list(zip(in_em1Text, in_em2Text)))]
            new_em1 = [entityLabels[i][e1] for e1 in new_em1]
            new_em2 = [entityLabels[i][e2] for e2 in new_em2]
            listRe = list(zip(zip(in_em1Text, in_em2Text), relationLabels[i]))
            new_rel = [r[-1] for r in sorted(listRe, key=lambda z: z[0])]
            for item in enumerate(new_em1):
                if j == item[1] and new_rel[item[0]] in t_relationTypes:
                    aL = aL + ' POI ' + new_em2[item[0]] + ' RE ' + new_rel[item[0]]
            aL += ' EOP '
    #if len(sentWords[i]) > MAX_SENT_LEN: #or len(aL.split()) > MAX_ADJL_LEN:
    #    aL = ''
    if aL != '':
        adjacencyList.append(aL)
        sentence.append(sentWords[i])
        new_entityLabels.append(entlabWords[i])
        newi_entityLabels.append(i_entityLabels[i])

In [347]:
i_adjacencyList = []
for i in range(len(adjacencyList)):
    aL = []
    replace = dict(zip(sum(new_entityLabels[i], []), sum(newi_entityLabels[i], [])))
    aL = [replace[j] if j not in padDict+t_relationTypes else j for j in cut(adjacencyList[i])]
    #aL = [s if type(s) == str else str(s) for s in aL]
    i_adjacencyList.append(aL)

In [168]:
#[i for i in i_adjacencyList if type(i[-2]) == int]

In [24]:
#all(type(al[-2]) == int for al in i_adjacencyList)

In [19]:
#[i for i in i_adjacencyList if i[1] == 'POI']

In [20]:
#[eT[0] for eT in enumerate(em1Text) for i in eT[1] if i == 'Édith Piaf']

In [58]:
aL = []
pad = ['POI', 'RE', 'EOP']
for item1 in i_entityLabels[666]:
    for item2 in i_entityLabels[666]:
        for item3 in range(len(relationLabels[666])):
            if ' '.join([sentWords[666][i] for i in item1]) == em1Text[666][item3] and ' '.join([sentWords[666][i] for i in item2]) == em2Text[666][item3]:
                al = [item1[0], pad[0], item2[0], pad[1], relationLabels[666][item3], pad[2]]
                aL.append(al)

In [65]:
aL = []
pad = ['POI', 'RE', 'EOP']
for item1 in i_entityLabels[666]:
    al = []
    if ' '.join([sentWords[666][i] for i in item1]) in em1Text[666]:
        al += [item1[0]]
        for item2 in i_entityLabels[666]:
            for item3 in range(len(relationLabels[666])):
                if ' '.join([sentWords[666][i] for i in item1]) == em1Text[666][item3] and ' '.join([sentWords[666][i] for i in item2]) == em2Text[666][item3]:
                    al = al + [pad[0], item2[0], pad[1], relationLabels[666][item3]]
        al += [pad[-1]]
        aL.append(al)

In [46]:
aL

[[3, 'POI', 24, 'RE', 'contains', 'EOP'],
 [3, 'POI', 44, 'RE', 'contains', 'EOP'],
 [14, 'POI', 44, 'RE', 'neighborhood_of', 'EOP'],
 [15, 'POI', 20, 'RE', 'neighborhood_of', 'EOP'],
 [18, 'POI', 20, 'RE', 'neighborhood_of', 'EOP'],
 [20, 'POI', 15, 'RE', 'contains', 'EOP'],
 [20, 'POI', 18, 'RE', 'contains', 'EOP'],
 [22, 'POI', 24, 'RE', 'neighborhood_of', 'EOP'],
 [24, 'POI', 22, 'RE', 'contains', 'EOP'],
 [44, 'POI', 14, 'RE', 'contains', 'EOP']]

In [66]:
aL

[[3, 'POI', 24, 'RE', 'contains', 'POI', 44, 'RE', 'contains', 'EOP'],
 [14, 'POI', 44, 'RE', 'neighborhood_of', 'EOP'],
 [15, 'POI', 20, 'RE', 'neighborhood_of', 'EOP'],
 [18, 'POI', 20, 'RE', 'neighborhood_of', 'EOP'],
 [20, 'POI', 15, 'RE', 'contains', 'POI', 18, 'RE', 'contains', 'EOP'],
 [22, 'POI', 24, 'RE', 'neighborhood_of', 'EOP'],
 [24, 'POI', 22, 'RE', 'contains', 'EOP'],
 [44, 'POI', 14, 'RE', 'contains', 'EOP']]

In [71]:
relationLabels[55]

['contains', 'capital', 'contains', 'capital']

In [72]:
sentText[55]

"He writes that the State Department 's Future of Iraq project was sidelined because of tensions between the State Department and the Pentagon , and that its coordinator , Tom Warrick , '' who had done as much thinking about postwar Iraq as any American official '' also '' became a casualty of the interagency war and did n't get to Baghdad for a year . ''"

In [73]:
' '.join(sentWords[55])

"He writes that the State Department s Future of Iraq project was sidelined because of tensions between the State Department and the Pentagon and that its coordinator Tom Warrick who had done as much thinking about postwar Iraq as any American official also became a casualty of the interagency war and did n't get to Baghdad for a year"

In [74]:
entityLabels[55]

['Iraq', 'Iraq', 'Baghdad']

In [126]:
[j for j, i in enumerate(entityLabels) if len(i) != len(set(i))]

[55,
 56,
 58,
 61,
 64,
 84,
 86,
 97,
 117,
 132,
 134,
 145,
 171,
 176,
 189,
 214,
 228,
 237,
 249,
 282,
 303,
 308,
 318,
 328,
 363,
 370,
 381,
 394,
 415,
 418,
 421,
 423,
 427,
 457,
 460,
 473,
 509,
 513,
 526,
 528,
 533,
 535,
 538,
 539,
 567,
 588,
 589,
 592,
 595,
 603,
 622,
 642,
 649,
 650,
 662,
 674,
 681,
 693,
 695,
 700,
 714,
 718,
 724,
 755,
 769,
 809,
 835,
 848,
 865,
 875,
 878,
 905,
 923,
 933,
 953,
 974,
 982,
 1011,
 1012,
 1031,
 1051,
 1064,
 1070,
 1081,
 1091,
 1092,
 1142,
 1159,
 1166,
 1213,
 1253,
 1320,
 1321,
 1337,
 1343,
 1349,
 1350,
 1354,
 1384,
 1400,
 1408,
 1411,
 1437,
 1472,
 1496,
 1497,
 1500,
 1536,
 1559,
 1570,
 1571,
 1574,
 1579,
 1590,
 1605,
 1611,
 1636,
 1638,
 1713,
 1719,
 1720,
 1721,
 1728,
 1729,
 1732,
 1735,
 1745,
 1762,
 1770,
 1780,
 1807,
 1814,
 1819,
 1833,
 1855,
 1869,
 1870,
 1875,
 1880,
 1890,
 1894,
 1896,
 1898,
 1899,
 1900,
 1903,
 1960,
 1962,
 1967,
 1977,
 1993,
 2012,
 2016,
 2020,
 2027,


In [75]:
i_entityLabels[55]

[[9], [37], [55]]

In [78]:
relationLabels[55]

['contains', 'capital', 'contains', 'capital']

In [80]:
em2Text[55]

['Baghdad', 'Baghdad', 'Baghdad', 'Baghdad']

In [79]:
relationMentions[55]

[{'em1Text': 'Iraq',
  'em2Text': 'Baghdad',
  'label': '/location/location/contains'},
 {'em1Text': 'Iraq',
  'em2Text': 'Baghdad',
  'label': '/location/country/capital'},
 {'em1Text': 'Iraq',
  'em2Text': 'Baghdad',
  'label': '/location/location/contains'},
 {'em1Text': 'Iraq',
  'em2Text': 'Baghdad',
  'label': '/location/country/capital'}]

In [96]:
aL = []
pad = ['POI', 'RE', 'EOP']
for item1 in i_entityLabels[55]:
    for item2 in i_entityLabels[55]:
        for item3 in range(len(relationLabels[55])):
            if ' '.join([sentWords[55][i] for i in item1]) == em1Text[55][item3] and ' '.join([sentWords[55][i] for i in item2]) == em2Text[55][item3]:
                al = (item1[0], pad[0], item2[0], pad[1], relationLabels[55][item3], pad[2])
                aL.append(al)

In [95]:
a = [(1), (2), (3), (1)]
set(a)

{1, 2, 3}

In [99]:
list(set(aL))

[(9, 'POI', 55, 'RE', 'capital', 'EOP'),
 (9, 'POI', 55, 'RE', 'contains', 'EOP'),
 (37, 'POI', 55, 'RE', 'capital', 'EOP'),
 (37, 'POI', 55, 'RE', 'contains', 'EOP')]

In [101]:
relationLabels[55]

['contains', 'capital', 'contains', 'capital']

In [102]:
em1Words[55]

[['Iraq'], ['Iraq'], ['Iraq'], ['Iraq']]

In [103]:
i_entityLabels[55]

[[9], [37], [55]]

In [185]:
e1 = []
e2 = []
aL = []
for l in range(len(relationLabels[56])):
    for item in i_entityLabels[56]:
        if ' '.join([sentWords[56][i] for i in item]) == ' '.join(em1Words[56][l]):
            e1.append(item[0])
        if ' '.join([sentWords[56][i] for i in item]) == ' '.join(em2Words[56][l]):
            e2.append(item[0])
    c = [(a, b) for a in e1 for b in e2]
    r = c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]
    al = (r[0], pad[0], r[1], pad[1], relationLabels[56][l], pad[2])
    aL.append(al)
aL = sorted(set(aL),key=aL.index)
aL = [list(i) for i in aL]

In [186]:
aL

[[86, 'POI', 33, 'RE', 'contains', 'EOP']]

In [182]:
aL

[[37, 'POI', 55, 'RE', 'contains', 'EOP'],
 [37, 'POI', 55, 'RE', 'capital', 'EOP']]

In [127]:
entityLabels[56]

['America', 'Iraq', 'George Packer', 'Bush', 'Iraq', 'America', 'Middle East']

In [187]:
i_entityLabels[56]

[[10], [12], [17, 18], [26], [33], [53], [86, 87]]

In [128]:
relationMentions[56]

[{'em1Text': 'Middle East',
  'em2Text': 'Iraq',
  'label': '/location/location/contains'},
 {'em1Text': 'Middle East',
  'em2Text': 'Iraq',
  'label': '/location/location/contains'}]

In [125]:
aL

[[37, 'POI', 55, 'RE', 'contains', 'EOP'],
 [37, 'POI', 55, 'RE', 'capital', 'EOP'],
 [37, 'POI', 55, 'RE', 'contains', 'EOP'],
 [37, 'POI', 55, 'RE', 'capital', 'EOP']]

In [123]:
[al.count(i) for i in aL]

[0, 0, 0, 0]

In [106]:
e1

[9, 37, 9, 37, 9, 37, 9, 37]

In [107]:
e2

[55, 55, 55, 55]

In [108]:
c = [(a, b) for a in e1 for b in e2]

In [110]:
c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]

(37, 55)

In [111]:
aL + [r[0], pad[0], r[1], pad[1], t_relationLabels[n][l], pad[2]]

NameError: name 'r' is not defined

In [183]:
aL = []
pad = ['POI', 'RE', 'EOP']
for item1 in i_entityLabels[56]:
    al = []
    if ' '.join([sentWords[56][i] for i in item1]) in em1Text[56]:
        al += [item1[0]]
        for item2 in i_entityLabels[56]:
            for item3 in range(len(relationLabels[56])):
                if ' '.join([sentWords[56][i] for i in item1]) == em1Text[56][item3] and ' '.join([sentWords[56][i] for i in item2]) == em2Text[56][item3]:
                    al = al + [pad[0], item2[0], pad[1], relationLabels[56][item3]]
        al += [pad[-1]]
        aL.append(al)

In [184]:
aL

[[86,
  'POI',
  12,
  'RE',
  'contains',
  'POI',
  12,
  'RE',
  'contains',
  'POI',
  33,
  'RE',
  'contains',
  'POI',
  33,
  'RE',
  'contains',
  'EOP']]

In [87]:
print(aL)

[[9, 'POI', 55, 'RE', 'contains', 'POI', 55, 'RE', 'capital', 'POI', 55, 'RE', 'contains', 'POI', 55, 'RE', 'capital', 'EOP'], [37, 'POI', 55, 'RE', 'contains', 'POI', 55, 'RE', 'capital', 'POI', 55, 'RE', 'contains', 'POI', 55, 'RE', 'capital', 'EOP']]


In [357]:
print(i_adjacencyList[666f])

[3, 4, 'POI', 24, 'RE', 'contains', 'POI', 44, 'RE', 'contains', 'EOP', 14, 'POI', 44, 'RE', 'neighborhood_of', 'EOP', 20, 16, 'POI', 20, 'RE', 'neighborhood_of', 'EOP', 18, 19, 'POI', 20, 'RE', 'neighborhood_of', 'EOP', 20, 'POI', 20, 16, 'RE', 'contains', 'POI', 18, 19, 'RE', 'contains', 'EOP', 22, 23, 'POI', 24, 'RE', 'neighborhood_of', 'EOP', 24, 'POI', 22, 23, 'RE', 'contains', 'EOP', 44, 'POI', 14, 'RE', 'contains', 'EOP']


In [355]:
print(adjacencyList[666])

[20, 'POI', 15, 'RE', 'contains', 'EOP', 20, 'POI', 22, 'RE', 'contains', 'EOP', 14, 'POI', 15, 'RE', 'neighborhood_of', 'EOP', 14, 'POI', 14, 'RE', 'contains', 'EOP', 14, 'POI', 14, 'RE', 'contains', 'EOP', 14, 'POI', 14, 'RE', 'contains', 'EOP', 24, 'POI', 24, 'RE', 'neighborhood_of', 'EOP', 20, 'POI', 20, 'RE', 'neighborhood_of', 'EOP', 20, 'POI', 20, 'RE', 'contains', 'EOP', 20, 'POI', 20, 'RE', 'neighborhood_of', 'EOP']


In [323]:
relationMentions[666]

[{'em1Text': 'Brooklyn',
  'em2Text': 'Brooklyn Heights',
  'label': '/location/location/contains'},
 {'em1Text': 'Queens',
  'em2Text': 'Rego Park',
  'label': '/location/location/contains'},
 {'em1Text': 'Inwood',
  'em2Text': 'Manhattan',
  'label': '/location/neighborhood/neighborhood_of'},
 {'em1Text': 'Manhattan',
  'em2Text': 'Inwood',
  'label': '/location/location/contains'},
 {'em1Text': 'Brooklyn',
  'em2Text': 'Fort Greene',
  'label': '/location/location/contains'},
 {'em1Text': 'New York',
  'em2Text': 'Manhattan',
  'label': '/location/location/contains'},
 {'em1Text': 'Rego Park',
  'em2Text': 'Queens',
  'label': '/location/neighborhood/neighborhood_of'},
 {'em1Text': 'Brooklyn Heights',
  'em2Text': 'Brooklyn',
  'label': '/location/neighborhood/neighborhood_of'},
 {'em1Text': 'New York',
  'em2Text': 'Queens',
  'label': '/location/location/contains'},
 {'em1Text': 'Fort Greene',
  'em2Text': 'Brooklyn',
  'label': '/location/neighborhood/neighborhood_of'}]

### Make Adjacency List of Test Data

In [301]:
# Clean words
t_sentWords = [[''.join([t_replaceDict.get(i, i) for i in e]) for e in eT] for eT in t_sentWords]

In [302]:
ti_entityLabels = []
for i in range(len(t_sentWords)):
    eL = []
    t_sDict = list(enumerate(t_sentWords[i]))
    j = 0
    for item in t_entlabWords[i]:
        el = []
        for e in item:
            while j < len(t_sDict):
                if e == t_sDict[j][1]:
                    el.append(t_sDict[j][0])
                    j += 1
                    break
                j += 1
        eL.append(el)
    ti_entityLabels.append(eL)

In [303]:
ti_trueLables = [[i for i, j in enumerate(rl) if j != 'None'] for rl in t_relationLabels]
t_trueMentions = [[j for i, j in enumerate(rl) if j['label'] != 'None'] for rl in t_relationMentions]
pad = ['POI', 'RE', 'EOP']
t_adjacencyList = []
for n in range(len(t_sentWords)): 
    e1 = []
    e2 = []
    aL = []
    for l in ti_trueLables[n]:
        for item in ti_entityLabels[n]:
            if ' '.join([t_sentWords[n][i] for i in item]) == ' '.join(t_em1Words[n][l]):
                e1.append(item[0])
            if ' '.join([t_sentWords[n][i] for i in item]) == ' '.join(t_em2Words[n][l]):
                e2.append(item[0])
    c = [(a, b) for a in e1 for b in e2]
    r = c[min([(i, abs(j[0]-j[1])) for i, j in enumerate(c)], key=lambda x: x[-1])[0]]
    aL = aL + [r[0], pad[0], r[1], pad[1], t_relationLabels[n][l], pad[2]]
    t_adjacencyList.append(aL)

# Modify 
t_adjacencyList[26] = [34, 'POI', 36, 'RE', 'founders', 'EOP']
t_adjacencyList[191] = [22, 'POI', 36, 'RE', 'country', 'EOP', 33, 'POI', 36, 'RE', 'country', 'EOP']

In [336]:
set([i[4] for i in t_adjacencyList])

{'administrative_divisions',
 'capital',
 'children',
 'company',
 'contains',
 'country',
 'founders',
 'nationality',
 'neighborhood_of',
 'place_lived',
 'place_of_birth',
 'place_of_death'}

In [372]:
t_adjacencyList.index([20, 'POI', 32, 'RE', 'administrative_divisions', 'EOP'])

271

In [373]:
t_relationMentions[271]

[{'em1Text': 'Giorgio', 'em2Text': 'Jalisco', 'label': 'None'},
 {'em1Text': 'Jalisco', 'em2Text': 'Giorgio', 'label': 'None'},
 {'em1Text': 'Giorgio', 'em2Text': 'Mexico', 'label': 'None'},
 {'em1Text': 'Mexico', 'em2Text': 'Giorgio', 'label': 'None'},
 {'em1Text': 'Mexico',
  'em2Text': 'Jalisco',
  'label': '/location/country/administrative_divisions'},
 {'em1Text': 'Jalisco', 'em2Text': 'Mexico', 'label': 'None'}]

In [332]:
t_adjacencyList

[[1, 'POI', 4, 'RE', 'place_lived', 'EOP'],
 [27, 'POI', 28, 'RE', 'capital', 'EOP'],
 [11, 'POI', 15, 'RE', 'place_lived', 'EOP'],
 [31, 'POI', 34, 'RE', 'contains', 'EOP'],
 [22, 'POI', 7, 'RE', 'country', 'EOP'],
 [17, 'POI', 15, 'RE', 'contains', 'EOP'],
 [4, 'POI', 10, 'RE', 'company', 'EOP'],
 [48, 'POI', 46, 'RE', 'contains', 'EOP'],
 [29, 'POI', 25, 'RE', 'contains', 'EOP'],
 [25, 'POI', 28, 'RE', 'company', 'EOP'],
 [34, 'POI', 32, 'RE', 'country', 'EOP'],
 [25, 'POI', 21, 'RE', 'contains', 'EOP'],
 [31, 'POI', 10, 'RE', 'country', 'EOP'],
 [12, 'POI', 13, 'RE', 'country', 'EOP'],
 [18, 'POI', 14, 'RE', 'contains', 'EOP'],
 [12, 'POI', 15, 'RE', 'nationality', 'EOP'],
 [50, 'POI', 48, 'RE', 'contains', 'EOP'],
 [37, 'POI', 18, 'RE', 'country', 'EOP'],
 [18, 'POI', 0, 'RE', 'contains', 'EOP'],
 [39, 'POI', 13, 'RE', 'country', 'EOP'],
 [11, 'POI', 7, 'RE', 'children', 'EOP'],
 [57, 'POI', 15, 'RE', 'country', 'EOP'],
 [13, 'POI', 21, 'RE', 'country', 'EOP'],
 [2, 'POI', 29, 'RE

In [297]:
t_trueMentions[365]

[{'em1Text': 'Ehud Olmert',
  'em2Text': 'Israel',
  'label': '/people/person/nationality'},
 {'em1Text': 'Ehud Olmert',
  'em2Text': 'Israel',
  'label': '/people/person/nationality'}]

In [292]:
t_relationMentions[38]

[{'em1Text': 'Condoleezza Rice', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Condoleezza Rice', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Iran', 'label': 'None'},
 {'em1Text': 'Iran', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Russia',
  'em2Text': 'Iran',
  'label': '/location/administrative_division/country'},
 {'em1Text': 'Iran', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Condoleezza Rice', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Condoleezza Rice', 'label': 'None'},
 {'em1Text': 'Condoleezza Rice', 'em2Text': 'Iran', 'label': 'None'},
 {'em1Text': 'Iran', 'em2Text': 'Condoleezza Rice', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Berlin', 'label': 'None'},
 {'em1Text': 'Berlin', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Condolee

In [286]:
' '.join(t_sentWords[38])

'The administration is now put in a position of playing defense as we are finally seeing the international consequences of the rather dramatic internal transformation inside Russia with the erosion of democracy a new ruling class a massive transfer of property rights from so-called oligarchs to basically friends of Putin most of whom are from the old K.G.B. Senior administration officials said the new effort to reach out to Russia already had been put into action and included Secretary of State Condoleezza Rice s consultation with her Russian counterpart Foreign Minister Sergey V. Lavrov during meetings in Berlin on Feb 21 and 22 about whether Russia had the appetite to pursue a second Security Council resolution for sanctions against Iran'

In [287]:
t_sentText[38]

'"The administration is now put in a position of playing defense , as we are finally seeing the international consequences of the rather dramatic internal transformation inside Russia with the erosion of democracy , a new ruling class , a massive transfer of property rights from so-called oligarchs to , basically , friends of Putin , most of whom are from the old K.G.B. \'\' Senior administration officials said the new effort to reach out to Russia already had been put into action , and included Secretary of State Condoleezza Rice \'s consultation with her Russian counterpart , Foreign Minister Sergey V. Lavrov , during meetings in Berlin on Feb. 21 and 22 about whether Russia had the appetite to pursue a second Security Council resolution for sanctions against Iran ."\r\n'

In [271]:
t_adjacencyList[38]

[105, 'POI', 119, 'RE', 'country', 'EOP']

In [272]:
t_entityLabels[38]

['Russia', 'Russia', 'Condoleezza Rice', 'Berlin', 'Russia', 'Iran']

In [273]:
ti_entityLabels[38]

[[26], [69], [81, 82], [97], [105], [119]]

In [282]:
t_relationMentions[191]

[{'em1Text': 'Czech Republic', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Czech Republic', 'label': 'None'},
 {'em1Text': 'Poland', 'em2Text': 'Iran', 'label': 'None'},
 {'em1Text': 'Iran', 'em2Text': 'Poland', 'label': 'None'},
 {'em1Text': 'Czech Republic', 'em2Text': 'Poland', 'label': 'None'},
 {'em1Text': 'Poland', 'em2Text': 'Czech Republic', 'label': 'None'},
 {'em1Text': 'United States', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'United States', 'label': 'None'},
 {'em1Text': 'Poland', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Poland', 'label': 'None'},
 {'em1Text': 'United States', 'em2Text': 'Czech Republic', 'label': 'None'},
 {'em1Text': 'Czech Republic', 'em2Text': 'United States', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Washington', 'label': 'None'},
 {'em1Text': 'Washington', 'em2Text': 'Russia', 'label': 'None'},
 {'em1Text': 'Russia', 'em2Text': 'Russia', 'label':

In [180]:
[i for i in enumerate(ti_trueLables) if len(i[-1]) > 1]

[(26, [2, 5]),
 (30, [7, 16]),
 (47, [2, 3]),
 (52, [5, 8]),
 (73, [13, 24]),
 (172, [3, 6]),
 (191, [15, 21]),
 (236, [0, 5]),
 (262, [1, 3]),
 (268, [4, 9]),
 (293, [2, 5]),
 (313, [5, 10]),
 (316, [13, 16]),
 (347, [3, 16]),
 (365, [3, 10])]

In [281]:
t_trueLables = [[i for i in rl if i != 'None'] for rl in t_relationLabels]

In [305]:
len([i for item in t_relationMentions for i in item if i['label'] != 'None'])

410

In [306]:
set([len(i) for i in t_adjacencyList])

{6, 12}

In [307]:
import numpy as np
np.max([len(i) for i in t_adjacencyList])

12

In [26]:
#len([i for i in i_adjacencyList if len(i) > 20])

In [308]:
all(al[1] == 'POI' for al in t_adjacencyList)

True

# Build Dateset

In [361]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from nltk import WordNetLemmatizer

In [364]:
wnl = WordNetLemmatizer()
sentWords = [[wnl.lemmatize(t.lower()) for t in toks] for toks in sentWords]
t_sentWords = [[wnl.lemmatize(t.lower()) for t in toks] for toks in t_sentWords]
tok_sentWords = sentWords.copy()
tok_sentWords.extend(t_sentWords)
tokTexts = [' '.join(i) for i in tok_sentWords]

In [365]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(tokTexts)
word2index = tokenizer.word_index
index2word = {i: w for w, i in word2index.items()}
print('Found %s unique tokens.' % len(word2index))

Found 71038 unique tokens.


In [366]:
#new_sentText = [' '.join(i) for i in sentWords]
#sentSeq = tokenizer.texts_to_sequences(new_sentText)
#sentData = pad_sequences(sentSeq, maxlen=MAX_SENT_LEN, padding='post', truncating='post')

#t_new_sentText = [' '.join(i) for i in t_sentWords]
#t_sentSeq = tokenizer.texts_to_sequences(t_new_sentText)
#t_sentData = pad_sequences(t_sentSeq, maxlen=120, padding='post', truncating='post')

token2index = dict((j, i+120) for i, j in enumerate(['PAD']+padDict+t_relationTypes))
token2index['PAD'] = 0
index2token = {i: w for w, i in token2index.items()}
#newi_adjacencyList = [[token2index[i] if i in token2index else i+1 for i in aL] for aL in i_adjacencyList]
#newi_adjacencyList = pad_sequences(newi_adjacencyList, maxlen=MAX_ADJL_LEN, padding='post', truncating='pre')
newti_adjacencyList = [[token2index[i] if i in token2index else i+1 for i in aL] for aL in t_adjacencyList]
newti_adjacencyList = pad_sequences(newti_adjacencyList, maxlen=20, padding='post', truncating='post')

In [368]:
SEED = 42

In [369]:
y_test = newti_adjacencyList

y_test, _, _, _ = train_test_split(y_test, y_test, test_size=0., random_state=SEED)

In [41]:
#np.max([int(i) for aL in i_adjacencyList for i in aL if i not in padDict+t_relationTypes])

In [42]:
# Split the data into a training set, a validation set and a test set
x_train_all = sentData
y_train_all = newi_adjacencyList

x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.15, random_state=SEED)
x_train_all, _, y_train_all, _  = train_test_split(x_train_all, y_train_all, test_size=0., random_state=SEED)
 
x_test = t_sentData
y_test = newti_adjacencyList

x_test, _, y_test, _ = train_test_split(x_test, y_test, test_size=0., random_state=SEED)

# Save Dataset

In [370]:
import h5py
import pickle

In [44]:
fh = h5py.File('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/allData_gla.h5', 'w')
fh['x_train'] = x_train
fh['y_train'] = y_train
fh['x_val'] = x_val
fh['y_val'] = y_val
fh['x_train_all'] = x_train_all
fh['y_train_all'] = y_train_all
fh['x_test'] = x_test
fh['y_test'] = y_test
fh['embedding'] = embedding
fh.close()

In [45]:
with open('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/index_gla.pkl', 'wb') as fp:
    pickle.dump((word2index, index2word, token2index, index2token), fp, -1)

In [371]:
with open('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/newy.pkl', 'wb') as fp:
    pickle.dump((y_test), fp, -1)

# Checkpoint

In [48]:
import h5py
import pickle

In [49]:
with h5py.File('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/allData_w2v.h5', 'r') as fh:
    x_train = fh['x_train'][:]
    y_train = fh['y_train'][:]
    x_val = fh['x_val'][:]
    y_val = fh['y_val'][:]
    x_train_all = fh['x_train_all'][:]
    y_train_all = fh['y_train_all'][:]
    x_test = fh['x_test'][:]
    y_test = fh['y_test'][:]
    embedding = fh['embedding'][:]

In [50]:
with open('/Users/lizhn7/Downloads/DATA/nyt/experiment_3_2/index_w2v.pkl', 'rb') as fp:
    word2index, index2word, token2index, index2token = pickle.load(fp)