In [1]:
import spacy
# Name of the spaCy pipeline used for all entity and POS tagging in this notebook.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [2]:
# Read the whole article into memory. The context manager closes the file handle
# as soon as the text has been read instead of leaving it open for the kernel's lifetime.
# NOTE(review): the encoding is pinned to UTF-8 so that non-ASCII names decode the
# same way on every platform -- confirm the dataset file is actually saved as UTF-8.
with open("dataset-nyt-nobel2020.txt", encoding="utf-8") as dataset:
    content = dataset.read()

In [3]:
# Baseline pass: run the stock spaCy pipeline over the full article and list every
# entity it reports. Not all entities of interest are identified.
doc = nlp(content)
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Nobel Prize Winners WORK_OF_ART
October DATE
Sweden GPE
Norway GPE
last week DATE
Physiology or Medicine WORK_OF_ART
Monday DATE
Alfred Nobel PERSON
The Nobel Prizes WORK_OF_ART
most years DATE
Stockholm GPE
Oslo GPE
December DATE
Stockholm GPE
Nobelists NORP
2021 DATE
Oslo GPE
most years DATE
Nobel WORK_OF_ART
last month DATE
10 million CARDINAL
Swedish NORP
1 million CARDINAL
the previous year DATE
about $112,000 MONEY
2020 DATE
Harvey J. Alter PERSON
Michael Houghton PERSON
Charles M. Rice PERSON
Monday DATE
Nobel WORK_OF_ART
three CARDINAL
millions CARDINAL
Roger Penrose PERSON
Reinhard Genzel PERSON
Andrea Ghez PERSON
the Nobel Prize in Physics WORK_OF_ART
2020.Credit CARDINAL
Fredrik Sandberg PERSON
Roger Penrose PERSON
Reinhard Genzel PERSON
Andrea Ghez PERSON
Tuesday DATE
The Nobel Prize WORK_OF_ART
Chemistry GPE
Wednesday DATE
Emmanuelle Charpentier PERSON
Jennifer A. Doudna PERSON
Crispr-Cas9 ORG
The Nobel Prize WORK_OF_ART
Thursday DATE
Louise Glück PERSON
one CARDINAL
Ameri

In [4]:
# Take a closer look at the three entity types that matter for this task,
# printing one group per label with a blank line between groups:
#   WORK_OF_ART -> prize categories
#   PERSON      -> winners
#   DATE        -> announcement dates
labels_of_interest = ("WORK_OF_ART", "PERSON", "DATE")
for position, wanted_label in enumerate(labels_of_interest):
    if position > 0:
        print("")
    for ent in doc.ents:
        if ent.label_ == wanted_label:
            print(ent.text, ent.label_)

Nobel Prize Winners WORK_OF_ART
Physiology or Medicine WORK_OF_ART
The Nobel Prizes WORK_OF_ART
Nobel WORK_OF_ART
Nobel WORK_OF_ART
the Nobel Prize in Physics WORK_OF_ART
The Nobel Prize WORK_OF_ART
The Nobel Prize WORK_OF_ART
The Nobel Peace Prize WORK_OF_ART
the Nobel WORK_OF_ART

Alfred Nobel PERSON
Harvey J. Alter PERSON
Michael Houghton PERSON
Charles M. Rice PERSON
Roger Penrose PERSON
Reinhard Genzel PERSON
Andrea Ghez PERSON
Fredrik Sandberg PERSON
Roger Penrose PERSON
Reinhard Genzel PERSON
Andrea Ghez PERSON
Emmanuelle Charpentier PERSON
Jennifer A. Doudna PERSON
Louise Glück PERSON
Paul R. Milgrom PERSON
Robert B. Wilson PERSON

October DATE
last week DATE
Monday DATE
most years DATE
December DATE
2021 DATE
most years DATE
last month DATE
the previous year DATE
2020 DATE
Monday DATE
Tuesday DATE
Wednesday DATE
Thursday DATE
Friday DATE
Monday DATE


In [5]:
# The winners and announcement dates are all extracted correctly,
# so we can use them as the person and time entities.
# The Nobel Prize categories, however, are not extracted correctly by spaCy.

In [6]:
# The Nobel Prize categories are a fixed, well-known set, so instead of relying on
# NER we match them by exact phrase. The match positions are used afterwards to
# split the text into one piece per prize for event extraction and linking.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
terms = ["Physiology or Medicine", "Physics", "Chemistry", "Literature", "Peace Prize", "Economic Science"]

patterns = [nlp.make_doc(text) for text in terms]
# Pass the patterns as a single list: the older `add(key, None, *patterns)` call
# form was removed in spaCy v3, while the list form is accepted from v2.2.2 onwards.
matcher.add("Match_By_Phrase", patterns)
matches = matcher(doc)

# Parallel lists with one entry per match, in the order the matcher returned them.
prizeType = [doc[start:end].text for _, start, end in matches]  # matched category label text
prizeIndex = [start for _, start, _ in matches]  # token index where each label starts

In [7]:
# Split the document into one span per matched prize label and store the spans in a list.
# Each piece starts right after the label's last token (the match's `end`) and runs
# up to the start of the next label. Slicing from `start + 1` instead would only skip
# the first token, so multi-token labels such as "Physiology or Medicine" would leak
# their remaining tokens ("or Medicine") into the piece.
# `matches` holds the (match_id, start, end) tuples that prizeType / prizeIndex were
# built from, so docPiece[i] still corresponds to prizeType[i].
docPiece = []
for (_, _, label_end), (_, next_start, _) in zip(matches, matches[1:]):
    docPiece.append(doc[label_end:next_start])
if matches:  # with no matched label there is nothing to split (avoids an IndexError)
    docPiece.append(doc[matches[-1][2]:])

In [8]:
# Look at the dependency and POS tags of one example piece of text.
# These tags are what the event and entity linking below relies on.
for token in docPiece[5]:
    print(f"{token.text} --> {token.dep_} --> {token.pos_}")

was --> auxpass --> AUX
jointly --> advmod --> ADV
awarded --> ROOT --> VERB
on --> prep --> ADP
Wednesday --> pobj --> PROPN
to --> dative --> ADP
Emmanuelle --> compound --> PROPN
Charpentier --> pobj --> PROPN
and --> cc --> CCONJ
Jennifer --> compound --> PROPN
A. --> compound --> PROPN
Doudna --> conj --> PROPN
for --> prep --> ADP
their --> poss --> DET
work --> pobj --> NOUN
on --> prep --> ADP
the --> det --> DET
development --> pobj --> NOUN
of --> prep --> ADP
Crispr --> compound --> PROPN
- --> punct --> PUNCT
Cas9 --> pobj --> PROPN
, --> punct --> PUNCT
a --> det --> DET
method --> appos --> NOUN
for --> prep --> ADP
genome --> amod --> ADJ
editing --> pobj --> NOUN
. --> punct --> PUNCT

 -->  --> SPACE


In [9]:
# Use POS tags for entity and relation extraction
# For this helper function, got the idea from the following website
# https://www.analyticsvidhya.com/blog/2019/09/introduction-information-extraction-python-spacy/

def sub_matcher(doc):
    """Extract the components of one award event from a piece of text.

    Recipients and the event date are read from the named entities in
    ``doc.ents``; the auxiliary, verb and object are read from the POS and
    dependency tags of the tokens in ``doc``.

    Returns a 5-tuple of strings ``(personcombined, aux, verb, prize, time)``:
      * personcombined -- all recipients joined as "A, B, and C"
      * aux            -- first AUX token seen before any VERB
      * verb           -- first VERB token
      * prize          -- first ``dobj`` token, only when no auxiliary was found
      * time           -- last DATE entity that is a single, non-numeric word
    Components that are not found are returned as the empty string.
    """
    recipients = []
    time = ''

    for ent in doc.ents:
        kind = ent.label_
        # PERSON and ORG entities count as recipients; hyphenated names are skipped
        if kind in ('PERSON', 'ORG') and '-' not in ent.text:
            recipients.append(ent.text)
        # ignore purely numeric dates and multi-word dates; a later date overrides an earlier one
        if kind == 'DATE' and not ent.text.isdigit() and ' ' not in ent.text:
            time = ent.text

    aux = ''
    verb = ''
    prize = ''
    for tok in doc:
        pos = tok.pos_
        if pos == "AUX":
            # only the first auxiliary that precedes the first verb is kept;
            # an AUX token is never considered as the object
            if not verb and not aux:
                aux = tok.text
            continue
        if pos == "VERB":
            if verb:
                # any verb after the first one is skipped entirely
                continue
            verb = tok.text
        # the direct object is only recorded when no auxiliary has been seen so far
        if tok.dep_.endswith("dobj") and not aux and not prize:
            prize = tok.text

    # combine all recipients into one string for the output
    if len(recipients) > 1:
        personcombined = ', '.join(recipients[:-1]) + ', and ' + recipients[-1]
    else:
        personcombined = ''.join(recipients)

    return personcombined, aux, verb, prize, time

In [10]:
# Extract one event from every piece of text in docPiece and keep all of them in eventList.
# Only events that have a date are printed, one per prize, in the form:
#   Prize name:
#   Event=<label, space, time>
space = '_'  # placeholder: no location is extracted for these events
eventList = []
for idx, piece in enumerate(docPiece):
    event = sub_matcher(piece)
    eventList.append(event)
    recipients, aux, verb, prize, when = event
    if not when:
        continue
    label = ' '.join((recipients, aux, verb, prize))
    print(f"{prizeType[idx]}:")
    print(f"Event=<{label}, {space} , {when}>")


Physiology or Medicine:
Event=<Alfred Nobel  wrapped approaches, _ , December>
Physiology or Medicine:
Event=<Harvey J. Alter, Michael Houghton, and Charles M. Rice  received prize, _ , Monday>
Physics:
Event=<Fredrik Sandberg, Roger Penrose, Reinhard Genzel, and Andrea Ghez  received prize, _ , Tuesday>
Chemistry:
Event=<Emmanuelle Charpentier, and Jennifer A. Doudna was awarded , _ , Wednesday>
Literature:
Event=<Louise Glück was awarded , _ , Thursday>
Peace Prize:
Event=<the World Food Program was awarded , _ , Friday>
Economic Science:
Event=<Paul R. Milgrom, and Robert B. Wilson were awarded , _ , Monday>


In [11]:
# For the first item of the event output, the recall is 0,
# because it wrongly links an unrelated person and date to Physiology or Medicine as well.

# For the remaining items of the event output, the recall is 1,
# because all winners and dates are linked correctly to their respective prize.