In [1]:
import re
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
# from benepar.spacy_plugin import BeneparComponent

# Build the shared pipeline used by every extractor below (they all read the
# module-level `nlp`).
# NOTE(review): `EntityRuler(nlp, ...)` and `nlp.add_pipe(<component object>)`
# look like the spaCy v2 calling convention - confirm the pinned spaCy version.
nlp = spacy.load('en_core_web_md')

# Rule-based entity patterns added on top of the model:
# - the exact phrase "the Manhattan Project" is labelled ORG;
mhp_pattern = {"label": "ORG", "pattern": "the Manhattan Project"}
# - one or more title-cased tokens followed by "University" (e.g. "Copenhagen University");
univ_pattern1 = {
    "label": "ORG",
    "pattern": [{ 'IS_TITLE': True, 'OP': '+' },
                { 'ORTH': 'University'}]
}
# - "University of" followed by one or more title-cased tokens.
univ_pattern2 = {
    "label": "ORG",
    "pattern": [
        { 'ORTH': 'University' },
        { 'ORTH': 'of' },
        { 'IS_TITLE': True, 'OP': '+' }]
}

# Surnames forced to the PERSON label.
names = [{'label': 'PERSON', 'pattern': p } for p in ['Feynman', 'Bohr']]

patterns = [mhp_pattern, univ_pattern1, univ_pattern2] + names

ruler = EntityRuler(nlp, validate=True)
ruler.add_patterns(patterns)

# The ruler is inserted before the model's `ner` component and the
# `merge_entities` component is inserted after it.
nlp.add_pipe(ruler, before="ner")
nlp.add_pipe(nlp.create_pipe("merge_entities"), after="ner") # merge_noun_chunks
# nlp.add_pipe(BeneparComponent('benepar_en'))

  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,


In [2]:
def test(fn, expected, txt):    
    try:
        # result = fn(nlp(txt))
        result = fn(txt)
        assert expected == result
    except AssertionError as e:
        print('Failed: ', txt)
        print('Expected: ', expected)
        print('Result: ', expected)

In [3]:
# Render the dependency parse of a sample sentence inline in the notebook.
txt = 'Louis Alexander Slotin (1 December 1910 – 30 May 1946) was a Canadian physicist and chemist who took part in the Manhattan Project.'
doc = nlp(txt)
spacy.displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

In [30]:
# Render the dependency parse of a two-sentence paragraph; this rebinds the
# module-level `txt` and `doc` set by the previous cell.
txt = "When Richard was five his mother gave birth to a younger brother, Henry Phillips, who died at age four weeks. Four years later, Richard's sister Joan was born and the family moved to Far Rockaway, Queens."
doc = nlp(txt)
spacy.displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

In [38]:
# Sample sentences containing "born", used to try out the birth extractors.
# NOTE(review): the names case1..case7 are reassigned with unrelated
# alma-mater sentences in a later cell, so their value depends on which cell
# ran last.
case1 = 'Oppenheimer was born in New York City on April 22, 1904, to Julius Oppenheimer, a wealthy Jewish textile importer who had immigrated to the United States from Germany in 1888, and Ella Friedman, a painter.'
case2 = 'Enrico Fermi was born in Rome, Italy, on 29 September 1901.'
case3 = 'Von Neumann was born in Budapest, Kingdom of Hungary, which was then part of the Austro-Hungarian Empire.'
case4 = 'Bethe was born in Strasbourg, which was then part of Germany, on July 2, 1906, the only child of Anna (née Kuhn) and Albrecht Bethe, a privatdozent of physiology at the University of Strasbourg.'
case5 = 'Bohr was born in Copenhagen, Denmark, on 7 October 1885'
case6 = 'Albert Einstein was born in Ulm, in the Kingdom of Württemberg in the German Empire, on 14 March 1879.'
case7 = "When Richard was five his mother gave birth to a younger brother, Henry Phillips, who died at age four weeks. Four years later, Richard's sister Joan was born and the family moved to Far Rockaway, Queens."

def extract_birthdate(text):
    """Return the DATE entity of the first sentence that looks like a birth statement.

    The text is only parsed when it contains the substring "born". A sentence
    qualifies when its first passive subject (dependency ``nsubjpass``) is the
    pronoun "he"/"she" or a proper noun (tag ``NNP``); the first entity
    labelled ``DATE`` in that sentence is returned.

    Args:
        text: Raw text; may contain several sentences.

    Returns:
        The matching date entity as produced by the global ``nlp`` pipeline,
        or ``None`` when no sentence yields one.
    """
    if not re.search('born', text):
        return None

    doc = nlp(text)

    for s in doc.sents:
        subj = next((t for t in s if t.dep_ == 'nsubjpass'), None)

        # A sentence without a passive subject says nothing about a birth;
        # keep scanning instead of giving up on the whole paragraph.
        if subj is None:
            continue

        if subj.lower_ in ('she', 'he') or subj.tag_ == 'NNP':
            date = next((e for e in s.ents if e.label_ == 'DATE'), None)
            # Only stop once a date was actually found; otherwise a later
            # sentence may still carry it.
            if date is not None:
                return date

    return None

extract_birthdate(case6)

14 March 1879

In [6]:
def extract_birthplace(text):
    """Return the text of the first GPE entity governed by the word "born".

    The text is parsed only when it contains the substring "born". For each
    sentence, every entity labelled ``GPE`` is examined: starting from the
    head of the entity's root token, the dependency chain is climbed towards
    the sentence ROOT, and the entity text is returned as soon as an ancestor
    reached by that climb has the text "born".

    Returns ``None`` when "born" is absent or no such entity is found.
    """
    if re.search('born', text) is None:
        return None

    parsed = nlp(text)

    for sentence in parsed.sents:
        for entity in sentence.ents:
            if entity.label_ != 'GPE':
                continue

            ancestor = entity.root.head
            # Step upwards first, then test: the direct head itself is not
            # compared against "born", only the tokens above it.
            while ancestor.dep_ != 'ROOT':
                ancestor = ancestor.head
                if ancestor.text == 'born':
                    return entity.text

    return None


extract_birthplace("Haroutune Krikor Daghlian Jr., of Armenian-American descent, was born in Waterbury, Connecticut, on May 4, 1921, one of three children of Margaret Rose (born as Currie) and Haroutune Krikor Daghlian.")

'Waterbury'

In [7]:
# Cancer

def extract_cancer_fact(text):
    """Tell whether ``text`` states that someone had, or died of, cancer.

    Cases:
    - diagnosed with * cancer
    - died of cancer
    - death from cancer

    A trigger is a preposition attached to a specific governor ("of" -> "died",
    "from" -> "death", "with" -> "diagnosed"). The fact holds when the subtree
    of any trigger contains the token "cancer".

    Args:
        text: Raw text, possibly several sentences.

    Returns:
        ``None`` when the text does not contain "cancer" at all, ``True`` when
        a trigger phrase covers the token "cancer", ``False`` otherwise.
    """
    # QUESTION: How would comp linguist handle this case?
    # CASE: "Feynman PROPN was AUX diagnosed VERB with ADP liposarcoma, PROPN a DET rare ADJ form NOUN of ADP cancer."

    if not re.search('cancer', text):
        return None

    doc = nlp(text)

    # preposition text -> text of the governor it must be attached to
    triggers = {'of': 'died', 'from': 'death', 'with': 'diagnosed'}

    for w in doc:
        governor = triggers.get(w.text)
        if governor is None or w.head.text != governor:
            continue

        # Keep scanning when this trigger does not cover "cancer": a later
        # trigger in the same text may ("died of pneumonia ... died of cancer").
        if any(t.text == 'cancer' for t in w.subtree):
            return True

    return False


# Checks for extract_cancer_fact: `test` prints a report only when the value
# returned for the sentence differs from the expected one.
test(extract_cancer_fact, True, 'Oppenheimer was a chain smoker who was diagnosed with throat cancer in late 1965.')
test(extract_cancer_fact, True, 'Fifty days later he died of stomach cancer at age 53 in his home in Chicago.')
test(extract_cancer_fact, True, "Shortly before his death from cancer, von Neumann headed the United States government's top secret ICBM committee, which would sometimes meet in his home. Its purpose was to decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon.")
test(extract_cancer_fact, True, 'In 1955, von Neumann was diagnosed with what was either bone, pancreatic or prostate cancer.')
test(extract_cancer_fact, True, 'In 1978, Feynman sought medical treatment for abdominal pains and was diagnosed with liposarcoma, a rare form of cancer. Surgeons removed a tumor the size of a football that had crushed one kidney and his spleen.')
# Negative case: "cancer" is present but not attached to any trigger phrase.
test(extract_cancer_fact, False, 'In astrology, cancer is the cardinal sign of the Water trigon... Who cares?')

In [8]:
# Nobel Prize Award

def extract_nobel_prize_fact(text):
    """Tell whether ``text`` states that someone received a Nobel Prize.

    Cases:
    - was awarded the 1938 Nobel Prize in Physics
    - won the 1967 Nobel Prize in Physics
    - He received the 1921 Nobel Prize in Physics
    - Feynman received the Nobel Prize in Physics in 1965

    Args:
        text: Raw text, possibly several sentences.

    Returns:
        ``None`` when the text does not contain "Nobel Prize"; otherwise
        ``True`` when the head of the first entity containing "Nobel Prize" is
        one of "received", "won" or "awarded", else ``False`` (also when the
        pipeline produced no entity containing "Nobel Prize").
    """
    if not re.search('Nobel Prize', text):
        return None

    doc = nlp(text)

    prize = next((e for e in doc.ents if re.search('Nobel Prize', e.text)), None)

    # The phrase can be present in the raw text without being recognised as
    # (part of) an entity; treat that as "fact not established" rather than
    # failing with AttributeError on `None.root`.
    if prize is None:
        return False

    # TODO: try to use syns from WordNet
    return prize.root.head.text in ('received', 'won', 'awarded')

# Checks for extract_nobel_prize_fact: `test` prints a report only when the
# value returned for the sentence differs from the expected one.
test(extract_nobel_prize_fact, True, 'Fermi held several patents related to the use of nuclear power, and was awarded the 1938 Nobel Prize in Physics for his work on induced radioactivity by neutron bombardment and for the discovery of transuranium elements.')
test(extract_nobel_prize_fact, True, 'Hans Albrecht Bethe (German: [ˈhans ˈalbʁɛçt ˈbeːtə]; July 2, 1906 – March 6, 2005) was a German-American nuclear physicist who made important contributions to astrophysics, quantum electrodynamics and solid-state physics, and won the 1967 Nobel Prize in Physics for his work on the theory of stellar nucleosynthesis')
test(extract_nobel_prize_fact, True, 'Niels Henrik David Bohr (Danish: [nels ˈpoɐ̯ˀ]; 7 October 1885 – 18 November 1962) was a Danish physicist who made foundational contributions to understanding atomic structure and quantum theory, for which he received the Nobel Prize in Physics in 1922.')
test(extract_nobel_prize_fact, True, 'He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory.')
test(extract_nobel_prize_fact, True, "For contributions to the development of quantum electrodynamics, Feynman received the Nobel Prize in Physics in 1965 jointly with Julian Schwinger and Shin'ichirō Tomonaga.")
# Known weakness: the prize is attributed to whoever the page is about, even
# when the sentence credits somebody else.
# TODO: [FALSE-POSITIVE] (J._Robert_Oppenheimer) Two years later, Carl David Anderson discovered the positron, for which he received the 1936 Nobel Prize in Physics.

In [9]:
# MHP

def extract_mhp_involvement(text):
    """Decide whether ``text`` states involvement in the Manhattan Project.

    Phrasings this is meant to cover:
    - When he joined the Manhattan Project in 1942,
    - where he worked on the Manhattan Project during World War II
    - During World War II, von Neumann worked on the Manhattan Project with
      theoretical physicist Edward Teller
    - which in turn led to his involvement in the Manhattan Project
    - Bethe's work at Los Alamos included calculating the critical mass ...

    Returns ``None`` when the text mentions neither "Los Alamos" nor
    "Manhattan Project", ``True`` when one of the rules below fires, and
    ``False`` otherwise.
    """
    if re.search(r'(Los Alamos)|(Manhattan Project(\'s)?)', text) is None:
        return None

    entities = nlp(text).ents

    # Lemmas (verbs and nouns) that may govern the preposition, or the
    # "... laboratory" token, the project entity hangs off.
    linking_lemmas = {
        'work', 'serve', 'take', 'participate', 'associate', 'invite', 'join',
        'career', 'involvement', 'participant', 'director', 'leader', 'role',
        'position',
    }

    project = next(
        (ent for ent in entities if re.search('Manhattan Project', ent.text)),
        None,
    )

    if project:
        governor = project.root.head

        # Rule 1: the entity is the direct dependent of "joined"/"worked".
        if governor.text in ('joined', 'worked'):
            return True

        # Rule 2: the entity hangs off a preposition (or a token containing
        # "laboratory") whose own head is one of the linking lemmas.
        indirect = governor.dep_ == 'prep' or 'laboratory' in governor.lower_
        if indirect and governor.head.lemma_ in linking_lemmas:
            return True

    site = next(
        (ent for ent in entities if re.search('Los Alamos', ent.text)),
        None,
    )

    # Rule 3: "<work|remain> <prep> Los Alamos" (surface forms, not lemmas).
    if site:
        link = site.root.head
        if link.dep_ == 'prep' and link.head.text in ('work', 'remain'):
            return True

    return False

In [10]:
# Ad-hoc run on a two-sentence paragraph where the project appears in the
# possessive ("the Manhattan Project's Los Alamos Laboratory").
txt = "A graduate of the University of Birmingham, Titterton worked in a research position under Mark Oliphant, who recruited him to work on radar for the British Admiralty during the first part of the Second World War. In 1943, he joined the Manhattan Project's Los Alamos Laboratory, where he helped develop the first atomic bombs. "
extract_mhp_involvement(txt)

True

In [11]:
# Checks for extract_mhp_involvement: `test` prints a report only when the
# value returned for the sentence differs from the expected one.
# NOTE(review): the "He emigrated to the United States ..." sentence is checked
# twice in this cell.
test(extract_mhp_involvement, True, 'He emigrated to the United States, where he worked on the Manhattan Project during World War II.')
test(extract_mhp_involvement, True, 'Louis Alexander Slotin (1 December 1910 – 30 May 1946) was a Canadian physicist and chemist who took part in the Manhattan Project.')
test(extract_mhp_involvement, True, 'In 1942, he was invited to participate in the Manhattan Project.')
test(extract_mhp_involvement, True, 'Arnold Kramish (June 6, 1923  – June 15, 2010) was an American nuclear physicist and author who was associated with the Manhattan Project.')
test(extract_mhp_involvement, True, 'Robert Rathbun Wilson (March 4, 1914 – January 16, 2000) was an American physicist known for his work on the Manhattan Project during World War II, as a sculptor, and as an architect of the Fermi National Accelerator Laboratory (Fermilab), where he was the first director from 1967 to 1978.')
test(extract_mhp_involvement, True, "A graduate of the University of Birmingham, Titterton worked in a research position under Mark Oliphant, who recruited him to work on radar for the British Admiralty during the first part of the Second World War. In 1943, he joined the Manhattan Project's Los Alamos Laboratory, where he helped develop the first atomic bombs. ")
test(extract_mhp_involvement, True, 'When he joined the Manhattan Project in 1942, Oppenheimer wrote on his personal security questionnaire that he [Oppenheimer] had been "a member of just about every Communist Front organization on the West Coast".')
test(extract_mhp_involvement, True, 'He emigrated to the United States, where he worked on the Manhattan Project during World War II.')
test(extract_mhp_involvement, True, "During World War II, von Neumann worked on the Manhattan Project with theoretical physicist Edward Teller, mathematician Stanisław Ulam and others, problem solving key steps in the nuclear physics involved in thermonuclear reactions and the hydrogen bomb.")
test(extract_mhp_involvement, True, "This led him to a large number of military consultancies, primarily for the Navy, which in turn led to his involvement in the Manhattan Project.")
test(extract_mhp_involvement, True, "Bethe's work at Los Alamos included calculating the critical mass and efficiency of uranium-235 and the multiplication of nuclear fission in an exploding atomic bomb.")
test(extract_mhp_involvement, True, "Bohr did not remain at Los Alamos, but paid a series of extended visits over the course of the next two years.")
test(extract_mhp_involvement, True, 'Feynman nominally held an appointment at the University of Wisconsin–Madison as an assistant professor of physics, but was on unpaid leave during his involvement in the Manhattan Project.')
test(extract_mhp_involvement, True, "He played a number of key roles in the early development of nuclear energy, as a participant in the Manhattan Project, a member of the U.S. Atomic Energy Commission (AEC), and U.S. ambassador to the International Atomic Energy Agency (IAEA).")
# Sentences not yet turned into checks:
# TODO: "From there, he was flown to Britain, where he joined the British Tube Alloys nuclear weapons project, and was part of the British mission to the Manhattan Project."
# TODO: 'After the attack on Pearl Harbor had brought the United States into the war, Feynman was recruited by Robert R. Wilson, who was working on means to produce enriched uranium for use in an atomic bomb, as part of what would become the Manhattan Project.'
# TODO: "Some say that as a result of Einstein's letter and his meetings with Roosevelt, the US entered the \"race\" to develop the bomb, drawing on its \"immense material, financial, and scientific resources\" to initiate the Manhattan Project."
# TODO: test_mhp_involvement(True, "In June 1942, the US Army established the Manhattan Project to handle its part in the atom bomb project and began the process of transferring responsibility from the Office of Scientific Research and Development to the military. In September, Groves was appointed director of what became known as the Manhattan Project. He selected Oppenheimer to head the project's secret weapons laboratory.")
# TODO: "Joseph William Kennedy (May 30, 1916 – May 5, 1957) was an American chemist who was a co-discoverer of plutonium, along with Glenn T. Seaborg, Edwin McMillan and Arthur Wahl. During World War II he was head of the CM (Chemistry and Metallurgy) Division at the Manhattan Project's Los Alamos Laboratory"

In [12]:
# Alma Mater

def extract_alma_mater(text):
    """Collect ORG entities that ``text`` presents as a place of study.

    Phrasings this is meant to cover:
    - Instead, he attended the Massachusetts Institute of Technology,
    - Feynman received a Ph.D. from Princeton in 1942
    - Einstein was awarded a PhD by the University of Zürich
    - In 1903, Bohr enrolled as an undergraduate at Copenhagen University.
    - Bethe entered the University of Frankfurt in 1924.
    - Bethe entered the University of Munich in April 1926
    - von Neumann also entered Pázmány Péter University in Budapest

    Returns ``None`` when the text contains none of the trigger substrings or
    the pipeline finds no ORG entity; otherwise a (possibly empty) list of the
    matching entities, in document order.
    """
    if re.search(r'attend|enter|enroll|apply|applied|went|go|accept', text) is None:
        return None

    organisations = [ent for ent in nlp(text).ents if ent.label_ == 'ORG']
    if not organisations:
        return None

    direct_verbs = ('attended', 'entered', 'enrolled')   # verb -> ORG
    prep_verb_lemmas = ('accept', 'apply', 'go')          # verb -> prep -> ORG

    matches = []

    # receive Phd from ORG is not handled yet (r'Ph\.?D\.?')
    for org in organisations:
        governor = org.root.head

        # The two rules are tested independently, as two separate checks.
        if governor.text in direct_verbs:
            matches.append(org)

        if governor.dep_ == 'prep' and governor.head.lemma_ in prep_verb_lemmas:
            matches.append(org)

    return matches

# Sample education sentences for trying out extract_alma_mater.
# NOTE(review): case1..case7 rebind names already assigned in the birth-date
# cell earlier in this file, so re-running that cell's call afterwards picks up
# these sentences instead.
case1 = 'Instead, he attended the Massachusetts Institute of Technology, where he joined the Pi Lambda Phi fraternity.'
case2 = "Bethe entered the University of Munich in April 1926, where Sommerfeld took him on as a student on Meissner's recommendation."
case3 = 'In 1903, Bohr enrolled as an undergraduate at Copenhagen University.'
case4 = 'Feynman received a Ph.D. from Princeton in 1942'
case5 ='As a result, Einstein was awarded a PhD by the University of Zürich, with his dissertation A New Determination of Molecular Dimensions.'
case6 = "Fermi graduated from high school in July 1918, and at Amidei's urging applied to the Scuola Normale Superiore in Pisa."
case7 = 'At the same time, von Neumann also entered Pázmány Péter University in Budapest, as a Ph.D. candidate in mathematics.'
case8 = 'Having passed his abitur, Bethe entered the University of Frankfurt in 1924.'
case9 = 'He graduated as a chemical engineer from ETH Zurich in 1926 (although Wigner says that von Neumann was never very attached to the subject of chemistry), and passed his final examinations for his Ph.D. in mathematics simultaneously with his chemical engineering degree, of which Wigner wrote, "Evidently a Ph.D. thesis and examination did not constitute an appreciable effort."'
case10 = 'This was not something that von Neumann had much knowledge of, so it was arranged for him to take a two-year, non-degree course in chemistry at the University of Berlin, after which he sat for the entrance exam to the prestigious ETH Zurich, which he passed in September 1923'
case11 = 'He then went to the University of Göttingen on a grant from the Rockefeller Foundation to study mathematics under David Hilbert.'
case12 = 'In 1926, Oppenheimer left Cambridge for the University of Göttingen to study under Max Born.'
case13 = "In 1924, Oppenheimer was informed that he had been accepted into Christ's College, Cambridge."
case14 = 'He entered Harvard College one year after graduation, at age 18, because he suffered an attack of colitis while prospecting in Joachimstal during a family summer vacation in Europe.'

# Display the extractor's result for the "went to <ORG>" phrasing.
extract_alma_mater(case11)

[University of Göttingen]

In [41]:
def extract_facts_v1(lines):
    """First-generation fact collector: project involvement and birth place.

    Each line is first tested for Manhattan Project involvement; only lines
    for which that test is falsy are searched for a birth place. A later
    birth place overwrites an earlier one.

    Returns a dict that may contain ``'mhp_involvement'`` (always ``True``
    when present) and ``'birth_place'``.
    """
    collected = {}

    for line in lines:
        if extract_mhp_involvement(line):
            collected['mhp_involvement'] = True
            continue

        place = extract_birthplace(line)
        if place:
            collected['birth_place'] = place

    return collected

def extract_facts_v2(lines):
    """Run every extractor over ``lines`` and merge the findings into one dict.

    Args:
        lines: Iterable of text chunks (the notebook passes the paragraphs of
            a Wikipedia page).

    Returns:
        dict with the key ``'edu_places'`` (list, possibly empty) always
        present, plus whichever of ``'mhp_involvement'``, ``'birthdate'``,
        ``'birth_place'``, ``'had_cancer'`` and ``'recieved_nobel_prize'`` were
        established. For ``'birthdate'`` and ``'birth_place'`` the value from
        the last matching line wins.
    """
    facts = {}
    alma_mater = []

    for l in lines:
        if extract_mhp_involvement(l):
            facts['mhp_involvement'] = True

        birthdate = extract_birthdate(l)
        if birthdate:
            # Key spelled 'birthdate' (was 'brithdate') so that it is the key
            # cmp_birthdate looks up.
            facts['birthdate'] = birthdate

        birthplace = extract_birthplace(l)
        if birthplace:
            facts['birth_place'] = birthplace

        if extract_cancer_fact(l):
            facts['had_cancer'] = True

        # The spelling 'recieved' is kept on purpose: cmp_nobel_prize_fact
        # reads exactly this key.
        if extract_nobel_prize_fact(l):
            facts['recieved_nobel_prize'] = True

        edu_places = extract_alma_mater(l)
        if edu_places:
            alma_mater.extend(edu_places)

    facts['edu_places'] = alma_mater

    return facts

In [44]:
import wikipedia
import spacy

# NOTE(review): this rebinds the module-level `nlp` that every extract_*
# function reads. The pipeline loaded here is a different model
# ('en_core_web_sm' instead of 'en_core_web_md') and this cell adds neither the
# EntityRuler patterns nor `merge_entities`, so all extractors run without the
# custom rules from here on - confirm this is intended.
nlp = spacy.load('en_core_web_sm')

# Fetch the article and run all extractors over its newline-separated content.
page = wikipedia.page('John_von_Neumann')
extract_facts_v2(page.content.split('\n'))

{'mhp_involvement': True,
 'birth_place': 'Budapest',
 'had_cancer': True,
 'edu_places': [Pázmány Péter University, the University of Göttingen]}

In [None]:
import json
import re

from collections import defaultdict
from datetime import datetime

def load_corpus(f):
    """Read a corpus file into ``{section name: [non-empty lines]}``.

    A section starts at a header line of the form ``#### <name> ####``; every
    following non-blank line (stripped of surrounding whitespace) is appended
    to that section until the next header. A repeated header name restarts
    that section.

    Args:
        f: Path of the corpus file (read as UTF-8).

    Returns:
        dict mapping each section name to the list of its lines, in file order.
        Lines that appear before the first usable header are ignored.
    """
    data = {}
    name = None

    # Explicit encoding: the corpus contains non-ASCII names and must not
    # depend on the platform's default locale.
    with open(f, encoding='utf-8') as lines:
        for l in lines:
            l = l.strip()

            if not l:
                continue

            if l.startswith('####'):
                m = re.match(r'####\s(.+)\s####', l)
                # Tolerate headers without the surrounding spaces instead of
                # failing on `None.group`.
                name = (m.group(1) if m else l.strip('#').strip()) or None
                if name is not None:
                    data[name] = []
            elif name is not None:
                data[name].append(l)

    return data
    
def cmp_mlp_involvement(p1, p2):
    """Compare the project-involvement flag of a reference record and extracted facts.

    ``p1`` is treated as involved (``True``) when it lacks the key, whereas a
    missing key in ``p2`` counts as ``None``; the two values are compared with
    ``==``.
    """
    reference_flag = p1.get('mhp_involvement', True)
    extracted_flag = p2.get('mhp_involvement', None)
    return reference_flag == extracted_flag

def cmp_birth_place(p1, p2):
    """Check that the extracted birth place occurs inside the reference one.

    This is a substring test: ``p2['birth_place']`` (default ``'undefined'``)
    must be contained in ``p1['birth_place']`` (default ``''``).
    """
    extracted_place = p2.get('birth_place', 'undefined')
    reference_place = p1.get('birth_place', '')
    return extracted_place in reference_place

def cmp_birthdate(p1, p2):
    """Tell whether the reference and the extracted birth dates are the same day.

    Args:
        p1: Reference record; ``p1['birthdate']`` is a string whose last ten
            characters are dropped before parsing it as ``%Y-%m-%d``
            (presumably a ``...T00:00:00Z`` timestamp suffix - confirm against
            the data file).
        p2: Extracted facts; ``p2['birthdate']`` is converted with ``str()``
            and parsed as ``"July 2, 1906"`` or ``"2 July 1906"``.

    Returns:
        ``True`` when both dates are present, parseable and equal; ``False``
        in every other case.
    """
    reference_raw = p1.get('birthdate', None)
    extracted_raw = p2.get('birthdate', None)

    if not (reference_raw and extracted_raw):
        return False

    try:
        # strptime parses text into a datetime; the former strftime call
        # formats a datetime and therefore always failed on a string.
        reference = datetime.strptime(reference_raw[0:-10], '%Y-%m-%d')
    except ValueError:
        return False

    # str() lets the extracted value be a plain string or a parsed text span.
    extracted_text = str(extracted_raw).strip()

    for fmt in ('%B %d, %Y', '%d %B %Y'):
        try:
            return datetime.strptime(extracted_text, fmt) == reference
        except ValueError:
            continue

    # Relative or partial dates ("Four years later") cannot be compared.
    return False
    
def cmp_nobel_prize_fact(p1, p2):
    """Compare the reference awards list with the extracted Nobel Prize flag.

    The reference side counts as a laureate when any entry of
    ``p1['awards']`` (default: no awards) contains "Nobel"; the extracted side
    is ``p2['recieved_nobel_prize']`` (default ``False``).
    """
    is_laureate = any(re.search('Nobel', award) for award in p1.get('awards', []))
    return is_laureate == p2.get('recieved_nobel_prize', False)

def calc_score(p1, p2, cmp):
    """Return 1 when ``cmp(p1, p2)`` is truthy, otherwise 0."""
    if cmp(p1, p2):
        return 1
    return 0

def calc_avg_score(p1, p2):
    """Return the fraction of fact comparisons that agree for one person.

    Three comparisons are made (project involvement, birth place, Nobel
    Prize), so the result is one of 0, 1/3, 2/3 or 1.

    Args:
        p1: Reference record.
        p2: Extracted facts.
    """
    # Use the comparator names as they are actually defined in this file
    # (the previous spellings `cmp_mhp_ivolvement` / `cmp_brith_place` do not
    # exist and raised NameError).
    checks = (cmp_mlp_involvement, cmp_birth_place, cmp_nobel_prize_fact)

    agreed = sum(1 for check in checks if check(p1, p2))

    return agreed / len(checks)

    
# Evaluation: for every reference record in physicists.json, extract facts from
# the matching section of wikipedia.txt and count, per fact type, how often the
# extracted value agrees with the reference.
with open('physicists.json', 'r') as json_file:
    physicists = json.load(json_file)
    pages = load_corpus('wikipedia.txt')
    
    total = 0
    # NOTE(review): `mhp` and the three *_score counters below are initialised
    # but never updated in this cell; only `stats` is reported.
    mhp = 0
    
    mhp_score = 0
    bp_score = 0
    bd_score = 0
    
    # label -> number of people for whom that comparison succeeded
    stats = defaultdict(int)
    
    for p in physicists:
        # print(total)
        # Raises KeyError when the corpus has no section named like the record.
        page  = pages[p['name']]
              
        total += 1
        
        facts = extract_facts_v2(page)
        
        # TODO: Add cancer fact check
        # TODO: Add birthdate fact check
        # TODO: Add alma mater facts check

        for k, fn in [('mhp', cmp_mlp_involvement),
                      ('birth place', cmp_birth_place),
                      ('nobel prize', cmp_nobel_prize_fact)]:
            stats[k] += calc_score(p, facts, fn)

        # nobel_score += calc_score(p, facts, cmp_nobel_prize_fact)
        
        # print(facts)

# Report each count as a fraction of the number of people processed.
# NOTE(review): `stats` is empty when physicists.json holds no records, so
# nothing is printed (and nothing divided) in that case.
for label, score in stats.items():
    print(label + ': ' + str(score / total))

In [None]:
import nltk

# Experiment: chunk a POS-tagged sentence with an nltk regular-expression grammar.
sentence = 'Louis Alexander Slotin (1 December 1910 – 30 May 1946) was a Canadian physicist and chemist who took part in the Manhattan Project.'

# One chunk rule named VB: a VBD tag, any number of NN tags, then an IN tag.
grammar = ('''
    VB: {<VBD><NN>*<IN>} # NP
    ''')

chunkParser = nltk.RegexpParser(grammar)
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
# NOTE(review): `tree` holds the chunking result but is not displayed; the cell
# output is the plain (word, tag) list in `tagged`.
tree = chunkParser.parse(tagged)
tagged