# 1. Install, Imports, and Settings

In [1]:
# Standard library
import json
import pprint

# Third-party
import pandas as pd
import spacy_udpipe  # sentence segmentation based on https://arxiv.org/pdf/2011.07868.pdf

# Make sure the English UDPipe model is available, then load it as the
# pipeline used by the cells below.
spacy_udpipe.download("en")
nlp = spacy_udpipe.load("en")

# Shared pretty-printer for inspecting nested JSON records.
pp = pprint.PrettyPrinter(indent=4)

Already downloaded a model for the 'en' language


In [2]:
# Lift pandas' display truncation for columns, rows and cell width.
# Replace None with a number (e.g. 1000 / 1000 / 199) to cap the output again.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# 2. Data and Functions

In [6]:
# Module-level accumulators for the extraction step.
# - persons_professions: starts empty; not written by the code in this cell.
# - bag_of_professions: set that extract_professions adds professions to.
# - non_identified: titles for which no target expression was detected.
persons_professions = dict()
bag_of_professions = set()
non_identified = list()

def get_target_expression(first_sentence):
    """Return the text that follows the first "<verb> <article>" phrase.

    The phrases tried are every combination of the verbs ``is``, ``was``,
    ``served as`` with the articles ``a``, ``an``, ``the``. They are tried in
    that priority order (all ``is`` phrases before any ``was`` phrase, and so
    on), and matching is case-sensitive.

    Args:
        first_sentence: The sentence to search (a ``str``).

    Returns:
        Everything after the matched phrase, including the leading space
        (e.g. ``" Swedish professor ..."``), or the sentinel string
        ``"profession-non-detected"`` when no phrase is found.
    """
    import re  # local import so this definition does not depend on another cell

    verbs = ['is', 'was', 'served as']
    articles = ['a', 'an', 'the']
    for verb in verbs:
        for art in articles:
            exp = ' '.join([verb, art])
            # Word boundaries stop 'is a' from firing inside "is an" or
            # inside words such as "This a ...".
            match = re.search(r'\b' + re.escape(exp) + r'\b', first_sentence)
            if match:
                # Keep the whole remainder, even if the phrase occurs again
                # later in the sentence.
                return first_sentence[match.end():]
    return "profession-non-detected"
            
def get_professions(target):
    """Extract profession names from a target phrase (not implemented yet).

    Args:
        target: The text returned by ``get_target_expression``.

    Returns:
        Nothing yet. The caller, ``extract_professions``, adds the result to
        ``bag_of_professions``, so an implementation should return an
        iterable of profession strings.

    Raises:
        NotImplementedError: Always, until the extraction logic is written.
    """
    # The definition previously had no body, which is a SyntaxError and
    # prevented every function in this cell from being defined.
    raise NotImplementedError("get_professions has not been implemented yet")

def extract_professions(file):
    """Collect professions from the first sentence of every record in a JSON file.

    For each record, the first sentence of ``person['Content']`` is passed to
    ``get_target_expression``. When no target is detected, the record's
    ``person['title']`` is appended to the module-level ``non_identified``
    list. Otherwise the result of ``get_professions(target)`` is added to the
    module-level ``bag_of_professions`` set.

    Args:
        file: Path of a JSON file containing a list of records, each with
            the keys ``'Content'`` and ``'title'``.

    Returns:
        None. Results are accumulated in the module-level containers.
    """
    # The context manager closes the file even if parsing fails.
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)

    for person in data:
        doc = nlp(person['Content'])
        # Empty content yields no sentences; fall back to "" so the record is
        # reported as non-identified instead of raising StopIteration.
        first_sentence = str(next(doc.sents, ""))
        target = get_target_expression(first_sentence)

        if target == 'profession-non-detected':
            non_identified.append(person['title'])
        else:
            # update() mutates the shared set in place. The previous `+=`
            # made the name local to this function (UnboundLocalError) and
            # is not supported between a set and a list anyway.
            bag_of_professions.update(get_professions(target))

# 3. Load chunk of data (example)

In [14]:
 # Opening JSON file
# Read one chunk of the dataset; the context manager closes the file even if
# json.load raises. JSON text is UTF-8 by specification.
with open("data/wiki-Living-Persons/wiki-Living-People-p0.json", encoding="utf-8") as f:
    data = json.load(f)

# Show a single record as an example of the data layout.
pp.pprint(data[4])

{   'Content': 'Susanne E. Aalto (born 28 November 1964) is a Swedish '
               'professor of radio astronomy geodesy at the Onsala Space '
               'Observatory in the department of Space, Earth and Environment '
               'at Chalmers University of Technology. She has been a professor '
               'of radio astronomy since 2013. Between 1994 and 1999, she was '
               'a postdoc at the Steward Observatory, University of Arizona '
               'and at Caltech in the United States of America. In 1999, she '
               'was awarded the Albert Wallin Prize by the Royal Society for '
               'Science and Knowledge in Gothenburg. She researches galaxy '
               'evolution and motion using radio telescopes and radiation from '
               'molecules.\n'
               '\n'
               '\n'
               '== Early life and education ==\n'
               'She was born on 28 November 1964 in Eskilstuna, Sweden. In '
               "1994,

# 4. First Iteration - Sentence Segmentation and NER Experiments

In [38]:
# Stress test for the sentence splitter: decimal numbers and abbreviations
# whose periods should not be treated as sentence ends.
# Alternative input: "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world."
text = "This is a long string with some numbers 123.456,78 or 100.000 and e.g. some abbreviations in it, which shouldn\'t split the sentence. Sometimes there are problems, i.e. in this one. here and abbr at the end x.y.. cool."


doc = nlp(text)
sentences = list(doc.sents)

# Print each detected sentence followed by a blank line so the boundaries
# are easy to see.
for sentence in sentences:
    print(sentence)
    print()

This is a long string with some numbers 123.456,78 or 100.000 and e.g. some abbreviations in it, which shouldn't split the sentence.

Sometimes there are problems, i.e. in this one.

here and abbr at the end x.y.. cool.



In [40]:
# Run the sentence splitter on the lead paragraph of a real biography.
# Alternative input: "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world."
text = "Susanne E. Aalto (born 28 November 1964) is a Swedish professor of radio astronomy geodesy at the Onsala Space Observatory in the department of Space, Earth and Environment at Chalmers University of Technology. She has been a professor of radio astronomy since 2013. Between 1994 and 1999, she was a postdoc at the Steward Observatory, University of Arizona and at Caltech in the United States of America. In 1999, she was awarded the Albert Wallin Prize by the Royal Society for Science and Knowledge in Gothenburg. She researches galaxy evolution and motion using radio telescopes and radiation from molecules."

doc = nlp(text)
sentences = list(doc.sents)

# Print each detected sentence followed by a blank line so the boundaries
# are easy to see.
for sentence in sentences:
    print(sentence)
    print()

Susanne E. Aalto (born 28 November 1964) is a Swedish professor of radio astronomy geodesy at the Onsala Space Observatory in the department of Space, Earth and Environment at Chalmers University of Technology.

She has been a professor of radio astronomy since 2013.

Between 1994 and 1999, she was a postdoc at the Steward Observatory, University of Arizona and at Caltech in the United States of America.

In 1999, she was awarded the Albert Wallin Prize by the Royal Society for Science and Knowledge in Gothenburg.

She researches galaxy evolution and motion using radio telescopes and radiation from molecules.



In [45]:
# Only the opening sentence of a biography is needed for profession detection.
# Alternative input: "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world."
text = "Susanne E. Aalto (born 28 November 1964) is a Swedish professor of radio astronomy geodesy at the Onsala Space Observatory in the department of Space, Earth and Environment at Chalmers University of Technology. She has been a professor of radio astronomy since 2013. Between 1994 and 1999, she was a postdoc at the Steward Observatory, University of Arizona and at Caltech in the United States of America. In 1999, she was awarded the Albert Wallin Prize by the Royal Society for Science and Knowledge in Gothenburg. She researches galaxy evolution and motion using radio telescopes and radiation from molecules."

doc = nlp(text)

# Pull just the first item from the sentence generator.
print(next(iter(doc.sents)))


Susanne E. Aalto (born 28 November 1964) is a Swedish professor of radio astronomy geodesy at the Onsala Space Observatory in the department of Space, Earth and Environment at Chalmers University of Technology.


In [42]:
# Display the attribute itself: it is a generator rather than a list, which
# is why the cells above use next() or build a list from it.
doc.sents

<generator at 0x7f96de445b80>

In [28]:
!pip install spacy-udpipe

zsh:1: command not found: pip


In [2]:
import stanza

# English stanza pipeline with tokenization and named-entity recognition.
# NOTE: this rebinds `nlp`, replacing the spacy-udpipe pipeline loaded in the
# first cell. Re-run that cell before using the earlier cells that read
# `doc.sents`.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,ner",
)

HBox(children=(HTML(value='Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/res…




2022-05-25 05:47:52 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-05-25 05:47:52 INFO: Use device: cpu
2022-05-25 05:47:52 INFO: Loading: tokenize
2022-05-25 05:47:52 INFO: Loading: ner
2022-05-25 05:47:53 INFO: Done loading processors!


entity: Chris E. Manning	type: PERSON
entity: Stanford University	type: ORG
entity: the Bay Area	type: LOC


In [8]:
# Run NER on a short sentence and list every entity with its type,
# one "entity: ...<TAB>type: ..." line per entity.
doc = nlp("Chris E. Manning is a the president")
print('\n'.join(
    f'entity: {ent.text}\ttype: {ent.type}'
    for sent in doc.sentences
    for ent in sent.ents
))

entity: Chris E. Manning	type: PERSON


In [15]:
from stanfordcorenlp import StanfordCoreNLP

# Build the CoreNLP wrapper from the local 4.4.0 distribution folder.
# NOTE: this rebinds `nlp` once more. The recorded run of this cell failed
# with psutil.AccessDenied, so the cell below had no working client.
nlp = StanfordCoreNLP(
    'stanford-corenlp-4.4.0/',
    lang='en',
    memory='8g',
)

AccessDenied: psutil.AccessDenied (pid=94265)

In [None]:
# The sentence you want to parse
sentence = 'I eat a big and red apple.'

try:
    # POS
    print('POS：', nlp.pos_tag(sentence))

    # Tokenize
    print('Tokenize：', nlp.word_tokenize(sentence))

    # NER
    print('NER：', nlp.ner(sentence))

    # Parser
    print('Parser：')
    print(nlp.parse(sentence))
    print(nlp.dependency_parse(sentence))
finally:
    # Close Stanford Parser even if one of the calls above raises, so the
    # client is not left open.
    nlp.close()