In [31]:
import spacy
from spacy import displacy
from collections import Counter
from bs4 import BeautifulSoup
import requests
import re

# Named Entity Recognition

In [32]:
# Load the `en_core_web_sm` spaCy model. Note that `nlp` is rebound to
# `en_core_web_lg` in the coreference section further down, so cells that call
# `nlp(...)` depend on which of the two load cells ran most recently.
nlp = spacy.load('en_core_web_sm')

In [33]:
# Read the whole text file into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data file.
with open('mobydick_shortened.txt', 'r', encoding='utf-8') as f:
    moby_dick_txt = f.read()

In [34]:
# Run the currently loaded spaCy pipeline over the full text; the cells below
# read `doc.ents` and `doc.sents` from the result.
doc = nlp(moby_dick_txt)

In [4]:
# Tally how often each entity label occurs in the Moby Dick doc and print the
# (label, count) pairs from most to least frequent.
labels = [entity.label_ for entity in doc.ents]
label_counts = Counter(labels)

for label_and_count in label_counts.most_common():
    print(label_and_count)

('PERSON', 2045)
('CARDINAL', 1015)
('ORG', 899)
('GPE', 611)
('NORP', 556)
('DATE', 344)
('ORDINAL', 259)
('LOC', 214)
('TIME', 201)
('QUANTITY', 140)
('WORK_OF_ART', 91)
('PRODUCT', 75)
('FAC', 69)
('LANGUAGE', 31)
('MONEY', 12)
('LAW', 10)
('EVENT', 9)


In [5]:
# Materialise the sentence spans so they can be indexed by position later.
sentences_mbd = list(doc.sents)

In [6]:
# Show the text inline in the notebook with recognised entities highlighted.
displacy.render(doc, style='ent', jupyter=True)

In [7]:
# Optional: draw the dependency parse of one sample sentence (index 6).
# The sentence is converted to plain text and run through `nlp` again
# before rendering.
sample_sentence_text = str(sentences_mbd[6])
dep_render_options = {'distance': 120}
displacy.render(nlp(sample_sentence_text), style='dep', jupyter=True, options=dep_render_options)

In [37]:
# Download the article page. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on a 4xx/5xx
# response instead of silently running NER over an error page.
response = requests.get(
    'https://www.nytimes.com/2019/06/03/technology/facebook-ftc-antitrust.html',
    timeout=30,
)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

# Remove <script>, <style> and <aside> elements so their contents do not end
# up in the extracted text.
for tag in soup(["script", "style", 'aside']):
    tag.extract()

# Replace every run of newlines/tabs with a single space.
article_text = " ".join(re.split(r'[\n\t]+', soup.get_text()))

nyt_article = nlp(article_text)
# Last expression of the cell: number of entities found in the article.
len(nyt_article.ents)

174

In [9]:
# Tally how often each entity label occurs in the NYT article and print the
# (label, count) pairs from most to least frequent.
labels = [entity.label_ for entity in nyt_article.ents]
nyt_label_counts = Counter(labels)

for label_and_count in nyt_label_counts.most_common():
    print(label_and_count)

('ORG', 77)
('PERSON', 30)
('DATE', 24)
('CARDINAL', 18)
('GPE', 11)
('NORP', 9)
('PERCENT', 5)
('TIME', 5)
('ORDINAL', 4)
('LOC', 3)
('PRODUCT', 2)
('WORK_OF_ART', 1)
('MONEY', 1)


In [10]:
# Materialise the article's sentence spans into a list.
sentences_nyt = list(nyt_article.sents)

In [11]:
# Show the article inline in the notebook with recognised entities highlighted.
displacy.render(nyt_article, style='ent', jupyter=True)

# Coreference resolution

In [35]:
# Coreference setup: bring in neuralcoref, switch `nlp` to the
# `en_core_web_lg` model, and attach the coreference component to it.
# From here on every `nlp(...)` call goes through this pipeline.
import neuralcoref

nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f08fada79e8>

In [38]:
#doc = nlp('My sister has a dog. She loves him.')

def printMentions(doc):
    """Print a header, then the mentions of every coreference cluster.

    Reads ``doc._.coref_clusters`` and prints each cluster's ``mentions``
    attribute on its own line, in cluster order. Returns nothing.
    """
    header = '\nAll the "mentions" in the given text:'
    print(header)
    mention_groups = (cluster.mentions for cluster in doc._.coref_clusters)
    for mentions in mention_groups:
        print(mentions)

def printPronounReferencesNET(doc):
    """Print "pronoun => antecedent" only where the antecedent is a named entity.

    For every token tagged ``PRON`` that takes part in a coreference cluster,
    each of its clusters is inspected; a line is printed when the text of the
    cluster's main mention exactly equals the text of one of ``doc.ents``.

    The entity filter is applied inside the cluster loop (the cluster has to
    exist before it can be tested) and compares plain strings, since the
    main mention's ``.text`` is a string while ``doc.ents`` holds spans. The
    entities are taken from ``doc`` itself, so the function works on any
    processed document rather than on one particular notebook global.

    Args:
        doc: a processed document exposing ``ents`` and, per token, ``pos_``,
            ``text``, ``_.in_coref`` and ``_.coref_clusters``.
    """
    print('\nPronouns and their references:')
    # Build the lookup once; set membership keeps the per-cluster check cheap.
    entity_texts = {ent.text for ent in doc.ents}
    for token in doc:
        if token.pos_ != 'PRON' or not token._.in_coref:
            continue
        for cluster in token._.coref_clusters:
            main_text = cluster.main.text
            if main_text in entity_texts:
                print(token.text + " => " + main_text)

def processDocNET(text):
    """Run the pipeline on ``text`` and print its entity-filtered coreference report.

    Nothing is printed when the processed document has no coreference
    (``doc._.has_coref`` is falsy). Otherwise prints the input text, all
    cluster mentions, and the pronoun references restricted to named
    entities.

    Args:
        text: raw text to process with the module-level ``nlp`` pipeline.
    """
    doc = nlp(text)
    if doc._.has_coref:
        print("Given text: " + text)
        printMentions(doc)
        # Use the entity-filtered variant defined alongside this function; the
        # unfiltered printPronounReferences is only defined in a later cell.
        printPronounReferencesNET(doc)

# Run the coreference report on the text extracted from the scraped article.
processDocNET(article_text)

Given text:      Antitrust Troubles Snowball for Tech Giants as Lawmakers Join In - The New York Times                                                                       SectionsSEARCHSkip to contentSkip to site indexTechnologySubscribeLog InLog InToday’s PaperTechnology|Antitrust Troubles Snowball for Tech Giants as Lawmakers Join InAdvertisementSupported byAntitrust Troubles Snowball for Tech Giants as Lawmakers Join InImageThe Justice Department will oversee antitrust complaints about Apple while the Federal Trade Commission will take on Facebook.CreditCreditTom Brenner for The New York TimesBy Cecilia Kang, David Streitfeld and Annie KarniJune 3, 2019WASHINGTON — The federal government is stepping up its scrutiny of the world’s biggest tech companies, leaving them vulnerable to new rules and federal lawsuits. Regulators are divvying up antitrust oversight of the Silicon Valley giants and lawmakers are investigating whether they have stifled competition and hurt consumers.After a

In [41]:
def printPronounReferences(doc):
    """Print "pronoun => main mention" for every coreferent pronoun in ``doc``.

    A token qualifies when its ``pos_`` is ``'PRON'`` and ``_.in_coref`` is
    truthy; one line is printed per cluster the token belongs to, using the
    text of the cluster's ``main`` mention. Returns nothing.
    """
    print('\nPronouns and their references:')
    coref_pronouns = (tok for tok in doc if tok.pos_ == 'PRON' and tok._.in_coref)
    for pronoun in coref_pronouns:
        for cluster in pronoun._.coref_clusters:
            antecedent = cluster.main.text
            print(pronoun.text + " => " + antecedent)

def processDoc(text):
    """Run the pipeline on ``text`` and print its coreference report.

    Prints nothing when the processed document has no coreference
    (``doc._.has_coref`` is falsy). Otherwise prints the input text, then all
    cluster mentions, then every pronoun with its main mention.
    """
    doc = nlp(text)
    if not doc._.has_coref:
        return
    print("Given text: " + text)
    printMentions(doc)
    printPronounReferences(doc)

# Short narrative used to exercise the resolver. To see where it struggles,
# change 'the water' to 'it' in the sample text and re-run.
kaylee_story = '“Kaylee went to grab the water to stop the fire Pa had started. She held the pail tightly in hopes that the slick handle would not cause it to slip from her grasp. She ran to the well, only stopping to catch her breath along the way. It was a long run. She reached the well and dunked the bucket into the water.”'
processDoc(kaylee_story)

Given text: Trees are one of our most important natural resources. They are made of wood, and wood can be made into a variety of products. Some of the more obvious kinds are furniture, houses, and toothpicks. However, wood can also be made into paper. When I first heard this, I was skeptical, but it is true. Paper is a very important product in our society. Writers and artists have greatly benefited from the invention of paper. With only some paper and a pen or pencil, a writer can produce stories and poems that can captivate readers. They can also write down historical facts about their society. Actually, these writings don’t become historical until years later. At the time, the writings could probably be considered news. Artists use paper for their drawings and paintings. They can also use canvas. Drawings and paintings can be very beautiful. They can depict a wide variety of subjects, including flowers, animals, landscapes, and people. They can be realistic or impressionistic. Some 