In [1]:
# Tokenization is always the first step of any text-processing pipeline:
# spaCy segments the text into words, punctuation marks, symbols and other
# tokens by applying rules specific to each language.
#
# One-time model download (run in a shell):
#   python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Emit one token per line.
print("\n".join(tok.text for tok in doc))

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [2]:
# With the text tokenized, the typical next step is part-of-speech (POS)
# tagging, which exposes the grammatical property (noun, verb, adjective,
# etc.) of each word.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza")

# One "<text> <POS tag>" pair per line.
print("\n".join("{} {}".format(word.text, word.pos_) for word in doc))

She PRON
ate VERB
the DET
pizza NOUN


In [3]:
# Named Entity Recognition
# Named entities are "real world objects" that are assigned a name, for
# example a person, an organization or a country.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Pair every recognised entity span with its label, then print one per line.
labelled_entities = [(span.text, span.label_) for span in doc.ents]
for entity_text, entity_label in labelled_entities:
    print(entity_text, entity_label)

Apple ORG
U.K. GPE
$1 billion MONEY


In [4]:
# Similarity
# By default, the similarity returned by spaCy is the cosine similarity
# between two vectors.
import spacy

# The larger "md" model is loaded here because it comes with vectors.
nlp = spacy.load("en_core_web_md")

# Build the two documents to compare, then score them against each other.
doc1, doc2 = nlp("I like fast food"), nlp("I like pizza")
similarity_score = doc1.similarity(doc2)
print(similarity_score)

0.8627203210548107


In [1]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get`` so that an unresponsive host makes
            the call fail instead of blocking the notebook indefinitely.

    Returns:
        The text of the parsed page with all ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newline/tab
        characters replaced by a single space.

    Raises:
        requests.exceptions.RequestException: If the connection fails, the
            request times out, or the server answers with an HTTP error
            status (4xx/5xx).
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page as if
    # it were the requested article.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the readable page text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch a nytimes.com article, run its text through the spaCy pipeline and
# show how many named entities were found.
# NOTE(review): `nlp` is not created in this cell; it must already exist in
# the kernel from an earlier cell.
NYT_ARTICLE_URL = 'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news'
ny_bb = url_to_string(NYT_ARTICLE_URL)
article = nlp(ny_bb)
len(article.ents)

ConnectionError: HTTPSConnectionPool(host='www.nytimes.com', port=443): Max retries exceeded with url: /2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000192FA8197B8>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))