In [34]:
import spacy
from spacy import displacy
from IPython.display import display

In [35]:
# Start from a blank English pipeline (no trained components are attached),
# so calling it on text only splits the text into tokens.
nlp = spacy.blank("en")
sentence = "Wonder Woman ate a bunch of doughnuts, hah! Good for her, Captain Marvel remarked."
doc = nlp(sentence)
# One token per output line.
for tok in doc:
    print(tok)

Wonder
Woman
ate
a
bunch
of
doughnuts
,
hah
!
Good
for
her
,
Captain
Marvel
remarked
.


In [36]:
# Load the trained "en_core_web_sm" pipeline. This rebinds `nlp`, replacing the
# blank pipeline created in the earlier cell.
nlp = spacy.load("en_core_web_sm")

In [37]:
# Show the names of the components attached to the pipeline bound to `nlp`.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [38]:
# Show the (name, component object) pairs for the pipeline bound to `nlp`.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x10fc96d50>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x10fc972f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x110e649e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x10fef7490>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x10e76f950>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x110e64900>)]

### Parts of Language Processing Pipeline

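The pipeline loaded above consists of several components, including:
1. Tagger
2. Parser
3. NER

It also contains other components (`tok2vec`, `attribute_ruler`, `lemmatizer`), as listed by `nlp.pipe_names` above.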

In [54]:
## Part-of-speech tag and lemma for each token
doc = nlp("Wonder Woman ate a bunch of doughnuts, hah! Good for her, Captain Marvel remarked.")
divider = " | "
for tok in doc:
    # Columns: token, coarse POS tag, lemma. print() adds a space between
    # arguments, so each divider ends up padded on both sides.
    columns = (tok, divider, tok.pos_, divider, tok.lemma_)
    print(*columns)

Wonder  |  PROPN  |  Wonder
Woman  |  PROPN  |  Woman
ate  |  VERB  |  eat
a  |  DET  |  a
bunch  |  NOUN  |  bunch
of  |  ADP  |  of
doughnuts  |  NOUN  |  doughnut
,  |  PUNCT  |  ,
hah  |  INTJ  |  hah
!  |  PUNCT  |  !
Good  |  ADJ  |  good
for  |  ADP  |  for
her  |  PRON  |  she
,  |  PUNCT  |  ,
Captain  |  PROPN  |  Captain
Marvel  |  PROPN  |  Marvel
remarked  |  VERB  |  remark
.  |  PUNCT  |  .


In [55]:
## NER: print each detected entity, its label, and spaCy's description of that label
doc = nlp("Microsoft's Satya Nadella is looking to acquire OpenAI for $10 billion.")
divider = " | "
for entity in doc.ents:
    label = entity.label_
    # spacy.explain turns the short label (e.g. ORG) into a readable description.
    print(entity.text, divider, label, divider, spacy.explain(label))

Microsoft  |  ORG  |  Companies, agencies, institutions, etc.
Satya Nadella  |  PERSON  |  People, including fictional
OpenAI  |  ORG  |  Companies, agencies, institutions, etc.
$10 billion  |  MONEY  |  Monetary values, including unit


In [60]:
# Entities in a sentence where the same surname appears as a person and as part
# of a company name.
# NOTE(review): this cell's execution count (60) is higher than that of the two
# cells that follow it (58, 59), and those cells rebind `nlp` to an NER-only
# pipeline. The saved output may therefore come from that pipeline rather than
# the full en_core_web_sm one -- re-run top to bottom to confirm.
doc = nlp("Mr.Bloomberg founded Bloomberg Inc")
divider = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, divider, label, divider, spacy.explain(label))

Bloomberg  |  PERSON  |  People, including fictional
Bloomberg Inc  |  ORG  |  Companies, agencies, institutions, etc.


In [58]:
## Build a custom pipeline: start from a blank English pipeline and add only the
## "ner" component, sourced from the trained en_core_web_sm pipeline.
nlp = spacy.blank("en")
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)
# Display the component names of the new pipeline.
nlp.pipe_names

['ner']

In [59]:
# Run the Bloomberg sentence through the pipeline that `nlp` was bound to in the
# previous cell (blank English plus the sourced "ner" component).
doc = nlp("Mr.Bloomberg founded Bloomberg Inc")
divider = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, divider, label, divider, spacy.explain(label))

Bloomberg  |  PERSON  |  People, including fictional
Bloomberg Inc  |  ORG  |  Companies, agencies, institutions, etc.
