In [9]:
from tabulate import tabulate
import spacy
# Pipeline package to load; the resulting `nlp` object is what the next cell
# calls to parse the sample text.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [10]:
# Sample biography, written as adjacent string literals that Python joins into
# a single string. Each piece keeps its trailing space so words stay separated.
sample_text = (
  'Leonard Simon Nimoy was born on March 26, 1931, in an Irish section '
  'of West End of Boston, Massachusetts, to Jewish immigrants from Iziaslav, Ukraine. '
  'His mother, Dora (née Spinner; 1904–1987), was a homemaker, and his father, '
  'Max Nimoy (1901–1987), owned a barbershop in the Mattapan section of Boston. '
  'Leonard Simon Nimoy was an American actor, famed for playing Spock in the Star Trek '
  'franchise for almost 50 years.'
)

# Run the pipeline over the text; later cells read tokens, sentences and entities from `doc`.
doc = nlp(sample_text)

# Peek at the first 10 tokens, one per line.
for token in doc[:10]:
  print(token.text)

Leonard
Simon
Nimoy
was
born
on
March
26
,
1931


In [11]:
# doc.sents - an iterator over the sentences in the Doc object.
# Sentences are numbered from 1 via enumerate(start=1). The counter is named
# `sent_no` rather than `id` so the built-in id() is not overwritten in the
# kernel's global namespace for every cell that runs afterwards.
for sent_no, sent in enumerate(doc.sents, start=1):
  print(f'Sentence {sent_no}: {sent}')

Sentence 1: Leonard Simon Nimoy was born on March 26, 1931, in an Irish section of West End of Boston, Massachusetts, to Jewish immigrants from Iziaslav, Ukraine.
Sentence 2: His mother, Dora (née Spinner; 1904–1987), was a homemaker, and his father, Max Nimoy (1901–1987), owned a barbershop in the Mattapan section of Boston.
Sentence 3: Leonard Simon Nimoy was an American actor, famed for playing Spock in the Star Trek franchise for almost 50 years.


In [12]:
# One row per token in the Doc:
# (position, text, lemma, coarse POS tag, fine-grained tag, dependency label).
token_details = [
  (position, tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_)
  for position, tok in enumerate(doc)
]

In [13]:
# Column meanings:
#   LEMMA (token.lemma_) - the base form of the word, e.g. the lemma of "was" is "be".
#   POS   (token.pos_)   - the simple part-of-speech tag from the Universal POS (UPOS) tag set.
#   TAG   (token.tag_)   - the detailed part-of-speech tag from the Penn Treebank tag set.
#   DEP   (token.dep_)   - the syntactic dependency label describing the token's relation
#                          within its sentence.
column_names = ['ID', 'TEXT', 'LEMMA', 'POS', 'TAG', 'DEP']

# Only the first 25 rows are shown to keep the output short.
first_rows = token_details[:25]
print(tabulate(first_rows, headers=column_names))

  ID  TEXT           LEMMA          POS    TAG    DEP
----  -------------  -------------  -----  -----  ---------
   0  Leonard        Leonard        PROPN  NNP    compound
   1  Simon          Simon          PROPN  NNP    compound
   2  Nimoy          Nimoy          PROPN  NNP    nsubjpass
   3  was            be             AUX    VBD    auxpass
   4  born           bear           VERB   VBN    ROOT
   5  on             on             ADP    IN     prep
   6  March          March          PROPN  NNP    pobj
   7  26             26             NUM    CD     nummod
   8  ,              ,              PUNCT  ,      punct
   9  1931           1931           NUM    CD     nummod
  10  ,              ,              PUNCT  ,      punct
  11  in             in             ADP    IN     prep
  12  an             an             DET    DT     det
  13  Irish          irish          ADJ    JJ     amod
  14  section        section        NOUN   NN     pobj
  15  of             of             ADP 

In [15]:
from spacy import displacy

# Draw one dependency-parse diagram per sentence; 'distance' is passed through
# to displaCy as a rendering option.
dep_options = {'distance': 100}
for sentence in doc.sents:
  displacy.render(sentence, style="dep", jupyter=True, options=dep_options)

In [16]:
# Collect one row per named entity found in the Doc:
# ent.text       - the entity's text as it appears in the document.
# ent.start_char - character offset of the entity's first character, counted from the
#                  start of the whole document (not of its sentence): "Dora", which is
#                  in the second sentence, starts at 163 in the table below.
# ent.end_char   - character offset just past the entity's last character (exclusive):
#                  the 19-character "Leonard Simon Nimoy" spans 0 to 19.
# ent.label_     - the category, such as ORG, GPE (Geopolitical Entity), MONEY, etc.
ner_details = [
  (ent.text, ent.start_char, ent.end_char, ent.label_)
  for ent in doc.ents
]

In [17]:
import pandas as pd

# pandas is used here only to get a nicely rendered table of the entity rows;
# the DataFrame is the cell's last expression, so the notebook displays it.
ner_columns = ['TEXT', 'START', 'END', 'LABEL']
pd.DataFrame(ner_details, columns=ner_columns)

Unnamed: 0,TEXT,START,END,LABEL
0,Leonard Simon Nimoy,0,19,PERSON
1,"March 26, 1931",32,46,DATE
2,Irish,54,59,NORP
3,West End,71,79,GPE
4,Boston,83,89,GPE
5,Massachusetts,91,104,GPE
6,Jewish,109,115,NORP
7,Iziaslav,132,140,GPE
8,Ukraine,142,149,GPE
9,Dora,163,167,PERSON


In [18]:
from spacy import displacy

# Highlight the named entities inline in the full text using displaCy's 'ent' style.
displacy.render(
  doc,
  style='ent',
  jupyter=True,
)

In [20]:
# Ask spaCy for the human-readable description of the 'ORG' label and show it
# as the cell's result.
org_description = spacy.explain('ORG')
org_description

'Companies, agencies, institutions, etc.'