In [1]:
import pandas as pd
import spacy 
from spacy import displacy

In [2]:
# spaCy v3.0 introduced transformer-based pipelines that bring spaCy's accuracy up to the current state of the art.
# Show the installed spaCy version.
spacy.__version__

'3.3.1'

## SpaCy

#### SpaCy Models:  
- en_core_web_sm: English multi-task CNN trained on OntoNotes. Size – 11 MB
- en_core_web_md: English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Size – 91 MB
- en_core_web_lg: English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Size – 789 MB
- en_core_web_trf: English transformer pipeline (roberta-base). Components: transformer, tagger, parser, ner, attribute_ruler, lemmatizer.  Size - 438 MB
- **en_core_web_trf** is available only in spaCy 3.x

In [3]:
# One-time setup: uncomment the line for each model that still needs to be downloaded.
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_trf

In [4]:
# Load SpaCy model
# Exactly one pipeline is bound to `nlp`; the commented-out lines are the alternative models.
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

In [5]:
# Check which components are active in the loaded pipeline (their names are shown as the cell output)
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

### Spacy for POS Tagging

In [6]:
# Sample news paragraph (Caterpillar earnings outlook) used for the POS-tagging examples.
text = '''Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity.'''

# Run the loaded pipeline over the text and keep the result in `doc`.
doc = nlp(text)

In [7]:
# Text held by the processed document (`doc` is the result of nlp(text), not the nlp object itself)
doc.text

"Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity."

In [8]:
# Parse the current sample text with the loaded pipeline.
doc = nlp(text)

# One list per token attribute; every list is aligned by token position.
words = [tok.text for tok in doc]
lemma = [tok.lemma_ for tok in doc]
pos = [tok.pos_ for tok in doc]
pos_tags = [tok.tag_ for tok in doc]
dep = [tok.dep_ for tok in doc]
shape = [tok.shape_ for tok in doc]
alpha = [tok.is_alpha for tok in doc]
numeric = [tok.like_num for tok in doc]
stopword = [tok.is_stop for tok in doc]

# Assemble the per-token table; the key order fixes the column order.
df = pd.DataFrame(
    {
        'Words': words,
        'Lemma': lemma,
        'POS_Tags': pos_tags,
        'POS': pos,
        'Dependency': dep,
        'Shape': shape,
        'Alpha': alpha,
        'Numeric': numeric,
        'Stopword': stopword,
    }
)

df

Unnamed: 0,Words,Lemma,POS_Tags,POS,Dependency,Shape,Alpha,Numeric,Stopword
0,Surging,surge,VBG,VERB,amod,Xxxxx,True,False,False
1,Chinese,chinese,JJ,ADJ,amod,Xxxxx,True,False,False
2,demand,demand,NN,NOUN,nsubj,xxxx,True,False,False
3,and,and,CC,CCONJ,cc,xxx,True,False,True
4,an,an,DT,DET,det,xx,True,False,True
...,...,...,...,...,...,...,...,...,...
153,for,for,IN,ADP,prep,xxx,True,False,True
154,overall,overall,JJ,ADJ,amod,xxxx,True,False,False
155,economic,economic,JJ,ADJ,amod,xxxx,True,False,False
156,activity,activity,NN,NOUN,pobj,xxxx,True,False,False


In [9]:
# Look up the human-readable description of the "VBG" tag.
spacy.explain("VBG")

'verb, gerund or present participle'

#### Using POS tagging to identify numbers and context

In [10]:
# Sample market summary containing a price and several percentages.
text = '''In the latest trading session, Caterpillar (CAT) closed at $113.22, marking a +1.52% move from the previous day. This change outpaced the S&P 500's 0.58% gain on the day. At the same time, the Dow added 0.14%, and the tech-heavy Nasdaq gained 1.66%.
Heading into today, shares of the construction equipment company had gained 11.4% over the past month, outpacing the Industrial Products sector's loss of 2.53% and the S&P 500's gain of 3.15% in that time.'''

# Parse the new text; `doc` now refers to this document.
doc = nlp(text)

In [11]:
def _token_text_at(doc, index):
    """Return the text of ``doc[index]``, or ``None`` when ``index`` is outside the doc.

    The lower bound is checked as well as the upper one: a negative index
    would not fail, it would wrap around and return a token from the end of
    the document, which is not a real neighbour.
    """
    if 0 <= index < len(doc):
        return doc[index].text
    return None


# Print every number-like token with up to two tokens of context on each side.
# A number is reported as a percentage when "%" is one of the two tokens after it.
for token in doc:
    # Skip tokens that do not resemble a number
    if not token.like_num:
        continue

    before = [_token_text_at(doc, token.i - 2), _token_text_at(doc, token.i - 1)]
    after = [_token_text_at(doc, token.i + 1), _token_text_at(doc, token.i + 2)]

    label = "Percentage:::" if "%" in after else "Numbers:::"

    # Neighbours that fall outside the document are left out of the printed context,
    # so a number at the very start or end of the text no longer wraps or raises IndexError.
    context = [piece for piece in before + [token.text] + after if piece is not None]
    print(label, *context)

Numbers::: at $ 113.22 , marking
Percentage::: marking a +1.52 % move
Numbers::: the S&P 500 's 0.58
Percentage::: 500 's 0.58 % gain
Percentage::: Dow added 0.14 % ,
Percentage::: Nasdaq gained 1.66 % .
Percentage::: had gained 11.4 % over
Percentage::: loss of 2.53 % and
Numbers::: the S&P 500 's gain
Percentage::: gain of 3.15 % in


### NER with SpaCy

In [12]:
# Caterpillar news paragraph used as the input for the NER example.
text = '''Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity.'''

In [13]:
# Show the raw input text before extracting entities from it.
print(text)

Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity.


In [14]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

df.head(20)

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,Chinese,NORP,8,15
1,U.S.,GPE,40,44
2,Caterpillar,ORG,74,85
3,Latin America,LOC,182,195
4,Europe,LOC,200,206
5,2018,DATE,254,258
6,Caterpillar,ORG,302,313
7,this year,DATE,510,519
8,This week,DATE,521,530
9,the International Monetary Fund,ORG,532,563


In [15]:
# Look up the description of the "FAC" entity label.
spacy.explain("FAC")

'Buildings, airports, highways, bridges, etc.'

In [16]:
# Look up the description of the "NORP" entity label.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [17]:
# Look up the description of the "GPE" entity label.
spacy.explain("GPE")

'Countries, cities, states'

In [18]:
# Keep only the entity rows labelled ORG.
df.loc[df['Labels'] == 'ORG']

Unnamed: 0,Entities,Labels,Position_Start,Position_End
2,Caterpillar,ORG,74,85
6,Caterpillar,ORG,302,313
9,the International Monetary Fund,ORG,532,563
12,Caterpillar,ORG,639,650


In [19]:
# Keep only the entity rows labelled PERSON.
df.loc[df['Labels'] == 'PERSON']

Unnamed: 0,Entities,Labels,Position_Start,Position_End
14,Larry De Maria,PERSON,806,820
15,William Blair & Co.,PERSON,836,855


In [20]:
# Keep only the entity rows labelled GPE.
df.loc[df['Labels'] == 'GPE']

Unnamed: 0,Entities,Labels,Position_Start,Position_End
1,U.S.,GPE,40,44


#### Using Spacy Small

In [21]:
# Load SpaCy model
# Bind `nlp` to the small English pipeline for the cells in this section.
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

In [22]:
# Short test sentence containing a personal name ("Sara") and a company name ("Apple Corporation").
text = '''Sara's work efforts destroyed Apple Corporation's annual sales single handedly'''

In [23]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

# Show the input sentence followed by the entities found in it.
print(text)
display(df.head(20))

Sara's work efforts destroyed Apple Corporation's annual sales single handedly


Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,Apple Corporation's,ORG,30,49
1,annual,DATE,50,56


In [24]:
# Test passage with a building name, place names, a street address, a year and measurements.
text = '''The NBC Tower is an office tower on the Near North Side of Chicago, Illinois located at 454 North Columbus Drive in downtown Chicago's Magnificent Mile area. Completed in 1989, the 37-story building reaches a height of 627 feet'''

In [25]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

# Show the input passage followed by the entities found in it.
print(text)
display(df.head(20))

The NBC Tower is an office tower on the Near North Side of Chicago, Illinois located at 454 North Columbus Drive in downtown Chicago's Magnificent Mile area. Completed in 1989, the 37-story building reaches a height of 627 feet


Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,The NBC Tower,ORG,0,13
1,the Near North Side,FAC,36,55
2,Chicago,GPE,59,66
3,Illinois,GPE,68,76
4,454,CARDINAL,88,91
5,North Columbus Drive,LOC,92,112
6,Chicago,GPE,125,132
7,Magnificent Mile,LOC,135,151
8,1989,DATE,171,175
9,37,CARDINAL,181,183


#### Using Spacy Medium

In [26]:
# Load SpaCy model
# Rebind `nlp` to the medium English pipeline for the cells in this section.
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

In [27]:
# Short test sentence containing a personal name ("Sara") and a company name ("Apple Corporation").
text = '''Sara's work efforts destroyed Apple Corporation's annual sales single handedly'''

In [28]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

# Show the input sentence followed by the entities found in it.
print(text)
display(df.head(20))

Sara's work efforts destroyed Apple Corporation's annual sales single handedly


Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,Sara,GPE,0,4
1,Apple Corporation's,ORG,30,49
2,annual,DATE,50,56


In [29]:
# Test passage with a building name, place names, a street address, a year and measurements.
text = '''The NBC Tower is an office tower on the Near North Side of Chicago, Illinois located at 454 North Columbus Drive in downtown Chicago's Magnificent Mile area. Completed in 1989, the 37-story building reaches a height of 627 feet'''

In [30]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

# Show the input passage followed by the entities found in it.
print(text)
display(df.head(20))

The NBC Tower is an office tower on the Near North Side of Chicago, Illinois located at 454 North Columbus Drive in downtown Chicago's Magnificent Mile area. Completed in 1989, the 37-story building reaches a height of 627 feet


Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,The NBC Tower,ORG,0,13
1,the Near North Side,LOC,36,55
2,Chicago,GPE,59,66
3,Illinois,GPE,68,76
4,454,CARDINAL,88,91
5,Chicago,GPE,125,132
6,Magnificent Mile,EVENT,135,151
7,1989,DATE,171,175
8,37,CARDINAL,181,183
9,627 feet,QUANTITY,219,227


#### Using Spacy Large

In [31]:
# Load SpaCy model
# Rebind `nlp` to the large English pipeline for the cells in this section.
# nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

In [32]:
# Short test sentence containing a personal name ("Sara") and a company name ("Apple Corporation").
text = '''Sara's work efforts destroyed Apple Corporation's annual sales single handedly'''

In [33]:
# Show the input sentence before extracting entities from it.
print(text)

Sara's work efforts destroyed Apple Corporation's annual sales single handedly


In [34]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

df.head(20)

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,Sara,ORG,0,4
1,Apple Corporation's,ORG,30,49
2,annual,DATE,50,56


In [35]:
# Test passage with a building name, place names, a street address, a year and measurements.
text = '''The NBC Tower is an office tower on the Near North Side of Chicago, Illinois located at 454 North Columbus Drive in downtown Chicago's Magnificent Mile area. Completed in 1989, the 37-story building reaches a height of 627 feet'''

In [36]:
# Show the input passage before extracting entities from it.
print(text)

The NBC Tower is an office tower on the Near North Side of Chicago, Illinois located at 454 North Columbus Drive in downtown Chicago's Magnificent Mile area. Completed in 1989, the 37-story building reaches a height of 627 feet


In [37]:
# Parse the current text with the loaded pipeline.
doc = nlp(text)

# One list per entity attribute; the lists stay aligned by entity order.
entities = [span.text for span in doc.ents]
labels = [span.label_ for span in doc.ents]
position_start = [span.start_char for span in doc.ents]
position_end = [span.end_char for span in doc.ents]

# Entity table: surface text, label, and character offsets into `text`.
df = pd.DataFrame(
    {
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end,
    }
)

df.head(20)

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,The NBC Tower,ORG,0,13
1,the Near North Side,LOC,36,55
2,Chicago,GPE,59,66
3,Illinois,GPE,68,76
4,454,CARDINAL,88,91
5,North Columbus Drive,LOC,92,112
6,Chicago,GPE,125,132
7,Magnificent Mile,PRODUCT,135,151
8,1989,DATE,171,175
9,37,CARDINAL,181,183


### Visualizing spaCy Entities

In [38]:
# Render the entities of the current `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [39]:
import datetime
import pytz

# Timestamp of this run, expressed in the US/Central time zone.
central_tz = pytz.timezone('US/Central')
datetime.datetime.now(central_tz).strftime("%a, %d %B %Y %H:%M:%S")

'Sat, 29 October 2022 11:03:00'