In [4]:
#spacy is a NLP library similar to NLTK, but the difference is spacy is OOP tool, 
# while nltk is proper string processing library
# spacy is most efficient, nltk provides customization and many algos

In [5]:
 import spacy

In [6]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("Hii. I'm Dr. Yuvraj Singh.")
for sentence in doc.sents:
    print(sentence)

#printing sentences, sentence tokenization

Hii.
I'm Dr. Yuvraj Singh.


In [7]:
for sentence in doc.sents:
    for word in sentence:
        print(word)
        
#printing word from sentence, word tokenization

Hii
.
I
'm
Dr.
Yuvraj
Singh
.


In [8]:
import nltk

In [9]:
from nltk.tokenize import sent_tokenize
sent_tokenize("Hii. I'm Dr. Yuvraj Singh.")

['Hii.', "I'm Dr. Yuvraj Singh."]

In [10]:
type(nlp)

spacy.lang.en.English

In [11]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [12]:
doc=nlp(text)

In [13]:
urls=[]
for token in doc:
    if token.like_url:
        urls.append(token.text)
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [14]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc=nlp(transactions)
extracted_transactions = []

for token in doc:
    if token.like_num:
        next_token = token.nbor(1)
        if next_token and next_token.is_currency:
            extracted_transactions.append(f"{token} {next_token}")

print(extracted_transactions)

['two $', '500 €']


In [15]:
nlp=spacy.blank('en')
doc=nlp("Captin america ate 100$ of samosa. Then he said I can do this all day.")
for token in doc:
    print(token)

Captin
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [16]:
nlp.pipe_names  #pipelines are pre defined, or we need to create those
# for example 'en','hi','en_core_web_sm' etc are pre defined pipelines..

[]

In [17]:
# let's use en_core_web_sm this one
nlp=spacy.load('en_core_web_sm')
nlp.pipe_names

#it has this this components..

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [18]:
doc=nlp("Captin america ate 100$ of samosa. Then he said I can do this all day.")
for token in doc:
    print(token," | ",token.pos_," | ",token.lemma_)
    
#lemma gives u the base word, for ex: ate is a past tense, base one is eat, had -> have..

Captin  |  PROPN  |  Captin
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


In [19]:
#use of component ner, it basically recognizing the entities..
doc=nlp("Tesla is going to Acquire Twitter for whopping amount of 90 billions")
for ent in doc.ents:
    print(ent.text,"->",ent.label_,'->',spacy.explain(ent.label_))

Tesla -> ORG -> Companies, agencies, institutions, etc.
Acquire Twitter -> PERSON -> People, including fictional
90 billions -> MONEY -> Monetary values, including unit


In [20]:
from spacy import displacy
displacy.render(doc,style='ent')

#can even highlight the entities..

In [21]:
doc=nlp("Tommy Hilfiger founded his Company Tommy Hilfiger")
for ent in doc.ents:
    print(ent.text,"->",ent.label_,'->',spacy.explain(ent.label_))
displacy.render(doc,style='ent')

Tommy Hilfiger -> PERSON -> People, including fictional
Tommy Hilfiger -> PERSON -> People, including fictional


In [22]:
#Stemming and lemmatization in Spacy,
# here, stemming is not supported by spacy because lemmatization is more sophisticated and complex, so they wrapped up 
# all in lemmatization, while nltk has both components..
# in NLP, accuracy(lemmatization) >> acc(stemming), so spacy simply neglected less accurate component

In [23]:
import nltk
import spacy
from nltk.stem import PorterStemmer

In [24]:
stemmer=PorterStemmer()
words=['eating','reading','curable','ate','meeting','ability','durable','saw']
for word in words:
    print(word,"->",stemmer.stem(word))
    
#see, it is so less accurate, can't even recognize for 'ate', just using fixed rules..
#still some ppl use this coz of speed, no pre req. knowledge requirement, works on bunch of rules, can add value to NLP

eating -> eat
reading -> read
curable -> curabl
ate -> ate
meeting -> meet
ability -> abil
durable -> durabl
saw -> saw


In [25]:
nlp=spacy.load('en_core_web_sm')
doc=nlp("eating eats reading curable ate meeting ability durable saw came had")
for token in doc:
    print(token,"->",token.lemma_)

eating -> eat
eats -> eat
reading -> read
curable -> curable
ate -> ate
meeting -> meeting
ability -> ability
durable -> durable
saw -> saw
came -> come
had -> have


In [26]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
ar=nlp.get_pipe("attribute_ruler")
ar.add([[{'TEXT':"Bro"}],[{'TEXT':"Bruh"}],[{'TEXT':"Brahh"}]],{"LEMMA":"Brother"}) #adding rule..
doc=nlp("Bro Bruh Duhh Brother Brahh")
for token in doc:
    print(token,"->",token.lemma_)
    
#here bro, brahh, bruhh all are same, but this ,odel wont get it, still we can customize it as per our need

Bro -> Brother
Bruh -> Brother
Duhh -> Duhh
Brother -> Brother
Brahh -> Brother


In [28]:
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']
for word in lst_words:
    print(word,'->',stemmer.stem(word))

running -> run
painting -> paint
walking -> walk
dressing -> dress
likely -> like
children -> children
whom -> whom
good -> good
ate -> ate
fishing -> fish


In [29]:
doc = nlp("running painting walking dressing likely children who good ate fishing")
for token in doc:
    print(token,'->',token.lemma_)

running -> run
painting -> paint
walking -> walk
dressing -> dress
likely -> likely
children -> child
who -> who
good -> good
ate -> eat
fishing -> fishing


In [30]:
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a 
habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""
for word in lst_words:
    print(word)

running
painting
walking
dressing
likely
children
whom
good
ate
fishing


In [31]:
#Part of Speech in Spacy .pos_

doc=nlp("Elon flew to mars yesterday, he was so quick and also carried vadapav with him")
for token in doc:
    print(token,'->',token.pos_,'->',spacy.explain(token.pos_))
    
#with proper Exlanation..

Elon -> PROPN -> proper noun
flew -> VERB -> verb
to -> ADP -> adposition
mars -> NOUN -> noun
yesterday -> NOUN -> noun
, -> PUNCT -> punctuation
he -> PRON -> pronoun
was -> AUX -> auxiliary
so -> ADV -> adverb
quick -> ADJ -> adjective
and -> CCONJ -> coordinating conjunction
also -> ADV -> adverb
carried -> VERB -> verb
vadapav -> NOUN -> noun
with -> ADP -> adposition
him -> PRON -> pronoun


In [32]:
doc=nlp('''SEATTLE--(BUSINESS WIRE)-- Amazon.com, Inc. (NASDAQ: AMZN) today announced financial results for its second quarter ended June 30, 2023.

Net sales increased 11% to $134.4 billion in the second quarter, compared with $121.2 billion in second quarter 2022. Excluding the $0.3 billion unfavorable impact from year-over-year changes in foreign exchange rates throughout the quarter, net sales increased 11% compared with second quarter 2022.
North America segment sales increased 11% year-over-year to $82.5 billion.
International segment sales increased 10% year-over-year to $29.7 billion.
AWS segment sales increased 12% year-over-year to $22.1 billion.
Operating income increased to $7.7 billion in the second quarter, compared with $3.3 billion in second quarter 2022.
North America segment operating income was $3.2 billion, compared with an operating loss of $0.6 billion in second quarter 2022.
International segment operating loss was $0.9 billion, compared with an operating loss of $1.8 billion in second quarter 2022.
AWS segment operating income was $5.4 billion, compared with operating income of $5.7 billion in second quarter 2022.
Net income was $6.7 billion in the second quarter, or $0.65 per diluted share, compared with a net loss of $2.0 billion, or $0.20 per diluted share, in second quarter 2022.
Second quarter 2023 net income includes a pre-tax valuation gain of $0.2 billion included in non-operating expense from the common stock investment in Rivian Automotive, Inc., compared to a pre-tax valuation loss of $3.9 billion from the investment in second quarter 2022.''')

In [33]:
for token in doc:
    print(token,'->',token.pos_,'->',spacy.explain(token.pos_))

SEATTLE--(BUSINESS -> PROPN -> proper noun
WIRE)-- -> PROPN -> proper noun
Amazon.com -> PROPN -> proper noun
, -> PUNCT -> punctuation
Inc. -> PROPN -> proper noun
( -> PUNCT -> punctuation
NASDAQ -> PROPN -> proper noun
: -> PUNCT -> punctuation
AMZN -> PROPN -> proper noun
) -> PUNCT -> punctuation
today -> NOUN -> noun
announced -> VERB -> verb
financial -> ADJ -> adjective
results -> NOUN -> noun
for -> ADP -> adposition
its -> PRON -> pronoun
second -> ADJ -> adjective
quarter -> NOUN -> noun
ended -> VERB -> verb
June -> PROPN -> proper noun
30 -> NUM -> numeral
, -> PUNCT -> punctuation
2023 -> NUM -> numeral
. -> PUNCT -> punctuation


 -> SPACE -> space
Net -> ADJ -> adjective
sales -> NOUN -> noun
increased -> VERB -> verb
11 -> NUM -> numeral
% -> NOUN -> noun
to -> ADP -> adposition
$ -> SYM -> symbol
134.4 -> NUM -> numeral
billion -> NUM -> numeral
in -> ADP -> adposition
the -> DET -> determiner
second -> ADJ -> adjective
quarter -> NOUN -> noun
, -> PUNCT -> punctuatio

In [34]:
#in order to count how many noun, pronoun etc are present in doc..
c=doc.count_by(spacy.attrs.POS)
c

{96: 16,
 97: 37,
 92: 69,
 100: 21,
 84: 35,
 85: 43,
 95: 1,
 93: 54,
 103: 10,
 99: 20,
 90: 12,
 87: 4,
 89: 2}

In [35]:
for k,v in c.items():
    print(doc.vocab[k].text,'->',v)  #counts..

PROPN -> 16
PUNCT -> 37
NOUN -> 69
VERB -> 21
ADJ -> 35
ADP -> 43
PRON -> 1
NUM -> 54
SPACE -> 10
SYM -> 20
DET -> 12
AUX -> 4
CCONJ -> 2


In [36]:
doc=nlp('Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and ene')

In [37]:
print(doc)

Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.

The consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.

Removing volatile food and ene


In [38]:
noun=[]
for token in doc:
    if token.pos_ in ["NOUN"]:
        noun.append(token)
#         print(token,'->',token.pos_)
noun

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices,
 goods,
 services,
 %,
 year,
 estimate,
 %,
 gain,
 ease,
 Marchâ€,
 ™,
 peak,
 level,
 summer,
 food,
 ene]

In [39]:
num=[]
for token in doc:
    if token.pos_ in ["NUM"]:
        num.append(token)
num

[8.3, 8.1, 1982]

In [40]:
for k,v in c.items():
    print(doc.vocab[k].text,'->',v)  #counts..

PROPN -> 16
PUNCT -> 37
NOUN -> 69
VERB -> 21
ADJ -> 35
ADP -> 43
PRON -> 1
NUM -> 54
SPACE -> 10
SYM -> 20
DET -> 12
AUX -> 4
CCONJ -> 2


In [41]:
# how to Train a Spacy NER Model?

In [64]:
data = [
    ("Elon Musk is the founder of SpaceX.", {'entities': [(0, 10, 'PERSON'), (29, 34, 'ORG')]}),
    ("The Louvre Museum in Paris displays famous artworks.", 
     {'entities': [(4, 17, 'LOCATION'), (31, 46, 'ACTIVITY')]}),
    ("Amazon Web Services (AWS) offers cloud computing solutions.", 
     {'entities': [(0, 24, 'ORG'), (38, 55, 'PRODUCT')]}),
    ("Barack Obama served as the 44th President of the United States.", 
     {'entities': [(0, 12, 'PERSON'), (40, 58, 'POSITION'), (63, 78, 'LOCATION')]}),
    ("The iPhone 13, produced by Apple, is a popular smartphone.", 
     {'entities': [(4, 12, 'PRODUCT'), (28, 33, 'ORG'), (57, 67, 'PRODUCT')]}),
    ("Tokyo Disneyland, located in Urayasu, Japan, is a famous theme park.", 
     {'entities': [(0, 15, 'LOCATION'), (30, 37, 'LOCATION'), (49, 60, 'LOCATION'), (68, 78, 'ACTIVITY')]}),
    ("The Nobel Prize is awarded annually for outstanding achievements in various fields.", 
     {'entities': [(4, 15, 'EVENT'), (44, 61, 'ACTIVITY')]}),
    ("Harvard University, situated in Cambridge, Massachusetts, is a prestigious institution.", 
     {'entities': [(0, 17, 'ORG'), (30, 40, 'LOCATION'), (56, 73, 'DESIGNATION')]}),
    ("The Eiffel Tower in France attracts millions of tourists every year.", 
     {'entities': [(4, 15, 'LOCATION'), (19, 25, 'LOCATION'), (54, 64, 'ACTIVITY')]}),
    ("Microsoft Excel is widely used for spreadsheet calculations.", 
     {'entities': [(0, 16, 'PRODUCT'), (26, 42, 'ACTIVITY')]}),
    ("Albert Einstein, known for his theory of relativity, was a famous physicist.", 
     {'entities': [(0, 13, 'PERSON'), (42, 51, 'ACTIVITY')]}),
    ("The Statue of Liberty, located in New York Harbor, is a symbol of freedom.", 
     {'entities': [(4, 21, 'LOCATION'), (34, 50, 'LOCATION'), (59, 66, 'CONCEPT')]}),
    ("Google Maps is a popular navigation app for smartphones.", 
     {'entities': [(0, 12, 'PRODUCT'), (42, 52, 'PRODUCT')]}),
    ("The Berlin Wall, once dividing East and West Berlin, fell in 1989.", 
     {'entities': [(4, 15, 'LOCATION'), (38, 54, 'EVENT')]}),
    ("Michelle Obama, former First Lady of the United States, promotes education.", 
     {'entities': [(0, 14, 'PERSON'), (30, 51, 'POSITION'), (57, 67, 'ACTIVITY')]})
]

In [82]:
type(data)

list

In [65]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from spacy.tokens import DocBin  #used for serialization and deserialization

In [66]:
nlp = spacy.load("en_core_web_sm")
db = DocBin()  # Creating a DocBin object, converting doc file into binary format to easily save the file in disk

for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in annot.get('entities', []):  # Use .get() to provide a default empty list
        try:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                ents.append(span)
            else:
                print("Skipping entity with incorrect format:", start, end, label)
        except ValueError:
            print("Error processing entity:", start, end, label)

    ents=spacy.util.filter_spans(ents)
    doc.ents = ents
    db.add(doc)

os.chdir(r'C:\Users\KIIT\Documents\Python Scripts\spacy_v3 TrainingNER')
db.to_disk("./ner.spacy")


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 908.51it/s]

Skipping entity with incorrect format: 29 34 ORG
Skipping entity with incorrect format: 63 78 LOCATION
Skipping entity with incorrect format: 57 67 PRODUCT
Skipping entity with incorrect format: 68 78 ACTIVITY
Skipping entity with incorrect format: 44 61 ACTIVITY
Skipping entity with incorrect format: 30 40 LOCATION
Skipping entity with incorrect format: 19 25 LOCATION
Skipping entity with incorrect format: 42 51 ACTIVITY
Skipping entity with incorrect format: 42 52 PRODUCT
Skipping entity with incorrect format: 57 67 ACTIVITY





In [None]:
# nlp = spacy.load("en_core_web_sm") # load other spacy model

# db = DocBin() # create a DocBin object

# for text, annot in tqdm(TRAIN_DATA): # data in previous format
#     doc = nlp.make_doc(text) # create doc object from text
#     ents = []
#     for start, end, label in annot["entities"]: # add character indexes
#         span = doc.char_span(start, end, label=label, alignment_mode="contract")
#         if span is None:
#             print("Skipping entity")
#         else:
#             ents.append(span)
#     doc.ents = ents # label the text with the ents
#     db.add(doc)

In [67]:
#creating config file to training model
# can find more on www.spacy.io/usage/training#config, for more components like tagger, parser etc(custom training)


# download the config file and store it in the location above used

In [68]:
#now u have to fill the config base file, bcoz many of details reqrd by compenent(ner) might not be present

In [69]:
!python -m spacy init fill-config base_model.cfg config.cfg

# you ll find config.cfg file 
#now that particular config file got filled for training, now all the info will be stored in config.cfg file
#additional things got filled like, learn_rate, optimizer etc

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [70]:
!python -m spacy train config.cfg --output ./output --paths.train ./ner.spacy --paths.dev ./ner.spacy 

#model trained

[38;5;4m[i] Saving to output directory: output[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     52.68    0.00    0.00    0.00    0.00
100     200         64.46   1856.28  100.00  100.00  100.00    1.00
200     400          0.00      0.00  100.00  100.00  100.00    1.00
380     600          0.00      0.00  100.00  100.00  100.00    1.00
580     800          0.00      0.00  100.00  100.00  100.00    1.00
780    1000          0.00      0.00  100.00  100.00  100.00    1.00
980    1200          0.00      0.00  100.00  100.00  100.00    1.00
1180    1400         24.98     12.40  100.00  100.00  100.00    1.00
1380    1600         26.90      9.73  100.00  100.00  100.00    1.00
1580    1800         31.68      2.

In [71]:
best_model_path = "C:/Users/KIIT/Documents/Python Scripts/spacy_v3 TrainingNER/output/model-best" 
best_nlp = spacy.load(best_model_path)

In [76]:
text = "Yuvraj Singh went to America"
best_doc = best_nlp(text)
for ent in best_doc.ents:
    print(ent.text, "->", ent.label_, "->", spacy.explain(ent.label_))
displacy.render(best_doc,style='ent')

Yuvraj Singh -> PERSON -> People, including fictional
to America -> LOCATION -> None
