In [4]:
## https://spacy.io/usage

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [None]:
# A lot of wikipedia data gets included in Wikipedia results

In [46]:
doc = nlp('''Cricket has been the most popular sport in India, it is played almost everywhere in the country[4][unreliable source?] and a prominent part of the country. The Board of Control for Cricket in India (BCCI) is the governing body of Indian cricket and conduct all domestic tournaments and select the players for India national cricket team and India women's national cricket team.

Domestic competitions in India annually organized by BCCI include the Ranji Trophy, the Duleep Trophy, the Vijay Hazare Trophy, the Deodhar Trophy, the Irani Trophy and the NKP Salve Challenger Trophy. The Indian Premier League, a Twenty20 tournament where various city-based franchises compete in a style similar to American football, is one of the biggest sporting leagues and the biggest cricketing league in the world. In 2023 it launched a similar league for females, the Women's Premier League (WPL).

International cricket in India does not follow a consistent pattern, unlike other cricketing teams such as England, who tour other countries during the winter and play at home during the summer. The Indian cricket team is one of the most successful cricket teams in the world, having won 2 ICC World Cups, 1 ICC World Twenty20, 2 ICC Champion's Trophies and finished runners up in the inaugural edition of the ICC World Test Championship. The 2021 ICC Men's T20 World Cup, was initially meant to be hosted by India. However, after the escalation of the COVID-19 Pandemic in India, the tournament was moved by the ICC to the United Arab Emirates. The 2023 Cricket World Cup will be hosted by India''')

In [8]:
for token in doc[0:20]:
    print(token)
    
# Seperates Punctuation Marks

Cricket
has
been
the
most
popular
sport
in
India
,
it
is
played
almost
everywhere
in
the
country[4][unreliable
source
?


In [9]:
## Sentence Boundary Detection

In [11]:
for x,sent in enumerate(doc.sents,start=1):
    print(x,sent)

1 Cricket has been the most popular sport in India, it is played almost everywhere in the country[4][unreliable source?] and a prominent part of the country.
2 The Board of Control for Cricket in India (BCCI) is the governing body of Indian cricket and conduct all domestic tournaments and select the players for India national cricket team and India women's national cricket team.


In [None]:
# Tokens have metadata

In [12]:
sentence1 = list(doc.sents)[0]

In [13]:
sentence1

Cricket has been the most popular sport in India, it is played almost everywhere in the country[4][unreliable source?] and a prominent part of the country.

In [23]:
token_meta = sentence1[8]

In [24]:
token_meta

India

In [16]:
token_meta.text

'Cricket'

In [25]:
token_meta.left_edge

India

In [26]:
token_meta.right_edge

India

In [27]:
token_meta.ent_type_
# GeoPoliticalEntity

'GPE'

In [29]:
token_meta.ent_iob_
# B - Beggining
# I - Inside

'B'

In [30]:
token_meta.lemma_
# Verb

'India'

In [31]:
token_meta.morph

Number=Sing

In [32]:
token_meta.pos_

'PROPN'

In [33]:
token_meta.dep_

'pobj'

In [34]:
token_meta.lang_

'en'

In [35]:
## Part of Speech Tagging

In [36]:
text ="Mike enjoys playing football."

doc2 = nlp(text)


In [38]:
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Mike PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj
. PUNCT punct


In [40]:
from spacy import displacy
displacy.render(doc2,style="dep")

In [41]:
# Named Entity Recognition

In [47]:
for ent in doc.ents:
    print(ent.text,ent.label_)

India GPE
The Board of Control for Cricket ORG
India GPE
BCCI ORG
Indian NORP
India GPE
India GPE
India GPE
annually DATE
BCCI ORG
the Ranji Trophy ORG
the Duleep Trophy ORG
the Vijay Hazare Trophy ORG
the Deodhar Trophy ORG
the Irani Trophy ORG
Trophy PERSON
Indian NORP
Twenty20 PERSON
American NORP
2023 DATE
WPL ORG
India GPE
England GPE
the summer DATE
Indian NORP
2 CARDINAL
World Cups ORG
1 CARDINAL
the ICC World Test Championship ORG
2021 DATE
World Cup EVENT
India GPE
India GPE
ICC ORG
the United Arab Emirates GPE
2023 DATE
Cricket World Cup EVENT
India GPE


In [48]:
displacy.render(doc,style="ent")

In [51]:
# Medium Model has Word Vectors
# Numerical Representation of Words in the form of vectors

In [49]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [54]:
sentence1.vector

array([ 0.35890368, -0.16271242,  0.20912889, -0.20919901, -0.06066184,
        0.03035078,  0.27560207, -0.06766222,  0.0935057 ,  0.18896103,
        0.01711656,  0.06415585, -0.1723149 ,  0.14886455, -0.21558268,
       -0.14486037, -0.06156819,  0.02636736, -0.1098373 ,  0.0296547 ,
       -0.4440996 ,  0.11933516,  0.00134941, -0.339413  ,  0.00200109,
       -0.30598047,  0.05316748, -0.0790891 ,  0.24255942,  0.32229388,
       -0.01376798,  0.118565  ,  0.36762503, -0.21615063,  0.0323898 ,
       -0.17759195, -0.00795953,  0.05055583, -0.2302437 ,  0.06148341,
       -0.22314955,  0.293635  , -0.27913988,  0.18758985,  0.08979087,
        0.22825968, -0.18438338,  0.17707743,  0.26175582, -0.01007985,
       -0.40695184,  0.12417703, -0.11023957,  0.13562833,  0.02028113,
        0.09974566, -0.2321743 ,  0.02460547,  0.10044649, -0.20556335,
        0.03435546, -0.5714882 ,  0.05441915, -0.173794  ,  0.00724508,
        0.00816868,  0.1330781 , -0.42415947,  0.35861012, -0.02

In [93]:
nlp = spacy.load("en_core_web_md")

In [94]:
doc = nlp('''Cricket has been the most popular sport in India, it is played almost everywhere in the country[4][unreliable source?] and a prominent part of the country. The Board of Control for Cricket in India (BCCI) is the governing body of Indian cricket and conduct all domestic tournaments and select the players for India national cricket team and India women's national cricket team.

Domestic competitions in India annually organized by BCCI include the Ranji Trophy, the Duleep Trophy, the Vijay Hazare Trophy, the Deodhar Trophy, the Irani Trophy and the NKP Salve Challenger Trophy. The Indian Premier League, a Twenty20 tournament where various city-based franchises compete in a style similar to American football, is one of the biggest sporting leagues and the biggest cricketing league in the world. In 2023 it launched a similar league for females, the Women's Premier League (WPL).

International cricket in India does not follow a consistent pattern, unlike other cricketing teams such as England, who tour other countries during the winter and play at home during the summer. The Indian cricket team is one of the most successful cricket teams in the world, having won 2 ICC World Cups, 1 ICC World Twenty20, 2 ICC Champion's Trophies and finished runners up in the inaugural edition of the ICC World Test Championship. The 2021 ICC Men's T20 World Cup, was initially meant to be hosted by India. However, after the escalation of the COVID-19 Pandemic in India, the tournament was moved by the ICC to the United Arab Emirates. The 2023 Cricket World Cup will be hosted by India''')

In [95]:
sentence2 = list(doc.sents)[0]

In [84]:
print(sentence2)

Cricket has been the most popular sport in India, it is played almost everywhere in the country[4][unreliable source?] and a prominent part of the country.


In [85]:
# How the Word County

In [86]:
import numpy as np

In [113]:
your_word = "breakfast"

In [115]:


ms = nlp.vocab.vectors.most_similar(np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]),n=20)
ms

(array([[15981607639113716751,  8869128298214175138, 17092812801561910730,
          6119697410033091711, 11752932193150848035,  8833592706648090133,
          8640763535512703744,  1012341554490024974, 15650559750608259320,
         13556672019339779060, 16252485431428339663,  2306090229986738985,
         10981249691051785531, 14404572306436076204, 17502178374224329708,
         13277673080900090288, 13071974745762384372, 11549930039359342423,
         12399415755182795362, 18111588618986575719]], dtype=uint64),
 array([[ 3317,  2888,  3319,  1837, 17127, 10614,  4624,  9333, 10502,
         12860, 17084,  8708, 17339,  8651,  3437, 15608,  4530,  1632,
         14974,  5069]], dtype=int32),
 array([[1.    , 0.7939, 0.7588, 0.747 , 0.7264, 0.7257, 0.7138, 0.708 ,
         0.6978, 0.695 , 0.6829, 0.6695, 0.6462, 0.6375, 0.6349, 0.6293,
         0.6249, 0.6223, 0.6179, 0.611 ]], dtype=float32))

In [88]:
nlp.vocab.strings[your_word]

12290671265767728302

In [89]:
nlp.vocab.vectors[nlp.vocab.strings[your_word]][0]

-0.41393

In [90]:
np.asarray(nlp.vocab.vectors[nlp.vocab.strings[your_word]][0])

array(-0.41393, dtype=float32)

In [91]:
for w in ms:
    print(ms)

(array([[12389239844680878404,  1435501296278296988,  3205366385982613224,
        10101261077591962824, 10067128433980916117, 13467190378500458811,
         7523086094447079607,  4411440909759659592,  3830018849180425586,
          769100778973147158]], dtype=uint64), array([[  351,  1831,   919,  8453,  4341, 10117,  1955, 14035,   984,
        17926]], dtype=int32), array([[1.    , 0.8009, 0.7833, 0.773 , 0.7341, 0.712 , 0.6996, 0.6951,
        0.6934, 0.6924]], dtype=float32))
(array([[12389239844680878404,  1435501296278296988,  3205366385982613224,
        10101261077591962824, 10067128433980916117, 13467190378500458811,
         7523086094447079607,  4411440909759659592,  3830018849180425586,
          769100778973147158]], dtype=uint64), array([[  351,  1831,   919,  8453,  4341, 10117,  1955, 14035,   984,
        17926]], dtype=int32), array([[1.    , 0.8009, 0.7833, 0.773 , 0.7341, 0.712 , 0.6996, 0.6951,
        0.6934, 0.6924]], dtype=float32))
(array([[1238923984468087840

In [116]:
words = [nlp.vocab.strings[w] for w in ms[0][0]]
   
print(words)

['breakfasting', 'lunchmeat', 'Ameal', 'dinner-', 'ALOX5', 'Hais', 'mealtimes', 'kanack', 'Krunchies', 'Quiches', 'scuppers', 'APHC', 'ragamuffins', 'chaffle', 'delitto', 'bouchons', 'Vrinks', 'cofferdams', 'Suppers', 'dishrag']


## Document Similarity
It find similarities using Word Embeddings

In [118]:
doc1 = nlp("I like salty food")
doc2 = nlp("I like sugar in food")

In [119]:
print(doc1,"<-->",doc2,doc1.similarity(doc2))

I like salty food <--> I like sugar in food 0.8287711390180229


In [121]:
doc3 = nlp("It is an awesome day right now")

In [122]:
print(doc1,"<-->",doc2,doc1.similarity(doc3))

I like salty food <--> I like sugar in food 0.3466165069475381


In [123]:
# Apples, Oranges and Burgers are different

In [130]:
doc6=nlp("rice")
doc7=nlp("dal")

In [131]:
print(doc6,"<-->",doc7,doc6.similarity(doc7))

rice <--> dal 0.6585074001019197


---
# Pipeline
## Spacy Pipeline for NER

In [137]:
nlp = spacy.blank("en")

In [139]:
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fd16374a640>

In [140]:
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},
  'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}

In [135]:
nlp2 = spacy.load("en_core_web_sm")

In [136]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

---
## Entity Ruler
- Use when we need a label to be associated
- Rules based approach
- Extracting Text or formations of text
- Rules should have a very high level of True Positives
- Once the Entity identifies something, it will not change unless you override
- Rule must be created an overridden

In [177]:
nlp = spacy.load("en_core_web_sm")

In [164]:
text = "Wen Chesterter is a place that Sunil likes. He lives in the West of Dubai"

In [183]:
doc = nlp(text)

In [184]:
for ent in doc.ents:
    print(ent.text,ent.label_)

Wen Chesterter GPE
Sunil GPE
West LOC
Dubai GPE


In [178]:
ruler = nlp.add_pipe("entity_ruler",before="ner")

In [180]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [181]:
patterns = [{"label":"GPE","pattern":"Wen Chesterter"}]

In [182]:
ruler.add_patterns(patterns)

In [185]:
## Toponym - The issue of the same words for different context
## Eg: Paris

---

## Matcher
- Uses Linguistic Features to extract information
- Will store the information in the vocab of the nlp model
- Not necessary an Entity Type

https://spacy.io/api/matcher#_title

In [187]:
from spacy.matcher import Matcher

In [238]:
nlp = spacy.load("en_core_web_sm")

In [239]:
matcher = Matcher(nlp.vocab)
pattern1 = [{"LIKE_EMAIL":True}]
pattern2 = [{"IS_DIGIT":True}]
matcher.add("EMAIL_ADDRESS",[pattern1])
matcher.add("DIGITS",[pattern2])

In [240]:
doc = nlp("this is an amaxing sunil@gmail.com 123")
matches = matcher(doc)

In [241]:
print(matches)
# Lexeme, Start Token, End Token

[(16571425990740197027, 4, 5), (8243381992774708034, 5, 6)]


In [242]:
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [249]:
# Text from 4th to 5th Position
# and text from 5th to 6th Position
for match in matches:
    print(doc[match[1]:match[2]])

sunil@gmail.com
123


In [231]:
print(nlp.vocab[matches[0][1]].text)

IS_LOWER


### Real World Case

In [232]:
nlp = spacy.load("en_core_web_sm")

In [234]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN"}]
matcher.add("Proper_Noun",[pattern])

doc = nlp('''Narendra Damodardas Modi (Gujarati: [ˈnəɾendɾə dɑmodəɾˈdɑs ˈmodiː] (listen); born 17 September 1950)[b] is an Indian politician who has served as the 14th Prime Minister of India since May 2014. Modi was the Chief Minister of Gujarat from 2001 to 2014 and is the Member of Parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister from outside the Indian National Congress.''')

matches = matcher(doc)

print(len(matches))

for match in matches:
    print(match,doc[match[1]:match[2]])

29


(1997218022679078014, 0, 1) Narendra
(1997218022679078014, 1, 2) Damodardas
(1997218022679078014, 2, 3) Modi
(1997218022679078014, 4, 5) Gujarati
(1997218022679078014, 17, 18) September
(1997218022679078014, 30, 31) Prime
(1997218022679078014, 31, 32) Minister
(1997218022679078014, 33, 34) India
(1997218022679078014, 35, 36) May
(1997218022679078014, 38, 39) Modi
(1997218022679078014, 41, 42) Chief
(1997218022679078014, 42, 43) Minister
(1997218022679078014, 44, 45) Gujarat
(1997218022679078014, 52, 53) Member
(1997218022679078014, 54, 55) Parliament
(1997218022679078014, 56, 57) MP
(1997218022679078014, 59, 60) Varanasi
(1997218022679078014, 67, 68) Bharatiya
(1997218022679078014, 68, 69) Janata
(1997218022679078014, 69, 70) Party
(1997218022679078014, 71, 72) BJP
(1997218022679078014, 76, 77) Rashtriya
(1997218022679078014, 77, 78) Swayamsevak
(1997218022679078014, 78, 79) Sangh
(1997218022679078014, 80, 81) RSS
(1997218022679078014, 87, 88) Hindu
(1997218022679078014, 104, 105) Indi

In [250]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN","OP":"+"}]
# Proper Noun that occurs one or more times
matcher.add("Proper_Noun",[pattern])

doc = nlp('''Narendra Damodardas Modi (Gujarati: [ˈnəɾendɾə dɑmodəɾˈdɑs ˈmodiː] (listen); born 17 September 1950)[b] is an Indian politician who has served as the 14th Prime Minister of India since May 2014. Modi was the Chief Minister of Gujarat from 2001 to 2014 and is the Member of Parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister from outside the Indian National Congress.''')

matches = matcher(doc)

print(len(matches))

for match in matches:
    print(match,doc[match[1]:match[2]])

43
(1997218022679078014, 0, 1) Narendra
(1997218022679078014, 0, 2) Narendra Damodardas
(1997218022679078014, 1, 2) Damodardas
(1997218022679078014, 0, 3) Narendra Damodardas Modi
(1997218022679078014, 1, 3) Damodardas Modi
(1997218022679078014, 2, 3) Modi
(1997218022679078014, 4, 5) Gujarati
(1997218022679078014, 17, 18) September
(1997218022679078014, 30, 31) Prime
(1997218022679078014, 30, 32) Prime Minister
(1997218022679078014, 31, 32) Minister
(1997218022679078014, 33, 34) India
(1997218022679078014, 35, 36) May
(1997218022679078014, 38, 39) Modi
(1997218022679078014, 41, 42) Chief
(1997218022679078014, 41, 43) Chief Minister
(1997218022679078014, 42, 43) Minister
(1997218022679078014, 44, 45) Gujarat
(1997218022679078014, 52, 53) Member
(1997218022679078014, 54, 55) Parliament
(1997218022679078014, 56, 57) MP
(1997218022679078014, 59, 60) Varanasi
(1997218022679078014, 67, 68) Bharatiya
(1997218022679078014, 67, 69) Bharatiya Janata
(1997218022679078014, 68, 69) Janata
(19972180

In [252]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN","OP":"+"}]
# Proper Noun that occurs one or more times
matcher.add("Proper_Noun",[pattern],greedy="LONGEST")
# We added a Gree

doc = nlp('''Narendra Damodardas Modi (Gujarati: [ˈnəɾendɾə dɑmodəɾˈdɑs ˈmodiː] (listen); born 17 September 1950)[b] is an Indian politician who has served as the 14th Prime Minister of India since May 2014. Modi was the Chief Minister of Gujarat from 2001 to 2014 and is the Member of Parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister from outside the Indian National Congress.''')

matches = matcher(doc)

print(len(matches))

# Adding a sorting
matches.sort(key = lambda x: x[1])

for match in matches:
    print(match,doc[match[1]:match[2]])

19
(1997218022679078014, 0, 3) Narendra Damodardas Modi
(1997218022679078014, 4, 5) Gujarati
(1997218022679078014, 17, 18) September
(1997218022679078014, 30, 32) Prime Minister
(1997218022679078014, 33, 34) India
(1997218022679078014, 35, 36) May
(1997218022679078014, 38, 39) Modi
(1997218022679078014, 41, 43) Chief Minister
(1997218022679078014, 44, 45) Gujarat
(1997218022679078014, 52, 53) Member
(1997218022679078014, 54, 55) Parliament
(1997218022679078014, 56, 57) MP
(1997218022679078014, 59, 60) Varanasi
(1997218022679078014, 67, 70) Bharatiya Janata Party
(1997218022679078014, 71, 72) BJP
(1997218022679078014, 76, 79) Rashtriya Swayamsevak Sangh
(1997218022679078014, 80, 81) RSS
(1997218022679078014, 87, 88) Hindu
(1997218022679078014, 104, 107) Indian National Congress


In [253]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN","OP":"+"},{'POS':'VERB'}]
# Proper Noun that occurs one or more times
matcher.add("Proper_Noun",[pattern],greedy="LONGEST")
# We added a Gree

doc = nlp('''Narendra Damodardas Modi (Gujarati: [ˈnəɾendɾə dɑmodəɾˈdɑs ˈmodiː] (listen); born 17 September 1950)[b] is an Indian politician who has served as the 14th Prime Minister of India since May 2014. Modi was the Chief Minister of Gujarat from 2001 to 2014 and is the Member of Parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister from outside the Indian National Congress.''')

matches = matcher(doc)

print(len(matches))

# Adding a sorting
matches.sort(key = lambda x: x[1])

for match in matches:
    print(match,doc[match[1]:match[2]])

0
