# spaCy

In [None]:
import spacy

## Sentence Tokenization

In [None]:
# Load the small English pipeline and run it over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")
doc = nlp("My name is Mr. Waqas Ahmed. I love doing coding and hanging out with my friends.")

# doc.sents yields one span per detected sentence; show each on its own line.
print(*doc.sents, sep="\n")

My name is Mr. Waqas Ahmed.
I love doing coding and hanging out with my friends.


## Word Tokenization

In [None]:
# Flatten sentence -> token and print every token on its own line, in document order.
print(*(word for sentence in doc.sents for word in sentence), sep="\n")

My
name
is
Mr.
Waqas
Ahmed
.
I
love
doing
coding
and
hanging
out
with
my
friends
.


# NLTK

In [None]:
import nltk

# Tokenizer data used by sent_tokenize / word_tokenize.
# Older NLTK releases load the pickled "punkt" models, while NLTK >= 3.8.2
# looks up the "punkt_tab" resource instead, so fetch both to keep the
# tokenization cells working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Sentence Tokenization

In [None]:
from nltk.tokenize import sent_tokenize

# Split the sample paragraph into a list of sentence strings.
sample_text = "My name is Mr. Waqas Ahmed. I love doing coding and hanging out with my friends."
sent_tokenize(sample_text)

['My name is Mr. Waqas Ahmed.',
 'I love doing coding and hanging out with my friends.']

## Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Split the same sample paragraph into a flat list of word/punctuation strings.
sample_text = "My name is Mr. Waqas Ahmed. I love doing coding and hanging out with my friends."
word_tokenize(sample_text)

['My',
 'name',
 'is',
 'Mr.',
 'Waqas',
 'Ahmed',
 '.',
 'I',
 'love',
 'doing',
 'coding',
 'and',
 'hanging',
 'out',
 'with',
 'my',
 'friends',
 '.']

# Tokenization in spaCy (Other Lecture)

In [None]:
import spacy

In [None]:
# A blank English pipeline is enough here: only tokenization is needed.
nlp = spacy.blank("en")
doc = nlp("My name is Waqas Ahmed. I love doing coding and hanging out with my friends.")

# Iterating a Doc yields its tokens; print one per line.
print(*doc, sep="\n")

My
name
is
Waqas
Ahmed
.
I
love
doing
coding
and
hanging
out
with
my
friends
.


In [None]:
# Index into the Doc to get a single token (0-based position 3).
doc[3]

Waqas

In [None]:
# Read the sample e-mail as a list of lines (each line keeps its trailing "\n").
# The encoding is pinned so the result does not depend on the platform default.
with open("email.txt", encoding="utf-8") as f:
  text = f.readlines()
text

['Hello there!\n',
 '\n',
 "We hope this message finds you well. We're excited to share some updates and news with you. Our team has been working tirelessly on upcoming projects, and we can't wait to reveal them.\n",
 '\n',
 'For any inquiries or if you want to stay in the loop, feel free to reach out to us at info@yourcompany.com. Additionally, you can contact our team members directly: john.doe@example.com, sarah.smith@example.com, support_team@example.com, and marketing_team@example.com.\n',
 '\n',
 'Looking forward to hearing from you and sharing the latest developments!\n',
 '\n',
 'Best regards,\n',
 '\n',
 'Your Company']

In [None]:
# Collapse the list of lines back into one string.
# The lines still carry their own "\n", so they are joined with an empty
# separator: a " " separator would add a stray space at the start of every
# line and, if this cell were re-run on the already-joined string, would put
# a space between every character. "".join(...) leaves a str unchanged, which
# makes this cell safe to re-run.
text = "".join(text)
text

"Hello there!\n \n We hope this message finds you well. We're excited to share some updates and news with you. Our team has been working tirelessly on upcoming projects, and we can't wait to reveal them.\n \n For any inquiries or if you want to stay in the loop, feel free to reach out to us at info@yourcompany.com. Additionally, you can contact our team members directly: john.doe@example.com, sarah.smith@example.com, support_team@example.com, and marketing_team@example.com.\n \n Looking forward to hearing from you and sharing the latest developments!\n \n Best regards,\n \n Your Company"

In [None]:
doc = nlp(text)

# Keep the text of every token the tokenizer flags as looking like an e-mail address.
emails = [token.text for token in doc if token.like_email]
emails

['info@yourcompany.com',
 'john.doe@example.com',
 'sarah.smith@example.com',
 'support_team@example.com',
 'marketing_team@example.com']

## Customize Tokenization Rules

In [None]:
# Baseline before customizing the tokenizer: collect the token texts as-is.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [None]:
from spacy.symbols import ORTH

# Register a tokenizer exception so the slang word "gimme" is split into
# the two tokens "gim" and "me".
gimme_pieces = [{ORTH: piece} for piece in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Re-tokenize the same sentence to see the effect of the new rule.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# spaCy Language Processing Pipeline

In [None]:
import spacy

In [None]:
# Start from a blank English pipeline and look at its tokenization only.
nlp = spacy.blank("en")
doc = nlp("Captain America ate 100$ of samosa. Then he said I can do this all day.")

# One token per line.
print(*doc, sep="\n")

Captain
America
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [None]:
# Replace the blank pipeline with the trained small English pipeline.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Names of the components in the loaded pipeline, in order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Same components as (name, component object) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f4ace2eeda0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f49d1798340>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f49cb9bb920>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f49d17c4dc0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f49d17c7540>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f49cb9bb7d0>)]

In [None]:
doc = nlp("Captain America ate 100$ of samosa. Then he said I can do this all day.")

# One row per token: surface form, coarse part-of-speech tag, lemma.
for token in doc:
  print(token, token.pos_, token.lemma_, sep="  |  ")

Captain  |  PROPN  |  Captain
America  |  PROPN  |  America
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


## NER

In [None]:
doc = nlp("Tesla Inc is gooing to acquire Twitter for $45 billion.")

# One row per recognized entity: text, label, and spaCy's description of the label.
for ent in doc.ents:
  print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


In [None]:
from spacy import displacy

# Visualize the entities of `doc` inline.
# jupyter=True forces notebook mode: when displaCy fails to auto-detect the
# notebook environment, render() only returns the markup as a plain string
# (which the notebook then shows as escaped text) instead of displaying it.
displacy.render(doc, style="ent", jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is gooing to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n.</div>'

## Adding custom pipeline

In [None]:
# Trained pipeline that the component is copied from.
src_nlp = spacy.load("en_core_web_sm")

# Fresh blank English pipeline that receives only the "ner" component.
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=src_nlp)

nlp.pipe_names

['ner']

In [None]:
doc = nlp("Tesla Inc is gooing to acquire Twitter for $45 billion.")

# Entities found by the pipeline that contains only the sourced "ner" component.
for ent in doc.ents:
  print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


# Stemming and Lemmatization

In [None]:
import nltk
import spacy

In [None]:
from nltk.stem import PorterStemmer

# Single stemmer instance reused by the stemming cell below.
stemmer = PorterStemmer()

In [None]:
words = ["eating", "ate", "eat", "ability", "adjustable", "rafting", "Meeting"]

# Show each word next to the stem produced by the Porter stemmer.
for word in words:
  print(word, stemmer.stem(word), sep="  |  ")

eating  |  eat
ate  |  ate
eat  |  eat
ability  |  abil
adjustable  |  adjust
rafting  |  raft
Meeting  |  meet


In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("eating ate eat ability adjustable rafting Meeting")

# Show each token next to the lemma assigned by the pipeline.
for token in doc:
  print(token, token.lemma_, sep="  |  ")

eating  |  eat
ate  |  ate
eat  |  eat
ability  |  ability
adjustable  |  adjustable
rafting  |  raft
Meeting  |  Meeting


In [None]:
# Components of the loaded pipeline; the lemmas above come from the "lemmatizer" entry.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Part of Speech Tagging (POS)

In [None]:
import spacy

In [None]:
# Trained small English pipeline used for the POS-tagging examples.
nlp = spacy.load("en_core_web_sm")

In [None]:
doc = nlp("Elon flew to mars yesterday. He carried biryani masala wit him")

# One row per token: text, coarse POS tag, human-readable description of the tag.
for token in doc:
  print(token, token.pos_, spacy.explain(token.pos_), sep="  |  ")

Elon  |  PROPN  |  proper noun
flew  |  VERB  |  verb
to  |  ADP  |  adposition
mars  |  NOUN  |  noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
He  |  PRON  |  pronoun
carried  |  VERB  |  verb
biryani  |  ADJ  |  adjective
masala  |  NOUN  |  noun
wit  |  NOUN  |  noun
him  |  PRON  |  pronoun


In [None]:
doc = nlp("Wow! I won five thosand prixe bond")

# One row per token: text, coarse POS + description, fine-grained tag + description.
for token in doc:
  print(
    token,
    token.pos_,
    spacy.explain(token.pos_),
    token.tag_,
    spacy.explain(token.tag_),
    sep="  |  ",
  )

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
I  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
won  |  VERB  |  verb  |  VBD  |  verb, past tense
five  |  NUM  |  numeral  |  CD  |  cardinal number
thosand  |  NOUN  |  noun  |  NN  |  noun, singular or mass
prixe  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
bond  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular


## Removing Unnecessary Tokens from the Meta Earnings Report

In [None]:
# Raw excerpt of Meta's Q2 2023 earnings press release, kept verbatim
# (including line breaks and table rows) as input for the POS-filtering cells.
report = """
MENLO PARK, Calif. – July 26, 2023 – Meta Platforms, Inc. (Nasdaq: META) today reported financial results for the
quarter ended June 30, 2023.
"We had a good quarter. We continue to see strong engagement across our apps and we have the most exciting roadmap I've
seen in a while with Llama 2, Threads, Reels, new AI products in the pipeline, and the launch of Quest 3 this fall," said Mark
Zuckerberg, Meta founder and CEO.
Second Quarter 2023 Financial Highlights
Three Months Ended June 30,
In millions, except percentages and per share amounts 2023 2022 % Change
Revenue $ 31,999 $ 28,822 11%
Costs and expenses 22,607 20,464 10%
Income from operations $ 9,392 $ 8,358 12%
Operating margin 29 % 29 %
Provision for income taxes $ 1,505 $ 1,499 —%
Effective tax rate 16 % 18 %
Net income $ 7,788 $ 6,687 16%
Diluted earnings per share (EPS) $ 2.98 $ 2.46 21%
"""

In [None]:
doc = nlp(report)

# Keep the Token objects whose coarse POS is not whitespace (SPACE),
# "other" (X) or punctuation (PUNCT).
filtered_tokens = [token for token in doc if token.pos_ not in {"SPACE", "X", "PUNCT"}]

In [None]:
# Peek at the first ten tokens that survived the filter.
filtered_tokens[:10]

[MENLO, PARK, Calif., July, 26, 2023, Meta, Platforms, Inc., Nasdaq]

### Number of Tokens per POS Tag

In [None]:
# Tally the tokens of `doc` by coarse POS; the keys are integer POS IDs,
# which the next cell maps back to their string names.
count = doc.count_by(spacy.attrs.POS)
count

{103: 17,
 96: 28,
 97: 25,
 93: 31,
 92: 40,
 100: 13,
 84: 7,
 85: 11,
 90: 7,
 95: 5,
 94: 1,
 89: 5,
 86: 1,
 87: 1,
 98: 1,
 99: 10}

In [None]:
# Translate each integer POS ID back to its name via the vocab and print "NAME | count".
for pos_id, n_tokens in count.items():
  print(doc.vocab[pos_id].text, n_tokens, sep=" | ")

SPACE | 17
PROPN | 28
PUNCT | 25
NUM | 31
NOUN | 40
VERB | 13
ADJ | 7
ADP | 11
DET | 7
PRON | 5
PART | 1
CCONJ | 5
ADV | 1
AUX | 1
SCONJ | 1
SYM | 10


## Named Entity Recognition (NER)

In [None]:
import spacy
# Trained small English pipeline; its "ner" component is used in this section.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Confirm that the loaded pipeline contains an "ner" component.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
doc = nlp("Tesla Inc is going to acquire the twitter inc for $45 billion")

# One row per entity: text | label | description of the label.
for ent in doc.ents:
  print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Tesla Inc | ORG | Companies, agencies, institutions, etc.
the twitter inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [None]:
from spacy import displacy

# Visualize the entities of `doc` inline.
# jupyter=True forces notebook mode: when displaCy fails to auto-detect the
# notebook environment, render() only returns the markup as a plain string
# (which the notebook then shows as escaped text) instead of displaying it.
displacy.render(doc, style="ent", jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    the twitter inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem"

<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    the twitter inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">

In [None]:
# Labels registered for the pipeline's "ner" component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [None]:
doc = nlp("Michael Bloomberg founded Bloomberg Inc in 1982")

# One row per entity: text | label | description of the label.
for ent in doc.ents:
  print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Michael Bloomberg | PERSON | People, including fictional
Bloomberg Inc | ORG | Companies, agencies, institutions, etc.
1982 | DATE | Absolute or relative dates or periods


### Span in spaCy

In [None]:
doc = nlp("Tesla is going to acquire the twitter for $45 billion")

# Entities found by the model before any manual correction.
for ent in doc.ents:
  print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Tesla | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [None]:
from spacy.tokens import Span

# Manually label the token "twitter" as an ORG entity.
# The token index is looked up from the text instead of being hard-coded, so an
# off-by-one index cannot silently tag a neighbouring token ("the", "for") instead.
twitter_idx = next(tok.i for tok in doc if tok.text == "twitter")

# Span boundaries are [start, end) token indices, so this covers exactly one token.
s1 = Span(doc, twitter_idx, twitter_idx + 1, label="ORG")

# default="unmodified" keeps the entities that are already on the doc and
# only adds the new span.
doc.set_ents([s1], default="unmodified")

In [None]:
# Entities on the doc after the manual span has been added.
for ent in doc.ents:
  print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Tesla | ORG | Companies, agencies, institutions, etc.
the | ORG | Companies, agencies, institutions, etc.
twitter | ORG | Companies, agencies, institutions, etc.
for | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit
