In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN nsubj
startup VERB ccomp
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [3]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [4]:
doc1 = nlp('"Let\'s go to N.Y.!"')
for token in doc1:
    print(token.text)
    

"
Let
's
go
to
N.Y.
!
"


In [5]:
spacy.explain('SYM')

'symbol'

In [6]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [7]:
word = nlp("intelligence")
print(word.vector)

[-0.3525915  -0.5595864   1.3269742  -0.4591393  -0.2378951   0.35304713
  0.9810033   0.9120197  -0.7636418   0.53343236  0.7111834  -0.4101859
 -0.22609742  0.21238515  0.1977409   0.5262209  -1.1578097  -0.6223494
  0.04481605 -0.9658006   0.11590984  1.0158504  -0.81331897 -0.06708208
  0.46827182  0.55026615  1.0141313   0.18985981 -0.32852772  1.5085963
 -0.68311155  0.2897485  -0.17085919  0.51104367 -0.41968206 -0.44074327
 -0.00220814  0.44290805  0.10646416  0.26499194 -0.5972402  -0.17584111
  0.09915334  1.245649   -0.0993301   0.03572413 -0.94779813 -0.7312204
 -0.3475603  -1.3627207   0.16587363 -0.22425665  0.8468491  -1.3706408
  1.0920432  -0.5718084   0.7467524  -0.34398675 -0.24239466  0.1408653
 -0.72588027  0.11295792  0.13626975 -0.8679844   0.21890712 -0.10783374
  0.71108097  0.06787673 -0.3360198  -0.547363    0.11034714  0.55276895
  0.34893125  0.79479325  0.1372495  -0.5995597  -0.12229609 -0.49462122
 -0.17136315  0.2365407  -0.7538129  -0.51677805 -0.87205

In [8]:
print(word.vector_norm)

6.312396882270653


In [9]:
doc1 = nlp("Oh, great, another large language model.")
doc2 = nlp("Another large language? Just what I needed!")
print(doc1, "<->", doc2, doc1.similarity(doc2))

Oh, great, another large language model. <-> Another large language? Just what I needed! 0.5156556367874146


  print(doc1, "<->", doc2, doc1.similarity(doc2))


In [10]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [11]:
doc = nlp("The impact of artificial intelligence on society is profound.")
print(doc.vocab.strings['intelligence']) # 11044490816763727375
print(doc.vocab.strings[11044490816763727375]) # intelligence

11044490816763727375
intelligence


In [12]:
doc = nlp("I love coffee")
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
            lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


In [13]:
intro = "This is a Real Python tutorial on spaCy."
intro_doc = nlp(intro)
[token.text for token in intro_doc]

['This', 'is', 'a', 'Real', 'Python', 'tutorial', 'on', 'spaCy', '.']

In [14]:
type(intro)

str

In [15]:
type(intro_doc)

spacy.tokens.doc.Doc

In [16]:
import pathlib
file_name = pathlib.Path("Data/nvidia2025q3.txt")
text = file_name.read_text(encoding="utf-8")
nvidia_doc = nlp(text)

In [17]:
[token.text for token in nvidia_doc]

['Nvidia',
 '(',
 '\n',
 'NVDA',
 '\n',
 '+2.61',
 '%',
 '\n',
 ')',
 '\n',
 'Q3',
 '2025',
 'Earnings',
 'Call',
 '\n',
 'Nov',
 '20',
 ',',
 '2024',
 ',',
 '5:00',
 'p.m.',
 'ET',
 '\n\n',
 'Contents',
 ':',
 '\n',
 'Prepared',
 'Remarks',
 '\n',
 'Questions',
 'and',
 'Answers',
 '\n',
 'Call',
 'Participants',
 '\n',
 'Prepared',
 'Remarks',
 ':',
 '\n\n',
 'Operator',
 '\n\n',
 'Good',
 'afternoon',
 '.',
 'My',
 'name',
 'is',
 'Jay',
 ',',
 'and',
 'I',
 "'ll",
 'be',
 'your',
 'conference',
 'operator',
 'today',
 '.',
 'At',
 'this',
 'time',
 ',',
 'I',
 'would',
 'like',
 'to',
 'welcome',
 'everyone',
 'to',
 'NVIDIA',
 "'s",
 'third',
 '-',
 'quarter',
 'earnings',
 'call',
 '.',
 'All',
 'lines',
 'have',
 'been',
 'placed',
 'on',
 'mute',
 'to',
 'prevent',
 'any',
 'background',
 'noise',
 '.',
 '\n\n',
 'After',
 'the',
 'speakers',
 "'",
 'remarks',
 ',',
 'there',
 'will',
 'be',
 'a',
 'question',
 '-',
 'and',
 '-',
 'answer',
 'session',
 '.',
 '[',
 'Operator',


In [18]:
# Extract all sentences and put them in a list
sentences = list(nvidia_doc.sents)
len(sentences)

508

In [19]:
# print the first few tokens for selected sentences
for sentence in sentences[10:15]:
    print(sentence[:8])

I'd like to remind you that our
The webcast will be available for replay until
The content of today's call is NVIDIA
It can't be reproduced or transcribed without
During this call, we may make forward


In [20]:
# Sample text
ellipsis_text = (
    "Guys, can you please, ... never mind, I forgot"
    "what I was saying. So, do you think"
    " we should..."
)

In [21]:
# Define a custom delimiter to break up sentences
from spacy.language import Language

# Decorator
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc

# Custom nlp object
custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print(sentence)

Guys, can you please, ...
never mind, I forgotwhat I was saying.
So, do you think we should...


In [22]:
for token in custom_ellipsis_doc:
    print(token, token.idx)

Guys 0
, 4
can 6
you 10
please 14
, 20
... 22
never 26
mind 32
, 36
I 38
forgotwhat 40
I 51
was 53
saying 57
. 63
So 65
, 67
do 69
you 72
think 76
we 82
should 85
... 91


In [23]:
print(
    f"{'Text with Whitespace':22}"
    f"{'Is Alphanumeric?':15}"
    f"{'Is Punctuation?':18}"
    f"{'Is Stop Word?'}"
)

for token in custom_ellipsis_doc:
    print((
        f"{str(token.text_with_ws):22}"
        f"{str(token.is_alpha):15}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
    ))

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Guys                  True           False             False
,                     False          True              False
can                   True           False             True
you                   True           False             True
please                True           False             True
,                     False          True              False
...                   False          True              False
never                 True           False             True
mind                  True           False             False
,                     False          True              False
I                     True           False             True
forgotwhat            True           False             False
I                     True           False             True
was                   True           False             True
saying                True           False             False
.                     

In [24]:
# Custom text with @infix
custom_about_text = (
    "This is a London@based FinTech company."
)

print([token.text for token in nlp(custom_about_text)])

['This', 'is', 'a', 'London@based', 'FinTech', 'company', '.']


In [25]:
# Use default prefix, suffix, but customize infix
import re
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes
)
suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes
)
custom_infixes = [f"@"]
infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

custom_nlp.tokenizer = Tokenizer(
    custom_nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None
)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)
print([token.text for token in custom_tokenizer_about_doc])



['This', 'is', 'a', 'London', '@', 'based', 'FinTech', 'company', '.']


In [26]:
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# Load the default English model
nlp = spacy.load("en_core_web_sm")

# Add "@" as a custom infix (split inside tokens)
custom_infixes = list(nlp.Defaults.infixes) + [r"@"]
infix_re = compile_infix_regex(custom_infixes)

# Create a new tokenizer that only overrides the infix rules
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=nlp.tokenizer.prefix_search,
    suffix_search=nlp.tokenizer.suffix_search,
    infix_finditer=infix_re.finditer,
    token_match=nlp.tokenizer.token_match,
)

In [27]:
custom_tokenizer_about_doc = nlp(custom_about_text)
print([token.text for token in custom_tokenizer_about_doc])

['This', 'is', 'a', 'London', '@', 'based', 'FinTech', 'company', '.']


In [28]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

326

In [29]:
for word in list(spacy_stopwords)[:5]:
    print(word)

first
next
even
thence
a


In [30]:
print([token for token in custom_ellipsis_doc if not token.is_stop])

[Guys, ,, ,, ..., mind, ,, forgotwhat, saying, ., ,, think, ...]


In [31]:
for token in custom_ellipsis_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20}: {str(token.lemma_)}")

                Guys: guy
                 was: be
              saying: say
                  So: so


In [32]:
# Counting for word frequencies, excl stop words, punctuation, and line breaks
from collections import Counter

words_freq = [
    token.text
    for token in nvidia_doc
    if not token.is_stop and not token.is_punct and not token.is_space
]
print(Counter(words_freq).most_common(5))

[('AI', 109), ('NVIDIA', 59), ('Blackwell', 49), ('year', 43), ('data', 43)]


In [33]:
for token in custom_ellipsis_doc:
    print(
        f"""
    Token: {token.text}
    ==========================
    Tag: {token.tag_:10} POS: {token.pos_}
    Explanation: {spacy.explain(token.tag_)}
    """)


    Token: Guys
    Tag: NNS        POS: NOUN
    Explanation: noun, plural
    

    Token: ,
    Tag: ,          POS: PUNCT
    Explanation: punctuation mark, comma
    

    Token: can
    Tag: MD         POS: AUX
    Explanation: verb, modal auxiliary
    

    Token: you
    Tag: PRP        POS: PRON
    Explanation: pronoun, personal
    

    Token: please
    Tag: VB         POS: VERB
    Explanation: verb, base form
    

    Token: ,
    Tag: ,          POS: PUNCT
    Explanation: punctuation mark, comma
    

    Token: ...
    Tag: :          POS: PUNCT
    Explanation: punctuation mark, colon or ellipsis
    

    Token: never
    Tag: RB         POS: ADV
    Explanation: adverb
    

    Token: mind
    Tag: VBP        POS: VERB
    Explanation: verb, non-3rd person singular present
    

    Token: ,
    Tag: ,          POS: PUNCT
    Explanation: punctuation mark, comma
    

    Token: I
    Tag: PRP        POS: PRON
    Explanation: pronoun, personal
    

    Token:

In [34]:
nouns = []
adjs = []
for token in nvidia_doc:
    if token.pos_ == "NOUN":
        nouns.append(token.text)
    if token.pos_ == "ADJ":
        adjs.append(token.text)
print(f"Number of nouns: {len(nouns)}")
print(f"Number of adjectives: {len(adjs)}")

# count frequency of adjectives
adj_counts = Counter(adjs)

# get top five most frequent adjectives
top5_adj = adj_counts.most_common(5)
print("Most frequent adjectives:")
for word, freq in top5_adj:
    print(f"{word}: {freq}")


Number of nouns: 1819
Number of adjectives: 634
Most frequent adjectives:
new: 33
next: 26
first: 19
more: 17
large: 15


In [35]:
from spacy import displacy
from IPython.display import HTML, display

# Get HTML
html = displacy.render(custom_ellipsis_doc, style="dep", 
                       options={"distance": 90}, jupyter=False)

# Display directly in Jupyter notebook
display(HTML(html))


In [36]:
# Data preprocessing

# Define a Boolean function to filter tokens
def allowed_token(token):
    return bool(
        token
        and str(token).strip()
        and not token.is_stop
        and not token.is_punct
    )

# Define a function to lemmatize, strip, and lowercase tokens
def preprocess_token(token):
    return token.lemma_.strip().lower()

filtered_tokens = [
    preprocess_token(token)
    for token in custom_ellipsis_doc
    if allowed_token(token)
]

print(filtered_tokens)

['guy', 'mind', 'forgotwhat', 'say', 'think']


In [37]:
# Use rule-based matching to find full names in a text
from spacy.matcher import Matcher

def extract_full_names(nlp_doc):
    matcher = Matcher(nlp.vocab)
    pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
    matcher.add("FULL_NAME", [pattern]) # FULL_NAME is just a placeholder
    matches = matcher(nlp_doc) # matches is a list of tuples
    for _, start, end in matches:
        yield nlp_doc[start:end].text


In [38]:
# Print the first five full names found in the NVIDIA document
for i, name in enumerate(extract_full_names(nvidia_doc), start=1):
    if i > 5:
        break
    print(name)

Earnings Call
Call Participants
Stewart Stecker
Stewart Stecker
Senior Director


In [39]:
import sys
print(sys.version)

3.11.14 (main, Oct 31 2025, 23:04:14) [Clang 21.1.4 ]
