In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline once; later cells reuse `nlp` to parse text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Show the names of the components that make up the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Parse a short two-sentence example, then list every token with its
# part-of-speech tag (pos_) and spaCy's plain-English explanation of that tag.
sentence = "Ahmad went to the store to buy some groceries. He bought some apples, oranges, and bananas."
doc = nlp(sentence)

for tok in doc:
    print(tok, tok.pos_, spacy.explain(tok.pos_), sep=" : ")

Ahmad : PROPN : proper noun
went : VERB : verb
to : ADP : adposition
the : DET : determiner
store : NOUN : noun
to : PART : particle
buy : VERB : verb
some : DET : determiner
groceries : NOUN : noun
. : PUNCT : punctuation
He : PRON : pronoun
bought : VERB : verb
some : DET : determiner
apples : NOUN : noun
, : PUNCT : punctuation
oranges : NOUN : noun
, : PUNCT : punctuation
and : CCONJ : coordinating conjunction
bananas : NOUN : noun
. : PUNCT : punctuation


- Fine-grained tags (`token.tag_`) in spaCy

In [6]:
# Re-parse the same example and print two tag attributes per token:
# pos_ and tag_, each followed by spaCy's plain-English explanation.
doc = nlp("Ahmad went to the store to buy some groceries. He bought some apples, oranges, and bananas.")

for tok in doc:
    pos, tag = tok.pos_, tok.tag_
    print(tok, pos, spacy.explain(pos), tag, spacy.explain(tag), sep=" : ")

Ahmad : PROPN : proper noun : NNP : noun, proper singular
went : VERB : verb : VBD : verb, past tense
to : ADP : adposition : IN : conjunction, subordinating or preposition
the : DET : determiner : DT : determiner
store : NOUN : noun : NN : noun, singular or mass
to : PART : particle : TO : infinitival "to"
buy : VERB : verb : VB : verb, base form
some : DET : determiner : DT : determiner
groceries : NOUN : noun : NNS : noun, plural
. : PUNCT : punctuation : . : punctuation mark, sentence closer
He : PRON : pronoun : PRP : pronoun, personal
bought : VERB : verb : VBD : verb, past tense
some : DET : determiner : DT : determiner
apples : NOUN : noun : NNS : noun, plural
, : PUNCT : punctuation : , : punctuation mark, comma
oranges : NOUN : noun : NNS : noun, plural
, : PUNCT : punctuation : , : punctuation mark, comma
and : CCONJ : coordinating conjunction : CC : conjunction, coordinating
bananas : NOUN : noun : NNS : noun, plural
. : PUNCT : punctuation : . : punctuation mark, sentence 

In [7]:
# Sample input for the filtering and counting cells: the text of a Microsoft
# FY25 Q2 earnings press release, kept verbatim as a multi-line string.
earning_text = '''

Press Release & Webcast
Earnings Release FY25 Q2
Microsoft Cloud and AI Strength Drives Second Quarter Results

REDMOND, Wash. — January 29, 2025 — Microsoft Corp. today announced the following results for the quarter ended December 31, 2024, as compared to the corresponding period of last fiscal year:

·         Revenue was $69.6 billion and increased 12%

·         Operating income was $31.7 billion and increased 17% (up 16% in constant currency)

·         Net income was $24.1 billion and increased 10%

·         Diluted earnings per share was $3.23 and increased 10%

“We are innovating across our tech stack and helping customers unlock the full ROI of AI to capture the massive opportunity ahead," said Satya Nadella, chairman and chief executive officer of Microsoft. “Already, our AI business has surpassed an annual revenue run rate of $13 billion, up 175% year-over-year.”

“This quarter Microsoft Cloud revenue was $40.9 billion, up 21% year-over-year,” said Amy Hood, executive vice president and chief financial officer of Microsoft. ”We remain committed to balancing operational discipline with continued investments in our cloud and AI infrastructure.”

Business Highlights

Revenue in Productivity and Business Processes was $29.4 billion and increased 14% (up 13% in constant currency), with the following business highlights:

·         Microsoft 365 Commercial products and cloud services revenue increased 15% driven by Microsoft 365 Commercial cloud revenue growth of 16% (up 15% in constant currency)

·         Microsoft 365 Consumer products and cloud services revenue increased 8% driven by Microsoft 365 Consumer cloud revenue growth of 8%

·         LinkedIn revenue increased 9%

·         Dynamics products and cloud services revenue increased 15% (up 14% in constant currency) driven by Dynamics 365 revenue growth of 19% (up 18% in constant currency)

Revenue in Intelligent Cloud was $25.5 billion and increased 19%, with the following business highlights:

·         Server products and cloud services revenue increased 21% driven by Azure and other cloud services revenue growth of 31%

Revenue in More Personal Computing was $14.7 billion and was relatively unchanged, with the following business highlights:

·         Windows OEM and Devices revenue increased 4%

·         Xbox content and services revenue increased 2%

·         Search and news advertising revenue excluding traffic acquisition costs increased 21% (up 20% in constant currency)

Microsoft returned $9.7 billion to shareholders in the form of dividends and share repurchases in the second quarter of fiscal year 2025.

'''

- Removing the noise: whitespace, punctuation, and unclassified (`X`) tokens

In [10]:
# Tag the earnings text and print each remaining token with its POS tag and
# explanation. Tokens tagged SPACE, PUNCT or X are skipped.
doc = nlp(earning_text)

for tok in doc:
    if tok.pos_ in ("SPACE", "PUNCT", "X"):
        continue
    print(tok, " | ", tok.pos_, " | ", spacy.explain(tok.pos_))

Press  |  PROPN  |  proper noun
Release  |  PROPN  |  proper noun
&  |  CCONJ  |  coordinating conjunction
Webcast  |  PROPN  |  proper noun
Earnings  |  PROPN  |  proper noun
Release  |  PROPN  |  proper noun
FY25  |  PROPN  |  proper noun
Q2  |  PROPN  |  proper noun
Microsoft  |  PROPN  |  proper noun
Cloud  |  PROPN  |  proper noun
and  |  CCONJ  |  coordinating conjunction
AI  |  PROPN  |  proper noun
Strength  |  PROPN  |  proper noun
Drives  |  VERB  |  verb
Second  |  PROPN  |  proper noun
Quarter  |  PROPN  |  proper noun
Results  |  VERB  |  verb
REDMOND  |  PROPN  |  proper noun
Wash.  |  PROPN  |  proper noun
January  |  PROPN  |  proper noun
29  |  NUM  |  numeral
2025  |  NUM  |  numeral
Microsoft  |  PROPN  |  proper noun
Corp.  |  PROPN  |  proper noun
today  |  NOUN  |  noun
announced  |  VERB  |  verb
the  |  DET  |  determiner
following  |  VERB  |  verb
results  |  NOUN  |  noun
for  |  ADP  |  adposition
the  |  DET  |  determiner
quarter  |  NOUN  |  noun
ended  |

- Saving the filtered tokens in a list

In [12]:
# Same filter as the printing cell, but keep the surviving Token objects
# in a list instead of printing them.
doc = nlp(earning_text)

skipped_pos = ("SPACE", "PUNCT", "X")
filtered_tokens = [tok for tok in doc if tok.pos_ not in skipped_pos]

filtered_tokens

[Press,
 Release,
 &,
 Webcast,
 Earnings,
 Release,
 FY25,
 Q2,
 Microsoft,
 Cloud,
 and,
 AI,
 Strength,
 Drives,
 Second,
 Quarter,
 Results,
 REDMOND,
 Wash.,
 January,
 29,
 2025,
 Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2024,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 69.6,
 billion,
 and,
 increased,
 12,
 %,
 Operating,
 income,
 was,
 $,
 31.7,
 billion,
 and,
 increased,
 17,
 %,
 up,
 16,
 %,
 in,
 constant,
 currency,
 Net,
 income,
 was,
 $,
 24.1,
 billion,
 and,
 increased,
 10,
 %,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 3.23,
 and,
 increased,
 10,
 %,
 We,
 are,
 innovating,
 across,
 our,
 tech,
 stack,
 and,
 helping,
 customers,
 unlock,
 the,
 full,
 ROI,
 of,
 AI,
 to,
 capture,
 the,
 massive,
 opportunity,
 ahead,
 said,
 Satya,
 Nadella,
 chairman,
 and,
 chief,
 executive,
 officer,
 of,
 Microsoft,
 Already,
 our,
 AI,
 bu

In [13]:
# Tally the tokens in `doc` by POS attribute. The resulting dict is keyed by
# integer IDs rather than tag names, so it needs a lookup to be readable.
count = doc.count_by(spacy.attrs.POS)
count

{103: 36,
 96: 55,
 89: 23,
 100: 40,
 97: 61,
 93: 55,
 92: 123,
 90: 12,
 85: 39,
 98: 1,
 84: 27,
 87: 11,
 99: 10,
 86: 11,
 95: 5,
 94: 1}

In [15]:
# Resolve each integer POS ID to its tag name through the vocab and print
# the ID, the name, and how many tokens carry that tag.
for pos_id, n_tokens in count.items():
    print(pos_id, doc.vocab[pos_id].text, n_tokens, sep=" : ")

103 : SPACE : 36
96 : PROPN : 55
89 : CCONJ : 23
100 : VERB : 40
97 : PUNCT : 61
93 : NUM : 55
92 : NOUN : 123
90 : DET : 12
85 : ADP : 39
98 : SCONJ : 1
84 : ADJ : 27
87 : AUX : 11
99 : SYM : 10
86 : ADV : 11
95 : PRON : 5
94 : PART : 1


## Practice

In [17]:
# Read the news story used in the practice exercises.
# The encoding is given explicitly because the text contains non-ASCII
# characters (curly apostrophes, em dashes); without it, open() decodes with a
# platform-dependent default and can garble them.
# NOTE(review): assumes the file is saved as UTF-8 — confirm.
with open("./datasets/news_story.txt", "r", encoding="utf-8") as file:
    news_story = file.read()

news_story

'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from March’s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and energy prices, so-called core CPI still rose 6.2%, against expectations for a 6% gain, clouding hopes that inflation had peaked in March.\n\nThe month-over-month gains also were higher than expectations — 0.3% on headline CPI versus the 0.2% estimate and a 0.6% increase for core, against the outlook for a 0.4% gain.\n\nThe price gains also meant that workers continued to lose ground. Real wages adjusted for inflation decreased 0.1% on the month despite a nominal increase of 0.3% in average hourl

In [18]:
# Run the pipeline on the news story; the extraction cells that follow read this `doc`.
doc = nlp(news_story)

- Extracting all the nouns from the text

In [20]:
# Gather the text of every token tagged NOUN or PROPN, in document order.
noun_tags = ("NOUN", "PROPN")
nouns = [tok.text for tok in doc if tok.pos_ in noun_tags]

nouns

['Inflation',
 'April',
 'climb',
 'consumers',
 'brink',
 'expansion',
 'Bureau',
 'Labor',
 'Statistics',
 'Wednesday',
 'consumer',
 'price',
 'index',
 'measure',
 'prices',
 'goods',
 'services',
 '%',
 'year',
 'Dow',
 'Jones',
 'estimate',
 '%',
 'gain',
 'ease',
 'March',
 'peak',
 'level',
 'summer',
 'food',
 'energy',
 'prices',
 'core',
 'CPI',
 '%',
 'expectations',
 '%',
 'gain',
 'hopes',
 'inflation',
 'March',
 'month',
 'month',
 'gains',
 'expectations',
 '%',
 'headline',
 'CPI',
 '%',
 'estimate',
 '%',
 'increase',
 'core',
 'outlook',
 '%',
 'gain',
 'price',
 'gains',
 'workers',
 'ground',
 'wages',
 'inflation',
 '%',
 'month',
 'increase',
 '%',
 'earnings',
 'year',
 'earnings',
 '%',
 'earnings',
 '%',
 'Inflation',
 'threat',
 'recovery',
 'Covid',
 'pandemic',
 'economy',
 'stage',
 'year',
 'growth',
 'level',
 'prices',
 'pump',
 'grocery',
 'stores',
 'problem',
 'inflation',
 'areas',
 'housing',
 'auto',
 'sales',
 'host',
 'areas',
 'Federal',
 'Res

- Extracting all the numbers

In [22]:
# Gather the text of every token tagged NUM, in document order.
nums = [tok.text for tok in doc if tok.pos_ == "NUM"]

nums

['8.3',
 '8.1',
 '1982',
 '6.2',
 '6',
 '0.3',
 '0.2',
 '0.6',
 '0.4',
 '0.1',
 '0.3',
 '2.6',
 '5.5',
 '2021',
 '1984',
 'one',
 'two',
 'two',
 '2']