In [None]:
import spacy
from collections import Counter
from spacy import displacy

In [None]:
# 1. Read the story from disk and parse it into a spaCy Doc.

# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Read the whole story into one string; the context manager closes the handle.
with open("peterrabbit.txt", mode="r", encoding="utf-8") as story_file:
    text = story_file.read()

# `doc` is the parsed document that every later cell works from.
doc = nlp(text)
print("Doc object created!\n")

Doc object created!



In [None]:
# 2. For every token in the 3rd sentence, print token text, POS, TAG, and TAG description

# Materialize the sentence spans once; `sentences` and `third_sentence` are
# reused by later cells.
sentences = list(doc.sents)
third_sentence = sentences[2]

# Use a single f-string: print() with several arguments inserts a separator
# space, which put a stray leading space in front of the sentence text.
print(f"Third sentence:\n{third_sentence.text}\n")

print(f"{'TOKEN':<15}{'POS':<10}{'TAG':<10}{'TAG DESCRIPTION'}")
for token in third_sentence:
    # This sentence contains a line-break token; printed raw it would split the
    # row in two. Show whitespace tokens escaped (e.g. '\n') so every token
    # still gets exactly one aligned row.
    shown_text = repr(token.text) if token.is_space else token.text
    print(f"{shown_text:<15}{token.pos_:<10}{token.tag_:<10}{spacy.explain(token.tag_)}")

Third sentence:
 They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.

 

TOKEN          POS       TAG       TAG DESCRIPTION
They           PRON      PRP       pronoun, personal
lived          VERB      VBD       verb, past tense
with           ADP       IN        conjunction, subordinating or preposition
their          PRON      PRP$      pronoun, possessive
Mother         PROPN     NNP       noun, proper singular
in             ADP       IN        conjunction, subordinating or preposition
a              DET       DT        determiner
sand           NOUN      NN        noun, singular or mass
-              PUNCT     HYPH      punctuation mark, hyphen
bank           NOUN      NN        noun, singular or mass
,              PUNCT     ,         punctuation mark, comma
underneath     ADP       IN        conjunction, subordinating or preposition
the            DET       DT        determiner
root           NOUN      NN        noun, singular or mass
of    

In [None]:
# 3. Frequency list of POS tags for the entire document

# Tally the coarse-grained POS label of every token in the document.
pos_counts = Counter(token.pos_ for token in doc)

print("\nFrequency of POS tags:")
# Rows are printed in the order each label was first seen in the text.
for pos_label, count in pos_counts.items():
    print(f"{pos_label:<10}{count}")


Frequency of POS tags:
DET       90
PROPN     75
ADP       124
PUNCT     172
NUM       8
SPACE     99
ADV       65
SCONJ     20
NOUN      173
PRON      108
VERB      131
ADJ       54
CCONJ     61
AUX       50
PART      28


In [None]:
# 4. CHALLENGE: What percentage of tokens are nouns?

# Denominator: word-like tokens only. Punctuation is excluded, and so are
# whitespace tokens (line breaks etc., tagged SPACE), which would otherwise
# inflate the total and understate the noun share.
total_tokens = sum(1 for token in doc if not (token.is_punct or token.is_space))

# Numerator: common nouns only (proper nouns carry the separate PROPN label).
noun_tokens = sum(1 for token in doc if token.pos_ == "NOUN")

# Guard against an empty document so the cell reports 0% instead of raising
# ZeroDivisionError.
noun_percentage = (noun_tokens / total_tokens) * 100 if total_tokens else 0.0
print(f"\nPercentage of tokens that are nouns: {noun_percentage:.2f}%")


Percentage of tokens that are nouns: 15.96%


In [None]:
# 5. Display dependency parse for the 3rd sentence

print("\nDependency Parse for 3rd sentence:")
# Header row, in the same column style as the tag table for this sentence.
print(f"{'TOKEN':<15}{'DEP':<15}{'HEAD':<15}{'CHILDREN'}")
for token in third_sentence:
    # The sentence contains a line-break token; printed raw it splits the row
    # and misaligns the table. Show whitespace tokens escaped (e.g. '\n').
    shown_text = repr(token.text) if token.is_space else token.text
    shown_head = repr(token.head.text) if token.head.is_space else token.head.text
    # The list repr already escapes whitespace children, so no special casing here.
    children = [child.text for child in token.children]
    print(f"{shown_text:<15}{token.dep_:<15}{shown_head:<15}{children}")


Dependency Parse for 3rd sentence:
They           nsubj          lived          []
lived          ROOT           lived          ['They', 'with', 'in', '.']
with           prep           lived          ['Mother']
their          poss           Mother         []
Mother         pobj           with           ['their']
in             prep           lived          ['bank']
a              det            bank           []
sand           compound       bank           []
-              punct          bank           []
bank           pobj           in             ['a', 'sand', '-', ',', 'underneath']
,              punct          bank           []
underneath     prep           bank           ['root']
the            det            root           []
root           pobj           underneath     ['the', 'of']
of             prep           root           ['tree']
a              det            tree           ['\n']

              dep            a              []
very           advmod         big       

In [None]:
# 6. Show the first two named entities

print("\nFirst two named entities:")
# Slicing is safe even if the document has fewer than two entities.
first_two_ents = doc.ents[:2]
for entity in first_two_ents:
    label = entity.label_
    # One line per entity: text - label - human-readable label description.
    print(f"{entity.text} - {label} - {spacy.explain(label)}")


First two named entities:
The Tale of Peter Rabbit - WORK_OF_ART - Titles of books, songs, etc.
Beatrix Potter - PERSON - People, including fictional


In [None]:
# 7. Number of sentences in the story

# `sentences` is the list of sentence spans built in step 2.
num_sents = len(sentences)
print("\nTotal number of sentences in the story: {}".format(num_sents))


Total number of sentences in the story: 57


In [None]:
# 8. CHALLENGE: How many sentences contain named entities?

# Keep every sentence in which at least one token carries an entity type
# (tokens outside any entity have an empty `ent_type_`).
sent_with_ents = []
for sentence in sentences:
    if any(token.ent_type_ for token in sentence):
        sent_with_ents.append(sentence)

print(f"Number of sentences containing named entities: {len(sent_with_ents)}")

Number of sentences containing named entities: 38


In [None]:
# 9. Display the named entity visualization for the first sentence with an entity

# `sent_with_ents` is empty when no sentence has an entity; guard the [0]
# lookup so the cell reports that instead of raising IndexError.
if sent_with_ents:
    # The message names the variable actually rendered below.
    print("\nDisplaying named entity visualization for sent_with_ents[0]...")
    displacy.render(sent_with_ents[0], style="ent", jupyter=True)
else:
    print("\nNo sentence contains a named entity; nothing to display.")


Displaying named entity visualization for list_of_sents[0]...
