In [73]:
import spacy
from collections import Counter
from spacy import displacy

In [8]:
# Load the spaCy pipeline named "en_core_web_sm"; every later cell that
# parses text goes through this `nlp` object.
nlp = spacy.load(
    "en_core_web_sm"
)

In [46]:
# Read the whole story into memory as a single string.
# The encoding is stated explicitly so the decoded text does not depend on
# the platform's default locale encoding (which differs between systems).
# NOTE(review): assumes peterrabbit.txt is UTF-8 encoded — confirm.
with open("peterrabbit.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [53]:
# Run the loaded pipeline over the full story text; `doc` is the parsed
# document that all of the following analysis cells read from.
doc = nlp(
    text
)

In [101]:
# Materialise the document's sentence spans into a list so they can be
# indexed; note that `sentence` (singular) holds *all* sentences.
sentence = [sent for sent in doc.sents]

# Index 2 is the third sentence (zero-based indexing).
third_sentence = sentence[3 - 1]

In [59]:
# Show the third sentence (index 2 of the `sentence` list) as the cell output.
third_sentence

They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.


In [65]:
# Print one tab-separated row per token of the third sentence: surface text,
# coarse POS, fine-grained tag, and spaCy's description of that tag.
for tok in third_sentence:
    tag = tok.tag_
    print(f"Token: {tok.text}\t POS: {tok.pos_}\t TAG: {tag}\t Description: {spacy.explain(tag)}")

Token: They	 POS: PRON	 TAG: PRP	 Description: pronoun, personal
Token: lived	 POS: VERB	 TAG: VBD	 Description: verb, past tense
Token: with	 POS: ADP	 TAG: IN	 Description: conjunction, subordinating or preposition
Token: their	 POS: PRON	 TAG: PRP$	 Description: pronoun, possessive
Token: Mother	 POS: NOUN	 TAG: NN	 Description: noun, singular or mass
Token: in	 POS: ADP	 TAG: IN	 Description: conjunction, subordinating or preposition
Token: a	 POS: DET	 TAG: DT	 Description: determiner
Token: sand	 POS: NOUN	 TAG: NN	 Description: noun, singular or mass
Token: -	 POS: PUNCT	 TAG: HYPH	 Description: punctuation mark, hyphen
Token: bank	 POS: NOUN	 TAG: NN	 Description: noun, singular or mass
Token: ,	 POS: PUNCT	 TAG: ,	 Description: punctuation mark, comma
Token: underneath	 POS: ADP	 TAG: IN	 Description: conjunction, subordinating or preposition
Token: the	 POS: DET	 TAG: DT	 Description: determiner
Token: root	 POS: NOUN	 TAG: NN	 Description: noun, singular or mass
Token: of	 P

In [69]:
# Tally how many tokens in the whole document carry each coarse POS label.
pos_counts = Counter([tok.pos_ for tok in doc])

# Report the tallies in first-seen order (the Counter's insertion order).
for pos in pos_counts:
    count = pos_counts[pos]
    print(f"POS: {pos}, Count: {count}")

POS: DET, Count: 90
POS: PROPN, Count: 74
POS: ADP, Count: 125
POS: PUNCT, Count: 171
POS: NUM, Count: 9
POS: SPACE, Count: 99
POS: ADV, Count: 63
POS: SCONJ, Count: 19
POS: NOUN, Count: 172
POS: PRON, Count: 110
POS: VERB, Count: 135
POS: ADJ, Count: 53
POS: CCONJ, Count: 61
POS: AUX, Count: 49
POS: PART, Count: 28


In [71]:
# Share of tokens tagged as a common noun (NOUN) or proper noun (PROPN).
# The denominator is every token in the document, whatever its POS label.
total_tokens = len(doc)
noun_tokens = sum(1 for tok in doc if tok.pos_ in ("NOUN", "PROPN"))

# Guard against an empty document to avoid dividing by zero.
if total_tokens > 0:
    noun_percentage = noun_tokens / total_tokens * 100
else:
    noun_percentage = 0

print(f"Percentage of nouns: {noun_percentage:.2f}%")

Percentage of nouns: 19.55%


In [75]:
# Textual view: one line per token of the third sentence, giving the token,
# the text of its syntactic head, and the dependency relation label.
print("Dependency Parse (Text Format):")
for tok in third_sentence:
    head_text = tok.head.text
    print(f"Token: {tok.text}, Head: {head_text}, Dependency: {tok.dep_}")

# Graphical view: let displaCy draw the same parse inline in the notebook.
print("\nDependency Parse (Graphical Format):")
displacy.render(
    third_sentence,
    style="dep",
    jupyter=True,
)

Dependency Parse (Text Format):
Token: They, Head: lived, Dependency: nsubj
Token: lived, Head: lived, Dependency: ROOT
Token: with, Head: lived, Dependency: prep
Token: their, Head: Mother, Dependency: poss
Token: Mother, Head: with, Dependency: pobj
Token: in, Head: lived, Dependency: prep
Token: a, Head: bank, Dependency: det
Token: sand, Head: bank, Dependency: compound
Token: -, Head: bank, Dependency: punct
Token: bank, Head: in, Dependency: pobj
Token: ,, Head: lived, Dependency: punct
Token: underneath, Head: lived, Dependency: prep
Token: the, Head: root, Dependency: det
Token: root, Head: underneath, Dependency: pobj
Token: of, Head: root, Dependency: prep
Token: a, Head: tree, Dependency: det
Token: 
, Head: a, Dependency: dep
Token: very, Head: big, Dependency: advmod
Token: big, Head: tree, Dependency: amod
Token: fir, Head: tree, Dependency: compound
Token: -, Head: tree, Dependency: punct
Token: tree, Head: of, Dependency: pobj
Token: ., Head: lived, Dependency: punct
To

In [77]:
# Collect (text, label) pairs for the first two named entities in the document.
# Slicing `doc.ents` means a document with fewer than two entities simply
# yields a shorter list instead of needing an explicit counter and break.
named_entities = [(ent.text, ent.label_) for ent in doc.ents[:2]]

# Print the first two named entities.
# The loop variables are deliberately NOT called `text`/`label`: loop
# variables in a notebook cell are bound at notebook level, and `text` is the
# name holding the full story string that `nlp(text)` is run on. Reusing it
# here would silently replace the story with an entity name.
for i, (ent_text, ent_label) in enumerate(named_entities, start=1):
    print(f"Entity {i}: Text = '{ent_text}', Label = '{ent_label}'")

Entity 1: Text = 'The Tale of Peter Rabbit', Label = 'WORK_OF_ART'
Entity 2: Text = 'Beatrix Potter', Label = 'PERSON'


In [85]:
# Count the sentences directly from the parsed document, so the result does
# not depend on the current value of the separately maintained `sentence`
# variable (notebook-level names can be rebound by other cells).
num_sentences = sum(1 for _ in doc.sents)
print(f"The Tale of Peter Rabbit contains {num_sentences} sentences.")

The Tale of Peter Rabbit contains 55 sentences.


In [87]:
# Count the sentences that contain at least one named entity.
# A generator expression is used instead of a `for sentence in ...` loop so
# that no loop variable is bound at notebook level; in particular the
# `sentence` list of all sentences is left untouched and cells that index or
# measure it still work when re-run after this one.
sentences_with_entities = sum(
    1 for sent in doc.sents if len(sent.ents) > 0
)

print(f"Number of sentences containing named entities: {sentences_with_entities}")

Number of sentences containing named entities: 35


In [103]:
# Rebuild the list of sentence spans and have displaCy highlight the named
# entities of the first sentence inline in the notebook.
list_of_sents = [sent for sent in doc.sents]
displacy.render(
    list_of_sents[0],
    style="ent",
    jupyter=True,
)