In [1]:
import sys
print(sys.version)

3.11.14 (main, Oct 31 2025, 23:04:14) [Clang 21.1.4 ]


In [2]:
%run -i "../util/file_utils.ipynb"
%run -i "../util/lang_utils.ipynb"

In [3]:
text = "I have five birds"
doc = small_model(text)

In [4]:
for token in doc:
    if (token.pos_ == "NOUN" and
        token.lemma_ != token.text):
        print(f"{token.text}: plural")

birds: plural


In [5]:
print(doc[3].morph.get("Number"))  # Output: ['Plur']

['Plur']


In [6]:
# Using Enum class to define custom labels
from enum import Enum
class Noun_number(Enum):
    SINGULAR = 1
    PLURAL = 2

In [7]:
# Define a function to determine singular or plural nouns
def get_nouns_number(text, model, method="lemma"):
    nouns = []
    doc = model(text)
    for token in doc:
        if (token.pos_ == "NOUN"):
            if method == "lemma":
                if token.lemma_ != token.text:
                    nouns.append((token.text, Noun_number.PLURAL))
                else:
                    nouns.append((token.text, Noun_number.SINGULAR))
            elif method == "morph":
                number = token.morph.get("Number")
                if "Plur" in number:
                    nouns.append((token.text, Noun_number.PLURAL))
                else:
                    nouns.append((token.text, Noun_number.SINGULAR))
    return nouns

In [8]:
text = "Three geese crossed the road, but the two deer stayed."
nouns = get_nouns_number(text, small_model, method="lemma")
print(nouns)
nouns = get_nouns_number(text, small_model, method="morph")
print(nouns)

[('geese', <Noun_number.PLURAL: 2>), ('road', <Noun_number.SINGULAR: 1>), ('deer', <Noun_number.SINGULAR: 1>)]
[('geese', <Noun_number.PLURAL: 2>), ('road', <Noun_number.SINGULAR: 1>), ('deer', <Noun_number.SINGULAR: 1>)]


In [9]:
text = "Three geese crossed the road, but the two deer stayed."
nouns = get_nouns_number(text, large_model, method="lemma")
print(nouns)
nouns = get_nouns_number(text, large_model, method="morph")
print(nouns)

[('geese', <Noun_number.PLURAL: 2>), ('road', <Noun_number.SINGULAR: 1>), ('deer', <Noun_number.SINGULAR: 1>)]
[('geese', <Noun_number.PLURAL: 2>), ('road', <Noun_number.SINGULAR: 1>), ('deer', <Noun_number.SINGULAR: 1>)]


In [10]:
# GPT 5
from openai import OpenAI
client = OpenAI()
prompt = """Decide whether the noun in the sentence is singular or plural.
Return the list in the format of a Python tuple: (word, number), where number is either 'singular' or 'plural'.
Do not provide any explanation.
Sentence: "Three geese crossed the road, but the two deer stayed."""

response = client.chat.completions.create(
    model="gpt-5.1",
    messages=[
        {"role": "system", "content": "You are a NLP expert."},
        {"role": "user", "content": prompt}
    ]
)
print(response.choices[0].message.content)

[('geese', 'plural'), ('road', 'singular'), ('deer', 'plural')]


In [11]:
from textblob import TextBlob
texts = ["book", "goose", "pen", "point", "deer", "child"]
blob_objs = [TextBlob(text) for text in texts]
plurals = [blob.words[0].pluralize() for blob in blob_objs]
print(plurals)

blob_objs = [TextBlob(text) for text in plurals]
singulars = [blob.words[0].singularize() for blob in blob_objs]
print(singulars)

['books', 'geese', 'pens', 'points', 'deer', 'children']
['book', 'goose', 'pen', 'point', 'deer', 'child']


In [12]:
# Dependency parsing
sentence = "I have seldom heard him mention her under any other name."

def print_dependencies(sentence, model):
    doc = model(sentence)
    for token in doc:
        print(f"{token.text:10} {token.dep_:10} {spacy.explain(token.dep_)}")

In [13]:
print_dependencies(sentence, small_model)

I          nsubj      nominal subject
have       aux        auxiliary
seldom     advmod     adverbial modifier
heard      ROOT       root
him        nsubj      nominal subject
mention    ccomp      clausal complement
her        dobj       direct object
under      prep       prepositional modifier
any        det        determiner
other      amod       adjectival modifier
name       pobj       object of preposition
.          punct      punctuation


In [14]:
def print_noun_chunks(text, model):
    doc = model(text)
    for chunk in doc.noun_chunks:
        print(chunk.text)
        
sherlock1 = read_text_file("../data/sherlock_holmes_1.txt")
print_noun_chunks(sherlock1[:50], small_model)

Sherlock Holmes
she
the_ woman
I


In [15]:
def explore_properties(sentence, model):
    doc = model(sentence)
    other_span = "emotions"
    other_doc = model(other_span)
    for noun_chunk in doc.noun_chunks:
        print(f"Noun chunk: {noun_chunk.text}")
        print(f"Noun chuck start and end: {noun_chunk.start}, {noun_chunk.end}")
        print(f"Noun chunk sentence: {noun_chunk.sent}")
        print(f"Noun chunk root text: {noun_chunk.root.text}")
        print(f"Similarity to '{other_span}': {noun_chunk.similarity(other_doc)}")
    print(f"\nSentence similarity to '{other_span}': {doc.similarity(other_doc)}")

In [16]:
sentence = "The study of human emotions is a fascinating field."
explore_properties(sentence, small_model)

Noun chunk: The study
Noun chuck start and end: 0, 2
Noun chunk sentence: The study of human emotions is a fascinating field.
Noun chunk root text: study
Similarity to 'emotions': 0.07203289866447449
Noun chunk: human emotions
Noun chuck start and end: 3, 5
Noun chunk sentence: The study of human emotions is a fascinating field.
Noun chunk root text: emotions
Similarity to 'emotions': 0.28141409158706665
Noun chunk: a fascinating field
Noun chuck start and end: 6, 9
Noun chunk sentence: The study of human emotions is a fascinating field.
Noun chunk root text: field
Similarity to 'emotions': 0.10149102658033371

Sentence similarity to 'emotions': 0.23275980353355408


  print(f"Similarity to '{other_span}': {noun_chunk.similarity(other_doc)}")
  print(f"\nSentence similarity to '{other_span}': {doc.similarity(other_doc)}")


In [17]:
sentence = "The study of human psychology is a fascinating field."
explore_properties(sentence, large_model)

Noun chunk: The study
Noun chuck start and end: 0, 2
Noun chunk sentence: The study of human psychology is a fascinating field.
Noun chunk root text: study
Similarity to 'emotions': 0.29545721411705017
Noun chunk: human psychology
Noun chuck start and end: 3, 5
Noun chunk sentence: The study of human psychology is a fascinating field.
Noun chunk root text: psychology
Similarity to 'emotions': 0.5463806390762329
Noun chunk: a fascinating field
Noun chuck start and end: 6, 9
Noun chunk sentence: The study of human psychology is a fascinating field.
Noun chunk root text: field
Similarity to 'emotions': 0.34547051787376404

Sentence similarity to 'emotions': 0.4687766432762146


In [18]:
def get_subject_phrase(doc):
    for token in doc:
        if ("subj" in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            return doc[start:end]
        
def get_object_phrase(doc):
    for token in doc:
        if ("dobj" in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            return doc[start:end]
        
sentence = "Laura gives Sam a very interesting book by the seashore."
doc = small_model(sentence)
subject_phrase = get_subject_phrase(doc)
object_phrase = get_object_phrase(doc)
print(f"Subject phrase: {subject_phrase}")
print(f"Object phrase: {object_phrase}")

Subject phrase: Laura
Object phrase: a very interesting book by the seashore


In [None]:
def get_dative_phrase(doc):
    for token in doc:
        if ("dative" in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            return doc[start:end]

def get_prepositonal_phrase(doc):
    prep_span = []
    for token in doc:
        if ("pobj" in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            prep_span.append(doc[start:end])
    return prep_span

sentence = "Laura gives Sam a very interesting book by the seashore."
doc = small_model(sentence)
dative_phrase = get_dative_phrase(doc)
prepositional_phrases = get_prepositional_phrase(doc)
print(f"Dative phrase: {dative_phrase}")
print(f"Prepositional phrases: {prepositional_phrases}")

SyntaxError: invalid syntax (1235593585.py, line 9)