In [None]:
import sys

sys.path.insert(0, "../util")
from file_utils import read_text_file

In [2]:
sherlock_holmes_part_of_text = read_text_file("../data/sherlock_holmes_1.txt")

In [3]:
print(sherlock_holmes_part_of_text[:500])  # Print the first 500 characters

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,
were abhorrent to his cold, precise but admirably balanced mind. He
was, I take it, the most perfect reasoning and observing machine that
the world has seen, but as a lover he would have placed himself in a
false position. He


In [None]:
# Break text into sentences using NLTK's pre-trained Punkt tokenizer.
# NOTE(review): assumes the Punkt resource is already available in the local
# NLTK data directory; this cell does not download it.
import nltk.data

# Load the English Punkt model once; `tokenizer` is reused by later cells.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
# List of sentence strings; line breaks inside a sentence are kept as-is.
sentences_nltk = tokenizer.tokenize(sherlock_holmes_part_of_text)

In [5]:
print(sentences_nltk[:2])
print(len(sentences_nltk))

['To Sherlock Holmes she is always _the_ woman.', 'I have seldom heard him\nmention her under any other name.']
11


In [None]:
# Sentence splitting using spaCy
import spacy

# Load the small English pipeline once; `nlp` is reused by the later
# tokenization, POS-tagging and lemmatization cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sherlock_holmes_part_of_text)
# doc.sents yields sentence spans; keep only the text of each span.
sentences_spacy = [sent.text for sent in doc.sents]
# Show the first two sentences and the total count for comparison with NLTK.
print(sentences_spacy[:2])
print(len(sentences_spacy))

['To Sherlock Holmes she is always _the_ woman.', 'I have seldom heard him\nmention her under any other name.']
11


In [None]:
import time


def split_into_sentences_nltk(text):
    """Split ``text`` into sentences with the notebook-level Punkt ``tokenizer``.

    Returns whatever ``tokenizer.tokenize`` produces for ``text``.
    """
    return tokenizer.tokenize(text)


def split_into_sentences_spacy(text):
    """Split ``text`` into sentences with the notebook-level spaCy pipeline ``nlp``.

    Returns a list holding the text of each sentence span, in document order.
    """
    return [span.text for span in nlp(text).sents]


# Time a single run of each splitter on the same text.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations, which matters for sub-millisecond runs like the NLTK one.
start = time.perf_counter()
nltk_sentences = split_into_sentences_nltk(sherlock_holmes_part_of_text)
end = time.perf_counter()
print(f"NLTK sentence splitting took {end - start:.4f} seconds.")

start = time.perf_counter()
spacy_sentences = split_into_sentences_spacy(sherlock_holmes_part_of_text)
end = time.perf_counter()
print(f"spaCy sentence splitting took {end - start:.4f} seconds.")

NLTK sentence splitting took 0.0005 seconds.
spaCy sentence splitting took 0.0898 seconds.


In [8]:
# Word tokenization using NLTK
words_nltk = nltk.tokenize.word_tokenize(sherlock_holmes_part_of_text)
print(words_nltk[:20])
print(f"Number of words based on NLTK: {len(words_nltk)}")

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_the_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name']
Number of words based on NLTK: 230


In [None]:
# NLTK's MWE (Multi-Word Expression) tokenizer
from nltk.tokenize import MWETokenizer

mwe_tokenizer = MWETokenizer([("Sherlock", "Holmes")])
mwe_tokenizer.add_mwe(("any", "other", "name"))
print(mwe_tokenizer.tokenize(words_nltk[:20]))
print(
    f"Number of words based on NLTK MWE Tokenizer: {len(mwe_tokenizer.tokenize(words_nltk))}"
)

['To', 'Sherlock_Holmes', 'she', 'is', 'always', '_the_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any_other_name']
Number of words based on NLTK MWE Tokenizer: 227


In [None]:
# Word tokenization using spaCy (reusing the nlp pipeline loaded earlier).
# The list keeps every token spaCy emits, including punctuation and
# newline tokens such as "\n".
doc = nlp(sherlock_holmes_part_of_text)
words_spacy = [token.text for token in doc]
print(words_spacy[:20])
print(
    f"Number of words based on spaCy: {len(words_spacy)}"
)  # count differs from NLTK's because the two libraries tokenize differently

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_', 'the', '_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', '\n', 'mention', 'her', 'under']
Number of words based on spaCy: 251


In [11]:
# Difference between NLTK and spaCy tokenization.
# These are set differences over unique token strings: each line lists token
# forms produced by one library but never by the other, not how often or
# where they occur in the text.
print(set(words_spacy) - set(words_nltk))  # only in spaCy's output
print(set(words_nltk) - set(words_spacy))  # only in NLTK's output

{'excellent', '—', 'high', 'power', '’s', 'observer', '_', '-', '\n'}
{'_the_', '’', 'high-power', 's', 'observer—excellent'}


In [12]:
# POS tagging with spaCy
def pos_tag_spacy(text, model):
    """Pair every token of ``text`` with its ``pos_`` tag.

    Args:
        text: Raw string to analyse.
        model: Callable pipeline (e.g. a loaded spaCy pipeline) that turns
            ``text`` into a sequence of tokens exposing ``.text`` and ``.pos_``.

    Returns:
        A list of ``(token_text, pos_tag)`` tuples in token order.
    """
    return [(tok.text, tok.pos_) for tok in model(text)]

In [13]:
words_with_pos = pos_tag_spacy(sherlock_holmes_part_of_text, nlp)
print(words_with_pos[:5])

[('To', 'ADP'), ('Sherlock', 'PROPN'), ('Holmes', 'PROPN'), ('she', 'PRON'), ('is', 'AUX')]


In [None]:
# POS tagging with NLTK
def word_tokenize_nltk(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
    tokens = nltk.tokenize.word_tokenize(text)
    return tokens


def pos_tag_nltk(text):
    """Tokenize ``text`` with ``word_tokenize_nltk`` and tag it with ``nltk.pos_tag``.

    Returns the tagger's output for the tokens, i.e. ``(word, tag)`` pairs.
    """
    return nltk.pos_tag(word_tokenize_nltk(text))


words_with_pos_nltk = pos_tag_nltk(sherlock_holmes_part_of_text)
print(words_with_pos_nltk[:5])

[('To', 'TO'), ('Sherlock', 'NNP'), ('Holmes', 'NNP'), ('she', 'PRP'), ('is', 'VBZ')]


In [None]:
nltk.download("tagsets")
nltk.help.upenn_tagset("TO")

TO: "to" as preposition or infinitive marker
    to


[nltk_data] Downloading package tagsets to /mnt/ebs1/yluo/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [None]:
# Experiment with OpenAI's API
from dotenv import load_dotenv

load_dotenv()

from openai import OpenAI

client = OpenAI()

response = client.responses.create(
    model="gpt-5", input="Write a one-sentence joke about NLP."
)
print(response.output_text)

I told my NLP model to “drop by,” and it deleted the preposition.


In [17]:
prompt = """Decide what the part of speech tags are for a sentence.
Preserve original capitalizaion.
Return the list in the format of a python tuple: (word, part of speech tag).
Sentence: In his eyes she eclipses and predominates the whole of her sex."""

In [None]:
# GPT-3.5-Turbo call
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    messages=[
        {
            "role": "system",
            "content": "You are a NLP expert who is also proficient in Python programming.",
        },
        {"role": "user", "content": prompt},
    ],
)
print(response)

ChatCompletion(id='chatcmpl-Cdq4KAEWCcGgA9rtW0lk5cHUxIOlD', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are the part of speech tags for the sentence "In his eyes she eclipses and predominates the whole of her sex":\n\n```python\n[(\'In\', \'IN\'), (\'his\', \'PRP$\'), (\'eyes\', \'NNS\'), (\'she\', \'PRP\'), (\'eclipses\', \'VBZ\'), (\'and\', \'CC\'), (\'predominates\', \'VBZ\'), (\'the\', \'DT\'), (\'whole\', \'JJ\'), (\'of\', \'IN\'), (\'her\', \'PRP$\'), (\'sex\', \'NN\')]\n```\n\nIn the above list, each tuple contains a word from the sentence along with its corresponding part of speech tag.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1763610980, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=139, prompt_tokens=81, total_tokens=220, completion_tokens_details=Comp

In [19]:
print(response.choices[0].message.content)

Here are the part of speech tags for the sentence "In his eyes she eclipses and predominates the whole of her sex":

```python
[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'JJ'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN')]
```

In the above list, each tuple contains a word from the sentence along with its corresponding part of speech tag.


In [None]:
# GPT-4o call: sends the same `prompt` as the GPT-3.5-Turbo cell so the two
# replies can be compared.
response = client.chat.completions.create(
    model="gpt-4o",  # this cell targets gpt-4o; other cells in this notebook call gpt-5 and gpt-5.1
    messages=[
        {
            "role": "system",
            "content": "You are an NLP expert who is also proficient in Python programming.",
        },
        {"role": "user", "content": prompt},
    ],
    max_tokens=256,  # caps the reply length, so a verbose answer can be cut off mid-sentence
)

# Print only the assistant's text rather than the whole response object.
print(response.choices[0].message.content)

To determine the part of speech tags for each word in the given sentence, we can use a standardized tag set, such as the Penn Treebank POS tags. Below is the list:

```python
[
    ("In", "IN"),          # Preposition or subordinating conjunction
    ("his", "PRP$"),       # Possessive pronoun
    ("eyes", "NNS"),       # Noun, plural
    ("she", "PRP"),        # Personal pronoun
    ("eclipses", "VBZ"),   # Verb, third person singular present
    ("and", "CC"),         # Coordinating conjunction
    ("predominates", "VBZ"),  # Verb, third person singular present
    ("the", "DT"),         # Determiner
    ("whole", "JJ"),       # Adjective
    ("of", "IN"),          # Preposition or subordinating conjunction
    ("her", "PRP$"),       # Possessive pronoun
    ("sex", "NN"),         # Noun, singular or mass
    (".", ".")             # Punctuation
]
```

This is a simple interpretation based on common part of speech tags. Different tag


In [None]:
# GPT-5.1 call: same system message and `prompt` as the GPT-4o cell.
response = client.chat.completions.create(
    model="gpt-5.1",
    messages=[
        {
            "role": "system",
            "content": "You are an NLP expert who is also proficient in Python programming.",
        },
        {"role": "user", "content": prompt},
    ],
    max_completion_tokens=256,  # this call passes max_completion_tokens where the earlier cells pass max_tokens
)

print(response.choices[0].message.content)

# response is the full API response object containing metadata and the generated content
# .choices is the list of generated completions; [0] selects the first one
#   (requesting n > 1 completions would return n choices)
# .message is the assistant's reply message
# .content is the reply text, here the (word, POS tag) tuples

("In", "IN"),
("his", "PRP$"),
("eyes", "NNS"),
("she", "PRP"),
("eclipses", "VBZ"),
("and", "CC"),
("predominates", "VBZ"),
("the", "DT"),
("whole", "JJ"),
("of", "IN"),
("her", "PRP$"),
("sex", "NN"),
(".", ".")


In [None]:
# Compare different model outputs
from ast import literal_eval


def pos_tag_gpt(text, client, model="gpt-5.1"):
    """POS-tag a sentence by prompting an OpenAI chat model.

    Args:
        text: Sentence to tag.
        client: OpenAI-style client exposing ``chat.completions.create``.
        model: Name of the chat model to query. Defaults to ``"gpt-5.1"``.

    Returns:
        A list of ``(word, tag)`` tuples parsed from the model's reply.

    Raises:
        ValueError: If the model returns no text at all, or (from
            ``literal_eval``) if the reply contains non-literal content.
        SyntaxError: Propagated from ``literal_eval`` when the reply is not
            valid Python literal syntax.
    """
    prompt = f"""Decide what the part of speech tags are for a sentence.
Preserve original capitalizaion.
Return the list in the format of a python tuple: (word, part of speech tag).
Do not include other explanations.
Sentence: {text}"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an NLP expert who is also proficient in Python programming.",
            },
            {"role": "user", "content": prompt},
        ],
        max_completion_tokens=256,
    )
    raw_reply = response.choices[0].message.content
    if not raw_reply or not raw_reply.strip():
        # Fail with a clear message instead of an opaque error further down.
        raise ValueError("Model returned an empty reply; cannot parse POS tags.")

    cleaned = raw_reply.strip()
    # Chat models sometimes wrap the answer in a Markdown code fence
    # (```python ... ```) despite the instructions; keep only the fenced part.
    fence_parts = cleaned.split("```")
    if len(fence_parts) > 1:
        fenced = fence_parts[1]
        first_line, newline, remainder = fenced.partition("\n")
        # Drop an optional language tag such as "python" on the fence line.
        if newline and (not first_line.strip() or first_line.strip().isalnum()):
            fenced = remainder
        cleaned = fenced

    # Join lines so a bare, multi-line sequence of tuples parses as one expression.
    cleaned = cleaned.replace("\n", " ").strip()
    parsed = literal_eval(cleaned)  # Safely evaluate the literal; never eval()

    # A one-word sentence can come back as a single bare pair ("Hello", "UH");
    # list() would flatten that into ['Hello', 'UH'], so wrap it instead.
    if (
        isinstance(parsed, tuple)
        and len(parsed) == 2
        and all(isinstance(item, str) for item in parsed)
    ):
        return [parsed]
    return list(parsed)

In [23]:
# Time one GPT tagging call. perf_counter() is the monotonic clock intended for
# measuring durations, and the clock is stopped before printing so that output
# rendering is not counted.
first_sentence = "In his eyes she eclipses and predominates the whole of her sex."
start = time.perf_counter()
words_with_pos = pos_tag_gpt(first_sentence, client)
elapsed = time.perf_counter() - start
print(words_with_pos)
print(f"GPT-5.1 POS tagging took {elapsed:.4f} seconds.")

[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'JJ'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.')]
GPT-5.1 POS tagging took 2.3951 seconds.


In [24]:
# Lemmatization

words = ["running", "jumps", "easily", "fair", "fairly", "wolves", "better"]
doc = nlp(" ".join(words))
for token in doc:
    print(f"{token.text}: {token.lemma_}")

running: running
jumps: jump
easily: easily
fair: fair
fairly: fairly
wolves: wolf
better: well


In [25]:
doc = nlp(sherlock_holmes_part_of_text)
for token in doc[:20]:
    print(f"{token.text}: {token.lemma_}")

To: to
Sherlock: Sherlock
Holmes: Holmes
she: she
is: be
always: always
_: _
the: the
_: _
woman: woman
.: .
I: I
have: have
seldom: seldom
heard: hear
him: he

: 

mention: mention
her: she
under: under


In [29]:
# Fetch the lemmatizer component by name. get_pipe() raises a KeyError naming
# the missing component if the pipeline has no "lemmatizer", rather than
# leaving `lemmatizer` as None and failing later with an AttributeError.
lemmatizer = nlp.get_pipe("lemmatizer")

# Ask the lemmatizer whether each of the first ten tokens is already a base form.
for token in doc[:10]:
    print(f"{token} is in its base form: {lemmatizer.is_base_form(token)}")

To is in its base form: False
Sherlock is in its base form: False
Holmes is in its base form: False
she is in its base form: False
is is in its base form: False
always is in its base form: False
_ is in its base form: False
the is in its base form: False
_ is in its base form: False
woman is in its base form: True


In [30]:
# Using NLTK to remove stop words
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/ebs1/yluo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
print(stopwords.words("english")[:10])  # Print first 10 stop words

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


In [2]:
# Re-tokenize the excerpt with NLTK and report the token count.
# Depends on `nltk` and `sherlock_holmes_part_of_text` from earlier cells, so
# in a fresh kernel those cells must be run first.
words_nltk = nltk.tokenize.word_tokenize(sherlock_holmes_part_of_text)
print(len(words_nltk))

NameError: name 'nltk' is not defined

In [1]:
# Build the stop-word set once, rather than calling stopwords.words("english")
# for every token and scanning the resulting list linearly each time.
english_stopwords = set(stopwords.words("english"))
# Compare lower-cased tokens so capitalised stop words ("The", "In") are dropped too.
words_ex_stopwords_nltk = [
    word for word in words_nltk if word.lower() not in english_stopwords
]
print(len(words_ex_stopwords_nltk))

NameError: name 'words_nltk' is not defined