In [1]:
import difflib

import spacy

# Load the spaCy English pipeline "en_core_web_sm". The resulting `nlp`
# callable is applied to the paragraph further down to obtain the tool's
# tokenization for comparison with the hand-made one.
# NOTE(review): the model package must already be installed in the
# environment for this call to succeed — confirm in the setup notes.
nlp = spacy.load("en_core_web_sm")

# Sample paragraph: one sentence per line, joined by newlines and with no
# trailing newline after the last sentence.
text = (
    "The quick brown fox jumps over the lazy dog.\n"
    "It’s a classic sentence used in typing practice.\n"
    "However, many people don’t realize its history."
)

# Banner followed by the untouched paragraph. The spaces around the paragraph
# reproduce the separators print() inserts between positional arguments.
print("=== Q2. Tokenization Assignment ===\n")
print(f"Original Paragraph:\n {text} \n")

# ----------------------------------------------------------------------
# 1. Naïve space-based tokenization
# Splitting on whitespace only: punctuation stays attached to the word it
# follows (e.g. "dog." and "However," remain single tokens).
naive_tokens = text.split()
print("1. Naïve space-based tokens:", f"{naive_tokens} \n", sep="\n")

# Manually corrected tokens, written one pre-tokenized sentence per line
# (tokens separated by single spaces). Punctuation is split off and "It’s"
# becomes "It" + "’s"; "don’t" is deliberately kept as one token here.
manual_tokens = [
    *"The quick brown fox jumps over the lazy dog .".split(),
    *"It ’s a classic sentence used in typing practice .".split(),
    *"However , many people don’t realize its history .".split(),
]
print("1. Manually corrected tokens:")
print(f"{manual_tokens} \n")

# ----------------------------------------------------------------------
# 2. Compare with spaCy tool
doc = nlp(text)
tool_tokens = [token.text for token in doc]
print("2. spaCy tool tokens:")
print(tool_tokens, "\n")


def _token_differences(manual, tool):
    """Return the aligned spans where two token sequences disagree.

    The sequences are aligned with ``difflib.SequenceMatcher`` so that an
    extra or missing token on one side (for example the "\\n" tokens spaCy
    emits between sentences) does not shift every later comparison, which
    is what a lock-step index walk would do.

    Args:
        manual: Sequence of hand-made token strings.
        tool: Sequence of token strings produced by the tool.

    Returns:
        A list of ``(manual_span, tool_span)`` pairs, in document order.
        Each element is the slice of tokens that differs on that side; a
        span is empty when the tokens exist only on the other side.
    """
    matcher = difflib.SequenceMatcher(None, manual, tool, autojunk=False)
    return [
        (manual[m_start:m_end], tool[t_start:t_end])
        for tag, m_start, m_end, t_start, t_end in matcher.get_opcodes()
        if tag != "equal"
    ]


# Show differences clearly: one line per disagreeing span. Spans are printed
# as lists so whitespace-only tokens such as "\n" stay visible.
print("2. Differences (manual vs spaCy):")
for manual_span, tool_span in _token_differences(manual_tokens, tool_tokens):
    print(f"  Manual: {manual_span}   |   spaCy: {tool_span}")
print()

# ----------------------------------------------------------------------
# 3. Multiword Expressions (MWEs)
# Example phrases; the trailing comment names the kind of MWE each one is.
MWEs = [
    "New York City",  # place name
    "kick the bucket",  # idiom
    "high school",  # fixed phrase
]
print("3. Multiword Expressions (MWEs):")
# Build every bullet line first and emit them in a single write; each line
# carries its own newline, so end="" avoids adding an extra one.
print(
    "".join(
        f"- '{phrase}' should be treated as one token (meaning is lost if split).\n"
        for phrase in MWEs
    ),
    end="",
)
print()

# ----------------------------------------------------------------------
# 4. Reflection
# Free-text answer, assembled line by line. The string starts with a blank
# line and ends with a newline, exactly as it is printed.
reflection = (
    "\n"
    "4. Reflection:\n"
    'The hardest part of tokenization in English was handling contractions like "don’t" or "it’s",\n'
    "since they may be split differently depending on the tool. Punctuation also introduces\n"
    "challenges, because naïve space-based methods attach punctuation marks to words. Compared\n"
    "to English, morphologically rich languages (like Turkish or Hindi) are harder, since suffixes\n"
    "and word forms require morphological analysis. Multiword expressions add another difficulty\n"
    "because their meaning disappears when split. Tools like spaCy generally perform well,\n"
    "but sometimes make tokenization choices that differ from manual expectations.\n"
)
print(reflection)

=== Q2. Tokenization Assignment ===

Original Paragraph:
 The quick brown fox jumps over the lazy dog.
It’s a classic sentence used in typing practice.
However, many people don’t realize its history. 

1. Naïve space-based tokens:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.', 'It’s', 'a', 'classic', 'sentence', 'used', 'in', 'typing', 'practice.', 'However,', 'many', 'people', 'don’t', 'realize', 'its', 'history.'] 

1. Manually corrected tokens:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'It', '’s', 'a', 'classic', 'sentence', 'used', 'in', 'typing', 'practice', '.', 'However', ',', 'many', 'people', 'don’t', 'realize', 'its', 'history', '.'] 

2. spaCy tool tokens:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '\n', 'It', '’s', 'a', 'classic', 'sentence', 'used', 'in', 'typing', 'practice', '.', '\n', 'However', ',', 'many', 'people', 'do', 'n’t', 'realize', 'its', 'history', '.'] 

2. Differe