### 1. Installation Library & Download Resources

In [2]:
# Install required NLP libraries
!pip install nltk textblob spacy
# Download the small English model for spaCy
!python -m spacy download en_core_web_sm

import nltk
# Download required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m115.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### 2. Data Verification

In [3]:
import os
# Print the current working directory
print(os.getcwd())
# List all files in the current directory
print(os.listdir())

/content
['.config', 'alice29.txt', 'sample_data']


### 3. Read text

In [4]:
# Read the input text file (alice29.txt)
with open('alice29.txt', 'r', encoding='utf-8') as f:
    text = f.read()

### 4. Text Cleaning

In [5]:
import re

# Convert all characters to lowercase
text = text.lower()

# Preserve whitespace (\s), remove only non-alphabetic characters
text = re.sub(r'[^a-z\s]', ' ', text)

# Merge multiple spaces / line breaks into a single space
text = re.sub(r'\s+', ' ', text).strip()

# Save cleaned text
with open('cleaned.txt', 'w', encoding='utf-8') as f:
    f.write(text)

### 5. NLTK Tokenization + Top10

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Load English stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Tokenize the cleaned text into individual words
words = word_tokenize(text)
# Remove stopwords from the token list
words = [w for w in words if w not in stop_words]

# Save all remaining tokens to a file (one word per line)
with open('words.txt', 'w') as f:
    for w in words:
        f.write(w + '\n')

# Count word frequencies
freq = Counter(words)
# Extract the top 10 most frequent words
top10 = freq.most_common(10)

# Save the Top-10 words and their frequencies
with open('top10words.txt', 'w') as f:
    for w, c in top10:
        f.write(f"{w}: {c}\n")
# Display the Top-10 words in the notebook output
top10

[('said', 462),
 ('alice', 398),
 ('little', 128),
 ('one', 104),
 ('know', 88),
 ('like', 85),
 ('would', 83),
 ('went', 83),
 ('could', 77),
 ('queen', 75)]

### 6. Performance Comparison

In [7]:
import timeit
import statistics
from textblob import TextBlob
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Number of times each tokenizer is benchmarked
RUNS = 10

# Tokenization using NLTK
def nltk_tok():
    word_tokenize(text)

# Tokenization using TextBlob
def tb_tok():
    TextBlob(text).words

# Tokenization using spaCy
def spacy_tok():
    nlp(text)

# Benchmark function to measure execution time
def bench(fn):
    """
    Measure the mean and standard deviation of execution time
    for a given tokenization function.
    """
    t = timeit.repeat(fn, number=1, repeat=RUNS)
    return statistics.mean(t), statistics.stdev(t)

# Run benchmarks for each framework
results = {
    'NLTK': bench(nltk_tok),
    'TextBlob': bench(tb_tok),
    'spaCy': bench(spacy_tok)
}

# Save performance comparison results to file
with open('time_compares.txt', 'w') as f:
    f.write("Framework\tMean(s)\tStd(s)\n")
    for k, (m, s) in results.items():
        f.write(f"{k}\t{m:.4f}\t{s:.4f}\n")

# Display benchmark results
results


{'NLTK': (0.06819694650000371, 0.005136760425334951),
 'TextBlob': (0.10858047159999984, 0.005069208229174552),
 'spaCy': (4.035808038400002, 0.42080068168133933)}