### 1. Installation Library & Download Resources

In [1]:
!pip install nltk textblob spacy
!python -m spacy download en_core_web_sm

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### 2. Data Verification

In [3]:
import os
print(os.getcwd())
print(os.listdir())

/content
['.config', 'alice29.txt', 'sample_data']


### 3. Read text

In [4]:
with open('alice29.txt', 'r', encoding='utf-8') as f:
    text = f.read()

### 4. Text Cleaning

In [5]:
import re

# 统一小写
text = text.lower()

# ❗ 保留空格（\s），只移除非字母字符
text = re.sub(r'[^a-z\s]', ' ', text)

# 把多个空格 / 换行 合并成一个空格
text = re.sub(r'\s+', ' ', text).strip()

# 保存清洗文本
with open('cleaned.txt', 'w', encoding='utf-8') as f:
    f.write(text)

### 5. NLTK Tokenization + Top10

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

stop_words = set(stopwords.words('english'))

words = word_tokenize(text)
words = [w for w in words if w not in stop_words]

with open('words.txt', 'w') as f:
    for w in words:
        f.write(w + '\n')

freq = Counter(words)
top10 = freq.most_common(10)

with open('top10words.txt', 'w') as f:
    for w, c in top10:
        f.write(f"{w}: {c}\n")

top10

[('said', 462),
 ('alice', 398),
 ('little', 128),
 ('one', 104),
 ('know', 88),
 ('like', 85),
 ('would', 83),
 ('went', 83),
 ('could', 77),
 ('queen', 75)]

### 6. Performance Comparison

In [7]:
import timeit, statistics
from textblob import TextBlob
import spacy

nlp = spacy.load("en_core_web_sm")
RUNS = 10

def nltk_tok():
    word_tokenize(text)

def tb_tok():
    TextBlob(text).words

def spacy_tok():
    nlp(text)

def bench(fn):
    t = timeit.repeat(fn, number=1, repeat=RUNS)
    return statistics.mean(t), statistics.stdev(t)

results = {
    'NLTK': bench(nltk_tok),
    'TextBlob': bench(tb_tok),
    'spaCy': bench(spacy_tok)
}

with open('time_compares.txt', 'w') as f:
    f.write("Framework\tMean(s)\tStd(s)\n")
    for k, (m, s) in results.items():
        f.write(f"{k}\t{m:.4f}\t{s:.4f}\n")

results

{'NLTK': (0.0710668963000046, 0.006025832143783411),
 'TextBlob': (0.11463498740000375, 0.005038292848188525),
 'spaCy': (4.745832831000001, 0.6288663881680195)}