In [1]:
# 分词
import nltk

text = "Hello, world! This is a test sentence for tokenization."
# 先确保所需的 punkt 资源已安装，避免在第一次调用时抛出 LookupError
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        print(f'NLTK resource {resource} not found. Downloading...')
        nltk.download(resource, quiet=True)

print("Word Tokenization:")
word_tokens = nltk.word_tokenize(text)
print(word_tokens)

Word Tokenization:
['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', 'for', 'tokenization', '.']


In [2]:
# 现在进行分词，并捕获任何剩余的 LookupError 以便给出明确提示
try:
    tokens = nltk.word_tokenize(text)
    print('tokens =', tokens)
except LookupError as e:
    print('Failed to tokenize: missing NLTK resource. Please run nltk.download("punkt") or nltk.download("punkt_tab") manually.')
    raise

tokens = ['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', 'for', 'tokenization', '.']


In [4]:
# 中文分词示例（使用 jieba）
import jieba
test1 = "在一个遥远的城市里，清晨的阳光穿过高楼之间的缝隙，洒在尚未醒来的街道上。"
# 假设 test1 已在 notebook 中定义
# test1 = "在一个遥远的城市里，清晨的阳光穿过高楼之间的缝隙，洒在尚未醒来的街道上。"
# 精确模式，返回 list
tokens = jieba.lcut(test1)
print('tokens =', tokens)
print('词数 =', len(tokens))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.375 seconds.
Prefix dict has been built successfully.


tokens = ['在', '一个', '遥远', '的', '城市', '里', '，', '清晨', '的', '阳光', '穿过', '高楼', '之间', '的', '缝隙', '，', '洒', '在', '尚未', '醒来', '的', '街道', '上', '。']
词数 = 24


In [3]:
# 安装缺失的第三方包（在 notebook 中使用魔法命令）
%pip install jieba --quiet

import re
import jieba
import nltk

import random
import string
# 生成随机中英文混合文本的辅助函数（无需额外依赖）
def gen_mixed_text(min_len=20, max_len=80):
    chinese_chars = '你好世界天气人工智能自然语言处理测试示例随机文本中文字符'
    result = []
    length = random.randint(min_len, max_len)
    for _ in range(length):
        if random.random() < 0.5:
            # 选择一个中文字符或短词（简单处理）
            result.append(random.choice(chinese_chars))
        else:
            # 生成一个英文单词或字符片段
            word_len = random.randint(1, 8)
            word = ''.join(random.choice(string.ascii_lowercase) for _ in range(word_len))
            result.append(word)
    # 用空格分隔英文片段，中文字符直接相连，返回字符串
    return ' '.join([r for r in result if all(ord(c) < 128 for c in r)]) + ''.join([r for r in result if not all(ord(c) < 128 for c in r)])

# 生成示例文本并赋值给 s
s = gen_mixed_text()
segments = re.split(r'(\w+)', s)  # 简单将英文字母/数字分段（保留），中文段落留在其它项
tokens = []
for seg in segments:
    if re.fullmatch(r'\w+', seg):  # 英文/数字片段
        tokens.extend(nltk.word_tokenize(seg))
    else:  # 中文或其它字符
        tokens.extend(jieba.lcut(seg))
print(tokens)

Note: you may need to restart the kernel to use updated packages.
['fc', ' ', 'dkyiufku', ' ', 'yakdiqf', ' ', 'mspjdw', ' ', 'crqesn', ' ', 'toxnzc', ' ', 'veb', ' ', 'yllytes', ' ', 'ri', ' ', 'utxh', ' ', 'ikfgc', ' ', 'u', ' ', 'dopk', ' ', 'zmlt', ' ', 'pmvxfvf', ' ', 'cynuw', ' ', 'vaydf', ' ', 'ezi', ' ', 'd', ' ', 'dtmz', ' ', 'ckqg', ' ', 'ntgdjqd', ' ', 'yxhvedpm', ' ', 'yqwdbxmt', ' ', 'ht', ' ', 'jtz', ' ', 'bd', ' ', 'nxekzzty你中自语本能中例测机智天自气人文例天界字试例理文世天界语']


In [5]:
from nltk.tokenize import word_tokenize
example_text = "This is an example sentence for tokenization."
tokens = word_tokenize(example_text)
print(tokens)

['This', 'is', 'an', 'example', 'sentence', 'for', 'tokenization', '.']
