定义一个文本预处理的函数，包含以下5个步骤：
1. Loading text（读取文本文件）
2. Tokenization（分词）
3. Text normalization and lemmatization（词形归一化）
4. Removing stopwords（删除停用词）
5. Output token sequence（输出文本文件）

In [1]:
import nltk, jieba # Load packages

In [2]:
def text_preprocessing(inputpath, outputpath):
    # Loading text
    with open(inputpath, 'r') as f:
        doc = ''.join([line for line in f]).replace('\n', ' ')
    # Tokenization
    tokens = nltk.tokenize.word_tokenize(doc)
    # Text normalization and lemmatization
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token).lower() for token in tokens]
    # Removing stopwords
    stopwords_english = nltk.corpus.stopwords.words('english')
    with open('stop_words') as f:
        stopwords_chinese = [line.strip() for line in f]
    stopwords = stopwords_chinese + stopwords_english + ['``', "''"]
    tokens = [token for token in tokens if token not in stopwords]
    # Output token sequence
    with open(outputpath, 'w') as f:
        for token in tokens:
            f.write(token + '\n')
    return tokens

In [3]:
text_preprocessing('corpus/Baidu sues Apple.txt', 'Tokens of Baidu sues Apple.txt')

['baidu',
 'ha',
 'sued',
 'apple',
 'app',
 'maker',
 "'s",
 'fake',
 'copy',
 'chinese',
 'web',
 'giant',
 "'s",
 'ernie',
 'ai',
 'chatbot',
 'appeared',
 'igiant',
 "'s",
 'app',
 'store',
 'unaware',
 'ernie',
 'baidu',
 "'s",
 'answer',
 'west',
 "'s",
 'chatgpt',
 "'s",
 'limited',
 'preview',
 'test',
 'account',
 'baidu',
 'official',
 'release',
 'chatbot',
 'baidu',
 'told',
 'register',
 'monday',
 'filed',
 'lawsuit',
 'alleged',
 'maker',
 'ernie-powered',
 'apps',
 'haidian',
 'district',
 'people',
 "'s",
 'court',
 'beijing',
 'apple',
 "'re",
 'informed',
 'allowed',
 'software',
 'shelf',
 'official',
 'apps',
 'ernie',
 'bot',
 'ernie',
 'bot',
 'app',
 'found',
 'app',
 'store',
 'application',
 'store',
 'fake',
 'baidu',
 "'s",
 'spokespeople',
 'told',
 'el',
 'reg',
 'late',
 'march',
 'baidu',
 'weibo',
 'complain',
 'website',
 'internet',
 'community',
 'selling',
 'access',
 'test',
 'version',
 'ernie',
 'profit',
 'search',
 'giant',
 'affect',
 'normal'

In [4]:
text_preprocessing('corpus/Canadian super pigs.txt', 'Tokens of Canadian super pigs.txt')

['invasive',
 'specie',
 'crossbred',
 'super',
 'pig',
 'canada',
 'eventually',
 'issue',
 'united',
 'states',
 'expert',
 'continuously',
 'sounding',
 'alarm',
 'monitoring',
 'situation',
 'wild',
 'pig',
 'plagued',
 'southern',
 'state',
 'u.s.',
 'decade',
 'smithsonian',
 'magazine',
 'animal',
 'destroying',
 'crop',
 'killing',
 'native',
 'specie',
 'walking',
 'petri',
 'dish',
 'disease',
 'spread',
 'human',
 'pig',
 'reportedly',
 'stick',
 'warmer',
 'climate',
 'florida',
 'texas',
 'managed',
 'estimated',
 '2.1',
 'billion',
 'damage',
 'annually',
 'texas',
 'parks',
 'wildlife',
 'expert',
 'breed',
 'wild',
 'cross-bred',
 'super',
 'pig',
 'threaten',
 'northern',
 'u.s.',
 'state',
 'woe',
 'super',
 'pig',
 'reportedly',
 'crossbreed',
 'domesticated',
 'wild',
 'swine',
 'larger',
 'hairier',
 'smarter',
 'boast',
 'quick',
 'fruitful',
 'reproductive',
 'quality',
 'alongside',
 'lack',
 'natural',
 'predator',
 'february',
 'brook',
 'told',
 'guardian',
 

In [5]:
text_preprocessing('corpus/Myanmar military air attacks.txt', 'Tokens of Myanmar military air attacks.txt')

['myanmar',
 'military',
 'ha',
 'launched',
 'air',
 'attack',
 'central',
 'town',
 'bastion',
 'opposition',
 'coup',
 'carried',
 'year',
 'ago',
 'witnesses',
 'local',
 'medium',
 'dozen',
 'people',
 'killed',
 'wounded',
 'attack',
 'tuesday',
 'worst',
 'military',
 'seized',
 'control',
 'country',
 'citing',
 'resident',
 'sagaing',
 'area',
 '110km',
 '45',
 'mile',
 'west',
 'main',
 'city',
 'yangon',
 'news',
 'report',
 '50',
 'people',
 'including',
 'child',
 'died',
 'barrage',
 'town',
 'pazigyi',
 'air',
 'raid',
 'occurred',
 'resident',
 'gathered',
 'inauguration',
 'administrative',
 'office',
 'al',
 'jazeera',
 'tony',
 'cheng',
 'reported',
 'thailand',
 'capital',
 'bangkok',
 'response',
 'myanmar',
 'military',
 'ruler',
 'wa',
 'responder',
 'killed',
 'attack',
 'rescue',
 'work',
 'wa',
 'carried',
 'cheng',
 'reported',
 'sagaing',
 'region',
 'second-largest',
 'city',
 'mandalay',
 'ha',
 'put',
 'fiercest',
 'resistance',
 'military',
 'rule',
 'in