In [1]:
import re
import nltk
from tools import show_subtitle
%matplotlib inline

# Chap 3 处理原始文本
1.  如何访问文件内的文本？
2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？
3.  如何产生格式化的输出，并把结果保存在文件中？

## 3.7. 用正则表达式为文本分词(P118)
分词（Tokenization）：是将字符串切割成可以识别的构成语言数据的语言单元。

### 3.7.1 分词的简单方法

P109 表3-3 正则表达式基本元字符，P120 表3-4 正则表达式符号

In [2]:
# Sample passage for the tokenization examples. It spans three lines; the
# first two end with a space before the newline, which is written out
# explicitly here so the trailing whitespace cannot be lost silently.
raw = (
    "'When I'M a Duchess,' she said to herself, (not in a very hopeful \n"
    "tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very \n"
    "well without--Maybe it's always pepper that makes people hot-tempered,'..."
)

In [3]:
# Tokenize on single space characters only; '\t' and '\n' are not removed,
# so newlines stay glued to the neighbouring token.
print(re.compile(r' ').split(raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', '\ntone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', '\nwell', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [4]:
# Tokenize on runs of spaces, tabs and newlines; punctuation is still
# attached to the words.
print(re.compile(r'[ \t\n]+').split(raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [5]:
# Tokenize with re's built-in '\s' class (any whitespace character);
# punctuation is still not removed. Each single whitespace character is a
# separator here, so a space followed by a newline yields an empty string.
print(re.split(pattern=r'\s', string=raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', '', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', '', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [6]:
# Tokenize on runs of anything other than letters, digits and underscore;
# this also breaks apart words such as "I'M" and "won't", and leaves empty
# strings at both ends.
print(re.compile(r'\W+').split(raw))

['', 'When', 'I', 'M', 'a', 'Duchess', 'she', 'said', 'to', 'herself', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered', '']


In [7]:
# Tokenize by matching instead of splitting: punctuation is kept as tokens
# and no empty strings are produced.
print([match.group() for match in re.finditer(r'\w+|\S\w*', raw)])

["'When", 'I', "'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'I", 'won', "'t", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '-', '-Maybe', 'it', "'s", 'always', 'pepper', 'that', 'makes', 'people', 'hot', '-tempered', ',', "'", '.', '.', '.']


In [8]:
# Refined rule for more accurate tokens: words may contain internal hyphens
# or apostrophes, and runs of '-', '.', '(' are kept together.
print(re.compile(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*").findall(raw))

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


### 3.7.2 NLTK 的正则表达式分词器(P120)

In [9]:
text = 'That U.S.A. poster-print costs $12.40...'
# Verbose-mode tokenizer pattern; the alternatives are tried left to right.
# In the final character class the hyphen is placed last so that it is a
# literal '-'. Written as ':-_' it would be a range from ':' to '_'
# (ASCII 58-95), which also matches '<', '=', '>', '@', 'A'-'Z', '\' and '^'.
pattern = r'''(?x)    # set flag to allow verbose regexps
    (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
  | \w+(?:-\w+)*        # words with optional internal hyphens
  | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
  | \.\.\.            # ellipsis
  | [][.,;"'?():_`-]  # these are separate tokens; includes ], [
'''
nltk.regexp_tokenize(text, pattern)

['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']

In [10]:
# The pattern contains nothing but the verbose flag; the recorded result is
# a list of empty strings.
print(
    "'(?x)'= ",
    nltk.regexp_tokenize(text, '(?x)'),
)

'(?x)'=  ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [11]:
# Compare a capturing group, a repeated capturing group and a repeated
# non-capturing group on the abbreviation "U.S.A.".
# Raw strings keep '\.' as a regex escape; in a plain string literal it is an
# invalid Python escape sequence and triggers a warning on current Pythons.
for abbrev_pattern in (r'([A-Z]\.)', r'([A-Z]\.)+', r'(?:[A-Z]\.)+'):
    print("'{}'= ".format(abbrev_pattern), nltk.regexp_tokenize(text, abbrev_pattern))

'([A-Z]\.)'=  ['U.', 'S.', 'A.']
'([A-Z]\.)+'=  ['A.']
'(?:[A-Z]\.)+'=  ['U.S.A.']


In [12]:
# Explore how '\w', '+', '*' and capturing groups interact. When a pattern has
# a capturing group, the recorded results show the group's text rather than
# the whole match.
# Raw strings are used because '\w' is an invalid escape sequence in a plain
# Python string literal.
word_patterns = (
    r'\w',
    r'\w+',
    r'\w(\w)',      # for every two consecutive word characters, keep the second one
    r'\w+(\w)',     # for every word, keep its last character
    r'\w(-\w)',
    r'\w+(-\w)',
    r'\w(-\w+)',
    r'\w+(-\w+)',
    r'\w(-\w+)*',
    r'\w+(-\w+)*',
)
for word_pattern in word_patterns:
    print("'{}'= ".format(word_pattern), nltk.regexp_tokenize(text, word_pattern))

'\w'=  ['T', 'h', 'a', 't', 'U', 'S', 'A', 'p', 'o', 's', 't', 'e', 'r', 'p', 'r', 'i', 'n', 't', 'c', 'o', 's', 't', 's', '1', '2', '4', '0']
'\w+'=  ['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40']
'\w(\w)'=  ['h', 't', 'o', 't', 'r', 'r', 'n', 'o', 't', '2', '0']
'\w+(\w)'=  ['t', 'r', 't', 's', '2', '0']
'\w(-\w)'=  ['-p']
'\w+(-\w)'=  ['-p']
'\w(-\w+)'=  ['-print']
'\w+(-\w+)'=  ['-print']
'\w(-\w+)*'=  ['', '', '', '', '', '', '', '', '', '', '', '', '-print', '', '', '', '', '', '', '', '', '']
'\w+(-\w+)*'=  ['', '', '', '', '-print', '', '', '']


In [13]:
# Same exploration with non-capturing groups '(?:...)': the whole match is
# returned instead of the group's text.
# Each label is built from the pattern itself, so the printed label always
# matches the pattern that was run (the hand-written labels carried a stray
# '))'). Raw strings avoid the invalid '\w' escape in plain string literals.
non_capturing_patterns = (
    r'\w+(?:)',
    r'\w+(?:)+',
    r'\w+(?:\w)',
    r'\w+(?:\w+)',
    r'\w+(?:\w)*',
    r'\w+(?:\w+)*',
)
for non_capturing_pattern in non_capturing_patterns:
    print("'{}'= ".format(non_capturing_pattern), nltk.regexp_tokenize(text, non_capturing_pattern))

'\w+(?:)'))=  ['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40']
'\w+(?:)+'))=  ['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40']
'\w+(?:\w)'))=  ['That', 'poster', 'print', 'costs', '12', '40']
'\w+(?:\w+)'))=  ['That', 'poster', 'print', 'costs', '12', '40']
'\w+(?:\w)*'))=  ['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40']
'\w+(?:\w+)*'))=  ['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40']


In [14]:
# Match a literal three-dot ellipsis. Raw strings keep '\.' as a regex escape
# instead of an invalid Python string escape.
print(r"'\.\.\.'= ", nltk.regexp_tokenize(text, r'\.\.\.'))

'\.\.\.'=  ['...']


In [15]:
# Ellipsis or dotted abbreviation. The label is generated from the pattern
# that is actually run (non-capturing group), so the two cannot disagree;
# a capturing '([A-Z]\.)+' would return only the last captured letter.
ellipsis_or_abbrev = r'\.\.\.|(?:[A-Z]\.)+'
print("'{}'= ".format(ellipsis_or_abbrev), nltk.regexp_tokenize(text, ellipsis_or_abbrev))

'\.\.\.|([A-Z]\.)+'=  ['U.S.A.', '...']


In [16]:
# Comparison of capturing '(...)' and non-capturing '(?:...)' groups.
# Both patterns match the whole input; only the capturing groups' text shows
# up in the result, so the middle number disappears once its group is
# written as '(?:\d+)'.
# Raw strings are used because '\w' and '\d' are invalid escape sequences in
# plain Python string literals.
inputStr = "hello 123 world 456 nihao 789"
rePatternAllCapturingGroup = r"\w+ (\d+) \w+ (\d+) \w+ (\d+)"
rePatternWithNonCapturingGroup = r"\w+ (\d+) \w+ (?:\d+) \w+ (\d+)"
for group_demo_pattern in (rePatternAllCapturingGroup, rePatternWithNonCapturingGroup):
    show_subtitle(group_demo_pattern)
    print(nltk.regexp_tokenize(inputStr, group_demo_pattern))

--------------- >\w+ (\d+) \w+ (\d+) \w+ (\d+)< ---------------
[('123', '456', '789')]
--------------- >\w+ (\d+) \w+ (?:\d+) \w+ (\d+)< ---------------
[('123', '789')]


### 3.7.3 进一步讨论分词
分词：比预期更为艰巨，没有任何单一的解决方案可以在所有领域都行之有效。

在开发分词器时，访问已经手工标注好的原始文本会很有好处，可以将分词器的输出结果与高品质（也叫「黄金标准」）的标注进行比较。