In [1]:
import nltk
from tools import *
%matplotlib inline

# Chap 3 处理原始文本
1.  如何访问文件内的文本？
2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？
3.  如何产生格式化的输出，并把结果保存在文件中？

## 3.9 格式化：从链表到字符串(P126)

### 3.9.1 从链表转换为字符串

In [2]:
# Turn a list of tokens into one string: the separator string calls join() on the list.
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
for separator in (' ', ';', ''):  # space-separated, semicolon-separated, concatenated
    print(separator.join(silly))

We called him Tortoise because he taught us .
We;called;him;Tortoise;because;he;taught;us;.
WecalledhimTortoisebecausehetaughtus.


### 3.9.2 字符串显示方式（两种）

In [3]:
# Show the same text twice: as a str, and as its UTF-8 encoded bytes.
word = 'cat'
word_bytes = word.encode('utf-8')  # a bytes object prints with a b'...' prefix
print(word)
print(word_bytes)

cat
b'cat'


In [4]:
# print() renders the text itself (the embedded newline becomes a real line break),
# whereas printing the UTF-8 encoded bytes shows the raw content with the newline as \n.
sentence = """hello 
world"""
encoded_sentence = sentence.encode('utf-8')
print(sentence)  # human-readable rendering of the object's content
print(encoded_sentence)  # bytes representation, escape sequences visible

hello 
world
b'hello \nworld'


In [5]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
fdist.tabulate()
# Several ways to format the same "word->count" pair.
# %s (string) and %d (decimal integer) are conversion specifiers.
for word in sorted(fdist.keys()):
    freq = fdist[word]
    print(word, '->', freq, end=':\t')  # plain print() with several arguments
    print('%s->%d' % (word, freq), end='. \n')  # %-style formatting
    print('{}->{}'.format(word, freq), end='; \t')  # str.format() with automatic field numbering
    print('{1}->{0}'.format(freq, word), end=', \t')  # str.format() with explicit field indices

  dog   cat snake 
    4     3     1 
cat -> 3:	cat->3. 
cat->3; 	cat->3, 	dog -> 4:	dog->4. 
dog->4; 	dog->4, 	snake -> 1:	snake->1. 
snake->1; 	snake->1, 	

In [6]:
# Fill one reusable template with each menu item in turn.
template = 'Lee wants a {} right now.'
menu = ['sandwich', 'spam fritter', 'pancake']
for request in map(template.format, menu):
    print(request)

Lee wants a sandwich right now.
Lee wants a spam fritter right now.
Lee wants a pancake right now.


### 3.9.3 排列

In [7]:
# Lay text out in fixed-width columns.
animals = ('dog', 'cat', 'man')
left_aligned = "左边靠齐，6个字符=> |{:6}|{:6}|{:6}|"  # strings are left-aligned by default
right_aligned = "右边靠齐，6个字符=> |{:>6}|{:>6}|{:>6}|"  # '>' requests right alignment
for row_template in (left_aligned, right_aligned):
    print(row_template.format(*animals))

左边靠齐，6个字符=> |dog   |cat   |man   |
右边靠齐，6个字符=> |   dog|   cat|   man|


In [8]:
import math

# Floating-point number with four digits after the decimal point.
pi_text = '{:.4f}'.format(math.pi)
print(pi_text)

3.1416


In [9]:
total = 9375
count = 3205
# Percentage with four digits after the decimal point: the '%' presentation type
# multiplies the ratio by 100 and appends a percent sign.
accuracy = count / total
print('accuracy for {} words: {:.4%}'.format(total, accuracy))

accuracy for 9375 words: 34.1867%


In [10]:
# Ex3-5: frequency of modal verbs across categories of the Brown corpus
def tabulate(cfdist, words, categories):
    """Print a table of counts with one row per category and one column per word.

    Args:
        cfdist: mapping indexed as ``cfdist[category][word]`` that yields the count.
        words: column headings, in display order.
        categories: row labels, in display order.

    The first column is 16 characters wide and every other column is 6 wide;
    each cell, including the last one on a line, is followed by a single space.
    """
    # Header row: the label column followed by the right-aligned words.
    header_cells = ['{:16}'.format('Category')]
    header_cells.extend('{:>6}'.format(word) for word in words)
    print(' '.join(header_cells), end=' \n')
    # One row per category.
    for category in categories:
        row_cells = ['{:16}'.format(category)]
        row_cells.extend('{:6}'.format(cfdist[category][word]) for word in words)
        print(' '.join(row_cells), end=' \n')

In [11]:
from nltk.corpus import brown

# Build (genre, word) pairs for every word in every Brown category and count
# them per genre in a conditional frequency distribution.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

modals = ['can', 'could', 'may', 'might', 'must', 'will']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
# Peek at one condition and one count before printing the whole table.
news_dist = cfd['news']
print("cfd['news']= ", news_dist)
print("cfd['news']['can']= ", news_dist['can'])
tabulate(cfd, modals, genres)

cfd['news']=  <FreqDist with 14394 samples and 100554 outcomes>
cfd['news']['can']=  93
Category            can  could    may  might   must   will 
news                 93     86     66     38     50    389 
religion             82     59     78     12     54     71 
hobbies             268     58    131     22     83    264 
science_fiction      16     49      4     12      8     16 
romance              74    193     11     51     45     43 
humor                16     30      8      8      9     13 


In [12]:
# The field width can itself be supplied through a nested replacement field.
field_width = 15
padded_title = '{:{width}}'.format('Monty Python', width=field_width)
print(padded_title + '!')
# Column ruler: the digits 0-9 written twice, to make the padding easy to count.
digit_ruler = ''.join(map(str, range(10)))
print(digit_ruler * 2)

Monty Python   !
01234567890123456789


### 3.9.4 将结果写入文件(P130)

In [13]:
# Two ways to write to a file: print(text, file=output_file) and output_file.write(text).
# print() appends a newline after each call by default; write() does not.
# Load the vocabulary first so a corpus error cannot leave a truncated file behind.
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
# The with-statement flushes and closes the file even if a write raises, so no
# explicit flush()/close() is needed; the explicit encoding keeps the result
# independent of the platform default.
with open('output.txt', 'w', encoding='utf-8') as output_file:
    for word in sorted(words):
        print(word, file=output_file)
    print(str(len(words)), file=output_file)
    output_file.write('zYx.Tom')  # write() returns the number of characters written
    output_file.write(str(len(words)) + '\n')  # without '\n' the next write would continue on the same line

### 3.9.5 文本换行(Text Wrapping)(P131)

In [14]:
# Without wrapping, a long run of output overflows the end of the line.
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',', 'more', 'is', 'said', 'than', 'done', '.']

for word in saying:
    length_tag = '({})'.format(len(word))  # e.g. '(5)' for 'After'
    print(word, length_tag, end=' ')

After (5) all (3) is (2) said (4) and (3) done (4) , (1) more (4) is (2) said (4) than (4) done (4) . (1) 

In [15]:
# Wrap long output automatically with textwrap.fill().
from textwrap import fill

# Named piece_format rather than `format` so the built-in format() is not
# shadowed for the rest of the kernel session.
piece_format = '%s_(%d)'
pieces = [piece_format % (word, len(word)) for word in saying]
output = ', '.join(pieces)
wrapped = fill(output)  # re-flow the text into lines no longer than fill()'s default width
show_subtitle(piece_format)
print(wrapped)

--------------- >%s_(%d)< ---------------
After_(5), all_(3), is_(2), said_(4), and_(3), done_(4), ,_(1),
more_(4), is_(2), said_(4), than_(4), done_(4), ._(1)


In [16]:
# Same wrapped output, built with str.format() instead of %-formatting.
# The template shown in the subtitle is the one actually used to build the pieces,
# and it is not named `format`, which would shadow the built-in format().
piece_template = '{}_({})'
pieces = [piece_template.format(word, len(word)) for word in saying]
output = ', '.join(pieces)
wrapped = fill(output)  # re-flow the text into lines no longer than fill()'s default width
show_subtitle(piece_template)
print(wrapped)

--------------- >{}_({})< ---------------
After_(5), all_(3), is_(2), said_(4), and_(3), done_(4), ,_(1),
more_(4), is_(2), said_(4), than_(4), done_(4), ._(1)


## 3.10 小结

-   字符串中的字符是使用索引来访问的，索引从零开始计数(`str[0]`)
-   子字符串使用切片符号访问(`str[3:5]`)
-   字符串可以被分割成链表(`str.split()`)；链表还可以连接成字符串(`''.join(list)`)。
-   文本可以从文件中读取，也可以从URL地址中读取。
-   分词是将文本分割成基本单位或者标记，例如：词和标点符号等。基于空格符的分词无法满足应用需要。
-   词形归并是一个过程，将一个词的各种形式映射到这个词的标准形式或者引用形式，也称为词位或者词元。
-   正则表达式是用来指定模式的方法，re.findall() 可以找到一个字符串中匹配一个模式的所有子字符串。
-   在正则表达式字符串前加上前缀`r`，告诉 Python 这是一个原始字符串(raw string)，不要对其中包含的反斜杠进行转义处理。
-   字符串格式化表达式包含格式字符串及转换标识符。