In [17]:
import nltk

In [61]:
# tokensize http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
# stem http://www.nltk.org/api/nltk.stem.html#module-nltk.stem
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# corpus 数据集
from nltk.corpus import stopwords
# metrics评分
from nltk.metrics import edit_distance
# tag
from nltk.tag import pos_tag

In [58]:
s = '''Good muffins cost $3.88\nin New York!  Please buy me
two of them.\n\nThanks.'''
# 分词
print(word_tokenize(s))     # 分词
print(regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+'))# 基于正则的分词
print(wordpunct_tokenize(s))# 基于正则的分词 利用空格和标点
print(blankline_tokenize(s)) # 基于正则的分词 Uses '\s*\n\s*\n\s*'
#from nltk.tokenize import BlanklineTokenizer
#BlanklineTokenizer().tokenize(s)
#from nltk.tokenize import RegexpTokenizer
#tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
#tokenizer.tokenize(s)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '!', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '!', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '!', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good muffins cost $3.88\nin New York!  Please buy me\ntwo of them.', 'Thanks.']


In [29]:
# 断句
all_sent = sent_tokenize(s)
print(all_sent)
[word_tokenize(t) for t in sent_tokenize(s)]

['Good muffins cost $3.88\nin New York!', 'Please buy me\ntwo of them.', 'Thanks.']


[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '!'],
 ['Please', 'buy', 'me', 'two', 'of', 'them', '.'],
 ['Thanks', '.']]

In [20]:
# NLTK tokenizers can produce token-spans, 
# represented as tuples of integers having the same semantics as string slices, 
# to support efficient comparison of tokenizers.
from nltk.tokenize import WhitespaceTokenizer
list(WhitespaceTokenizer().span_tokenize(s))

[(0, 4),
 (5, 12),
 (13, 17),
 (18, 23),
 (24, 26),
 (27, 30),
 (31, 36),
 (38, 44),
 (45, 48),
 (49, 51),
 (52, 55),
 (56, 58),
 (59, 64),
 (66, 73)]

In [45]:
# 词干提取(stemming)
pst = PorterStemmer()   #波特干扰算法
lst = LancasterStemmer()#基于兰开斯特（Paice / Husk）干扰算法的词干
print(lst.stem("eating"))
print(lst.stem("ate"))
print(pst.stem("shopping"))
print(pst.stem("ate"))
# 词形还原(lemmatization)，词根(lemma)
wlem = WordNetLemmatizer()
print(wlem.lemmatize("ate"))

eat
at
shop
ate


'ate'

In [44]:
# 词形还原(lemmatization)，词根(lemma)
wlem = WordNetLemmatizer()
wlem.lemmatize("ate")

'ate'

In [48]:
# 停用词移除(Stop word removal)
stoplist = stopwords.words('english')
text = "This is just a test"
cleanwordlist = [word for word in text.split() if word not in stoplist]
print(cleanwordlist)

['This', 'test']


In [49]:
# 罕见词移除
'''
import nltk
token = text.split()
freq_dist = nltk.FreqDist(token)
rarewords = freq_dist.keys()[-50:]
after_rare_words = [word for word in token not in rarewords]
print(after_rare_words)
'''

'\nimport nltk\ntoken = text.split()\nfreq_dist = nltk.FreqDist(token)\nrarewords = freq_dist.keys()[-50:]\nafter_rare_words = [word for word in token not in rarewords]\nprint(after_rare_words)\n'

In [50]:
# 评估
from nltk.metrics import edit_distance
print(edit_distance("rain", "shine")) # 3

3


In [62]:
# 词性标注
# 词性(POS)
# PennTreebank
print(nltk.pos_tag(word_tokenize(s)))
print(pos_tag(word_tokenize(s)))

[('Good', 'JJ'), ('muffins', 'NNS'), ('cost', 'VBP'), ('$', '$'), ('3.88', 'CD'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('!', '.'), ('Please', 'NNP'), ('buy', 'VB'), ('me', 'PRP'), ('two', 'CD'), ('of', 'IN'), ('them', 'PRP'), ('.', '.'), ('Thanks', 'NNS'), ('.', '.')]
[('Good', 'JJ'), ('muffins', 'NNS'), ('cost', 'VBP'), ('$', '$'), ('3.88', 'CD'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('!', '.'), ('Please', 'NNP'), ('buy', 'VB'), ('me', 'PRP'), ('two', 'CD'), ('of', 'IN'), ('them', 'PRP'), ('.', '.'), ('Thanks', 'NNS'), ('.', '.')]


In [67]:
# tanford标注器
# 
from nltk.tag.stanford import StanfordPOSTagger
#stan_tagger = StanfordPOSTagger('D:/nltk_data/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger',
#                                'D:/nltk_data/stanford-postagger-full-2017-06-09/stanford-postagger.jar')
#s = "I was watching TV"
#tokens = nltk.word_tokenize(s)
#stan_tagger.tag(tokens)


In [65]:
# 深入了解标注器
from nltk.corpus import brown
import nltk
tags = [tag for (word, tag) in brown.tagged_words(categories = 'news')]
print(nltk.FreqDist(tags))

brown_tagged_sents = brown.tagged_sents(categories = 'news')
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))

<FreqDist with 218 samples and 100554 outcomes>
0.13089484257215028


In [72]:
# 1 N-Gram标注器
from nltk.tag import UnigramTagger# 在训练语料库中找到每个单词的最可能的标签，然后使用该信息为新的标记分配标签。
test_sent = brown.sents(categories='news')[0]
print(test_sent)
unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
for tok, tag in unigram_tagger.tag(test_sent):
    print("(%s, %s), " % (tok, tag))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
(The, AT), 
(Fulton, NP-TL), 
(County, NN-TL), 
(Grand, JJ-TL), 
(Jury, NN-TL), 
(said, VBD), 
(Friday, NR), 
(an, AT), 
(investigation, NN), 
(of, IN), 
(Atlanta's, NP$), 
(recent, JJ), 
(primary, NN), 
(election, NN), 
(produced, VBD), 
(``, ``), 
(no, AT), 
(evidence, NN), 
('', ''), 
(that, CS), 
(any, DTI), 
(irregularities, NNS), 
(took, VBD), 
(place, NN), 
(., .), 


In [73]:
from nltk.tag import DefaultTagger#为每个token分配相同标记的标记器
default_tagger = DefaultTagger('NN')
list(default_tagger.tag('This is a test'.split()))

[('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]

In [74]:
from nltk.tag import BigramTagger  # 根据前一个的标签
from nltk.tag import TrigramTagger # 根据前两个的标签
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data)) # 根据标准评分标签的准确性。
biggram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(biggram_tagger.evaluate(test_data))
trigram_tagger = TrigramTagger(train_data, backoff=biggram_tagger)
print(trigram_tagger.evaluate(test_data))

0.8364397488288647
0.84570915977275
0.84411442240606


In [75]:
# 2 正则表达式标注器
from nltk.tag.sequential import RegexpTagger
regexp_tagger = RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),  # articles
     (r'.*able$', 'JJ'), # adjectives
     (r'.*ness$', 'NN'), # nouns formed from adj
     (r'.*ly$', 'RB'),   # adverbs
     (r'.*s$', 'NNS'),   # plural nouns
     (r'.*ing$', 'VBG'), # gerunds
     (r'.*ed$', 'VBD'),  # past tense verbs
     (r'.*', 'NN')       # nouns (default)
    ])
print(regexp_tagger.evaluate(test_data))

0.31306687929831556


In [76]:
# 3.1.4 Brill 标注器
# 3.1.5 基于机器学习的标注器
# 最大熵分类器(MEC)
# 隐性马尔科夫模型(HMM)
# 条件随机场(CRF)

In [86]:
# 命名实体识别(NER)
# NER标注器
import nltk
from nltk import ne_chunk
tokens = nltk.word_tokenize('I am very excited about the next generation of Apple products.')
tokens = nltk.pos_tag(tokens)
print (tokens)
tree = nltk.ne_chunk(tokens)
print (tree)

[('I', 'PRP'), ('am', 'VBP'), ('very', 'RB'), ('excited', 'JJ'), ('about', 'IN'), ('the', 'DT'), ('next', 'JJ'), ('generation', 'NN'), ('of', 'IN'), ('Apple', 'NNP'), ('products', 'NNS'), ('.', '.')]
(S
  I/PRP
  am/VBP
  very/RB
  excited/JJ
  about/IN
  the/DT
  next/JJ
  generation/NN
  of/IN
  (GPE Apple/NNP)
  products/NNS
  ./.)


In [None]:
# 
#from nltk.tag.stanford import StanfordNERTagger
# https://nlp.stanford.edu/software/stanford-ner-2017-06-09.zip
#st = StanfordNERTagger('D:/nltk_data/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'D:/nltk_data/stanford-ner-2017-06-09/stanford-ner.jar')
#st.tag('Rami Eid is studying at Stony Brook University in NY'.split())