## 词干提取   stemming

In [21]:
from nltk.stem.porter import PorterStemmer
import nltk

In [22]:
porter_stemmer = PorterStemmer()

In [23]:
porter_stemmer.stem('stepping')#词干提取

'step'

In [24]:
porter_stemmer.stem('multiply')

'multipli'

In [25]:
#词性归一 lemmatization

In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
wordnet_lemmatizer = WordNetLemmatizer()

In [28]:
wordnet_lemmatizer.lemmatize('dogs')

'dog'

## 标注词性 pos_tag

In [17]:
wordnet_lemmatizer.lemmatize('are', pos='v')#pos_tag默认是nn

'be'

In [30]:
text = nltk.word_tokenize('what does the fox say')

In [31]:
text

['what', 'does', 'the', 'fox', 'say']

In [34]:
nltk.pos_tag(text)

[('what', 'WDT'),
 ('does', 'VBZ'),
 ('the', 'DT'),
 ('fox', 'NNS'),
 ('say', 'VBP')]

## 简单的ML情感分词

In [37]:
from nltk.classify import NaiveBayesClassifier
# 随⼿造点训练集
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'
def preprocess(s):
# Func: 句⼦处理
# 这⾥简单的⽤了split(), 把句⼦中每个单词分开
# 显然 还有更多的processing method可以⽤
    return {word: True for word in s.lower().split()}
    #字典生成式
    # {'this': True, 'is':True, 'a':True, 'good':True, 'book':True}

# 把训练集给做成标准形式
training_data = [[preprocess(s1), 'pos'],
                 [preprocess(s2), 'pos'],
                 [preprocess(s3), 'neg'],
                 [preprocess(s4), 'neg']
                ]
# 喂给model吃
model = NaiveBayesClassifier.train(training_data)
# 打出结果
print(model.classify(preprocess('this is a good book')))

pos


# 词频统计

In [41]:
from nltk import FreqDist
# 做个词库先
corpus = 'this is my sentence ' \
'this is my life ' \
'this is the day'
# 随便tokenize⼀下
# 显然, 正如上⽂提到,
# 这⾥可以根据需要做任何的preprocessing:
# stopwords, lemma, stemming, etc.
tokens = nltk.word_tokenize(corpus)
print(tokens)
# 得到token好的word list
# ['this', 'is', 'my', 'sentence',
# 'this', 'is', 'my', 'life', 'this',
# 'is', 'the', 'day']

['this', 'is', 'my', 'sentence', 'this', 'is', 'my', 'life', 'this', 'is', 'the', 'day']


In [50]:
# 借⽤NLTK的FreqDist统计⼀下⽂字出现的频率
fdist = FreqDist(tokens)
# 它就类似于⼀个Dict
# 带上某个单词, 可以看到它在整个⽂章中出现的次数
print(fdist['this'])
# 3

3


In [60]:
# 词频最高的10个单词拿出来
standard_freq_vector = fdist.most_common(10)
size = len(standard_freq_vector)
print(size)
print(standard_freq_vector)

7
[('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]


In [61]:
# Func: 按照出现频率⼤⼩, 记录下每⼀个单词的位置
def position_lookup(v):
    res = {}
    counter = 0
    for word in v:
        res[word[0]] = counter
        counter += 1
    return res
    # 把标准的单词位置记录下来

standard_position_dict = position_lookup(standard_freq_vector)
print(standard_position_dict)

{'this': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6}


In [62]:
# 这时, 如果我们有个新句⼦:
sentence = 'this is cool'
# 先新建⼀个跟我们的标准vector同样⼤⼩的向量
freq_vector = [0] * size
# 简单的Preprocessing
tokens = nltk.word_tokenize(sentence)
# 对于这个新句⼦⾥的每⼀个单词
for word in tokens:
    try:
        # 如果在我们的词库⾥出现过
        # 那么就在"标准位置"上+1
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        # 如果是个新词
        # 就pass掉
        continue
print(freq_vector)
# [1, 1, 0, 0, 0, 0, 0]
# 第⼀个位置代表 is, 出现了⼀次
# 第⼆个位置代表 this, 出现了⼀次

[1, 1, 0, 0, 0, 0, 0]


## 统计单词出现的数量

In [None]:
# -*- coding:utf-8 -*-
import io
import re


class Counter:
    def __init__(self, path):
        self.mapping = dict()

        with io.open(path, encoding="utf-8") as f:
            data = f.read()
            words = [s.lower() for s in re.findall("\w+", data)]

            for word in words:
                #若字典中没有键对应的值则返回默认值
                #终于明白如何解决字典的计数增加问题。
                self.mapping[word] = self.mapping.get(word, default=0) + 1

    def most_common(self, n):
        assert n > 0, "n should be large than 0"
        return sorted(self.mapping.items(), key=lambda item: item[1], reverse=True)[:n]


if __name__ == '__main__':
    most_common_5 = Counter("importthis.txt").most_common(5)
    for item in most_common_5:
        print(item)

In [65]:
#上面的代码解决了我一直困惑的字典value计数问题

## TF_IDF,  Term Frequency, Inverse Document Frequency
字词的重要性随着它在文件中出现的次数成正比增加，但同时会随着它在语料库中出现的频率成反比下降。
TF(t) = (t出现在⽂档中的次数) / (⽂档中的term总数).
IDF(t) = log_e(⽂档总数 / 含有t的⽂档总数).

In [70]:
from nltk.text import TextCollection

# 首先，把所有的文档放到TextCollection类中
# 这个类会自动帮你断句，做统计，做计算
corpus = TextCollection(['this is sentence one',
                         'this is sentence two',
                         ' is sentence three'])

# 直接就能算出tfidf
# (term:一句话中的某个term,text:这句话)
print(corpus.tf_idf('this', 'this is sentence four'))

# 对于每个新句子
new_sentence='this is sentence five'
# 遍历一遍所有的vocabulary中的词：
standard_vocab=['this' 'is' 'sentence' 'one' 'two' 'five']
for word in standard_vocab:
    print(corpus.tf_idf(word, new_sentence))

0.01930786229086497
0.0


## word2vec

In [None]:
# 简单的⽂字预处理：
# 1. 去除HTML
# 这⾥⽤到BeautifulSoup这个库，
# 当然，这种简单的事情，也可以⾃⼰做个字符串运算解决
from bs4 import BeautifulSoup
beautiful_text = BeautifulSoup(raw_text).get_text()
#
# 2. 把⾮字⺟的去除掉
# 这⾥可以⽤正则表达式解决
import re
letters_only = re.sub("[^a-zA-Z]", " ", beautiful_text)
#
# 3. 全部⼩写化
words = letters_only.lower().split()
#
# 4. 去除stopwords
# 这⾥⽤到NLTK
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))
meaningful_words = [w for w in words if not w in stops]
# ⾼阶⽂字处理：
# 5. Lemmatization
#
# 这个⽐较复杂，下次NLTK的时候讲
# 6. 搞回成⼀⻓串string
return( " ".join( meaningful_words ))

In [None]:
# tokenizor: 把原来的string训练集，变成 list of lists：
# 这个寒⽼师上堂课应该讲过：
# 简单点的话，可以⽤这个：
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# 达到这样的效果：
>>> print sentences
# 原⽂： ['Hello, how are you', 'im fine, thank you, and you?']
[['hello', 'how'], ['fine', 'thank']]
# 现在进⼊正题， w2v。
# 我们⽤Gensim这个库来做，很⽅便。
from gensim.models import word2vec
# 先设⼀下param
num_features = 1000 # 最多多少个不同的features
min_word_count = 10 # ⼀个word，最少出现多少次 才被计⼊
num_workers = 4 # 多少thread⼀起跑（快⼀点⼉）
size = 256 # vec的size
window = 5 # 前后观察多⻓的“语境”
# 跑起来
model = word2vec.Word2Vec(sentences, size=size, workers=num_workers,\
                          size=num_features, min_count = min_word_count,\
                          window = window)
# 你可以save下来
model.save('LOL.save')
# ⽇后再load回来
model = word2vec.Word2Vec.load('LOL.save')

In [None]:
# 当然 你们也许会看到⾕歌也提供了⾃⼰的News包：
# 要load 其他语⾔train出来的⽂件（⽐如C) 的Bin或者text⽂件
# 那就这样：
model = Word2Vec.load_word2vec_format('google_news.txt', binary=False) # C text format
model = Word2Vec.load_word2vec_format('google_news.bin', binary=True) # C binary format
# ⼏个常⽤的⽤法：
# woman + king - man = queen
>>> model.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.50882536), ...]
# 求两个词的senmatics相似度
>>> model.similarity('woman', 'man')
0.73723527
# 就更dict⼀样使⽤你train好的model
>>> model['computer']
array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)
# 现在 你可以把这个model包装起来。把你所有的sentences token 过⼀遍
def w2vmodel(sentences):
...
return vec

In [None]:
# 这个时候你会发现，我们的vec是针对每个word的。⽽我们的训练集 是sen和label互相对应的，
# ⼯业上，到了这⼀步，有三种解决⽅案：
# 1. 平均化⼀个句⼦⾥所有词的vec。
# sen_vec = [vec, vec, vec, ...] / n
# 2. 排成⼀个⼤matrix (M * N)，等着CNN来搞
# [ vec | vec | vec | vec | ... ]
# 3. ⽤Doc2Vec。这是基于句⼦的vec，跟word2vec差不多思路，⽤起来也差不多。
# 只对⻓篇 ⼤⽂章效果好。对头条新闻， twitter这种的东⻄，就不⾏了。每个“篇”的句⼦太少。
# 具体可以看gensim。
# Anyway, 这⼀步完成后，你会对于每个训练集的X，得到⼀个固定⻓度的vec或者matrix
# 接下来的事情，⼤家就可以融会贯通了。
# ⽐如，可以⽤前⾯冯⽼师讲的RF跑⼀遍 做classification。

## 计算两个字符串之间的距离

In [74]:
import Levenshtein
s1 = 'kitten'
s2 = 'sitting'
 
ratio = Levenshtein.ratio(s1, s2)
dist = Levenshtein.distance(s1, s2)
print('ratio={0}, dist={1}'.format(ratio, dist))

ratio=0.6153846153846154, dist=3


seqratio,计算两个字符串之间的相似度

In [77]:
a = ['1','2','3','4','5']
b = ['2','3','4']
 
Levenshtein.seqratio(a, b)# 0.75

0.75