In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class NLTKBOW(TransformerMixin, BaseEstimator):
    """Stateless transformer turning text documents into word-presence dicts.

    Each document is tokenized with ``nltk.word_tokenize`` and mapped to a
    dictionary ``{token: True}``; a token that occurs several times in the
    same document still yields a single key.

    ``BaseEstimator`` is inherited in addition to ``TransformerMixin`` so the
    object exposes ``get_params``/``set_params`` like any other scikit-learn
    estimator. ``nltk`` is looked up in the module (notebook) namespace, so it
    must be imported there before ``transform`` is called.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        Parameters
        ----------
        X : iterable of documents (ignored).
        y : optional targets (ignored).
        """
        return self

    def transform(self, X):
        """Convert every document in ``X`` into a ``{token: True}`` dict.

        Parameters
        ----------
        X : iterable of str
            Documents to tokenize.

        Returns
        -------
        list of dict
            One dictionary per document, in input order.
        """
        return [{word: True for word in nltk.word_tokenize(document)} for document in X]
# Returns a list in which each element is the dict for one message: {word: True}.
from sklearn.feature_extraction import DictVectorizer
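# DictVectorizer takes a list of dicts and converts it into a matrix. The features (columns) of the matrix are the keys found across all the dicts, and each feature value indicates whether that key occurs in the document.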
from sklearn.naive_bayes import BernoulliNB

In [9]:
import json
import os

import nltk
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Both input files live in the same folder under the user's home directory.
data_folder = os.path.join(os.path.expanduser("~"), "ANACONDA DM", "chapter6")
input_filename = os.path.join(data_folder, "python_tweets.json")
labels_filename = os.path.join(data_folder, "python_classes.json")

# The tweets file holds one JSON object per line; blank lines are skipped and
# only the 'text' field of each object is kept.
# The encoding is given explicitly so that reading does not depend on the
# platform's default locale encoding.
tweets = []
with open(input_filename, encoding="utf-8") as inf:
    for line in inf:
        if len(line.strip()) == 0:
            continue
        tweets.append(json.loads(line)['text'])

# The labels file is a single JSON document.
with open(labels_filename, encoding="utf-8") as inf:
    labels = json.load(inf)

# Fail early with a readable message if the two files are out of sync.
if len(tweets) != len(labels):
    raise ValueError(
        "Found {} tweets but {} labels; the two files must describe the same samples".format(
            len(tweets), len(labels)))

# Tokenize -> vectorize the token dicts -> Bernoulli naive Bayes.
pipeline = Pipeline([('bag-of-words', NLTKBOW()), ('vectorizer', DictVectorizer()), ('naive-bayes', BernoulliNB())])

# Cross-validated F1 score, averaged over the folds.
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))

Score: 0.836


In [25]:
# List the features with the highest probability in row 1 of the fitted
# naive Bayes model.

# Train the complete pipeline on all tweets.
model = pipeline.fit(tweets, labels)

# Retrieve the fitted steps by the names given when the pipeline was built.
nb = model.named_steps['naive-bayes']
dv = model.named_steps['vectorizer']

# Per-feature log probabilities; only row index 1 is inspected below.
feature_probabilities = nb.feature_log_prob_
class1_log_probs = feature_probabilities[1]

# Sorting the negated values ranks indices from highest to lowest log
# probability; keep the first 50. This yields feature indices only, not the
# actual feature names.
top_features = np.argsort(-class1_log_probs)[:50]

# Map each feature index back to its name and print rank, name and the
# probability (exp of the log probability).
for i, feature_index in enumerate(top_features):
    print(i, dv.feature_names_[feature_index], np.exp(class1_log_probs[feature_index]))

0 : 0.950495049505
1 https 0.663366336634
2 @ 0.584158415842
3 Python 0.564356435644
4 RT 0.524752475248
5 # 0.455445544554
6 python 0.277227722772
7 , 0.267326732673
8 with 0.217821782178
9 and 0.188118811881
10 ) 0.178217821782
11 ( 0.178217821782
12 . 0.178217821782
13 for 0.178217821782
14 a 0.168316831683
15 to 0.138613861386
16 - 0.128712871287
17 ☞ 0.128712871287
18 of 0.128712871287
19 in 0.118811881188
20 on 0.108910891089
21 Science 0.0891089108911
22 you 0.0891089108911
23 your 0.0891089108911
24 Data 0.0891089108911
25 ? 0.0792079207921
26 ; 0.0693069306931
27 Create 0.0693069306931
28 Developer 0.0693069306931
29 .mkdir 0.0693069306931
30 pathlib.Path.mkdir 0.0693069306931
31 pathlib.Path 0.0693069306931
32 pathlib 0.0693069306931
33 htt… 0.0693069306931
34 directories 0.0693069306931
35 PythonWeekly 0.0693069306931
36 import 0.0693069306931
37 “ 0.0693069306931
38 new 0.0693069306931
39 I 0.0693069306931
40 amp 0.0693069306931
41 method 0.0693069306931
42 the 0.0693069306

In [24]:
# Display every attribute stored on the fitted naive Bayes step.
vars(nb)

{'alpha': 1.0,
 'binarize': 0.0,
 'class_count_': array([ 51.,  99.]),
 'class_log_prior_': array([-1.07880966, -0.41551544]),
 'class_prior': None,
 'classes_': array([0, 1]),
 'feature_count_': array([[  7.,  13.,   4., ...,   2.,   2.,   0.],
        [  1.,  45.,   6., ...,   0.,   0.,   2.]]),
 'feature_log_prob_': array([[-1.89085037, -1.33123458, -2.360854  , ..., -2.87167962,
         -2.87167962, -3.97029191],
        [-3.92197334, -0.78647912, -2.66921037, ..., -4.61512052,
         -4.61512052, -3.51650823]]),
 'fit_prior': True}

In [28]:
# Prefer the standalone joblib package: the copy vendored as
# sklearn.externals.joblib was deprecated and later removed from scikit-learn.
# The fallback keeps the cell working on old installations that only ship
# the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline next to the input data.
# NOTE: the pipeline contains an NLTKBOW instance, so that class must be
# defined/importable in whatever process later loads this file.
output_filename = os.path.join(os.path.expanduser("~"), "ANACONDA DM", "chapter6", "python_context.pkl")
joblib.dump(model, output_filename)

['C:\\Users\\Administrator\\ANACONDA DM\\chapter6\\python_context.pkl']