文本处理

In [24]:
# Load the training split of the 20 Newsgroups corpus, restricted to four categories.
import os

from sklearn.datasets import load_files

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# Build the path with os.path.join so the notebook also runs on non-Windows systems;
# on Windows this yields the same ".\20newsbydate\20news-bydate-train" as before.
container_path = os.path.join(".", "20newsbydate", "20news-bydate-train")
# Shuffle with a fixed random_state so the document order is the same on every run.
twenty_train = load_files(
    container_path=container_path,
    categories=categories,
    shuffle=True,
    random_state=940302,
)

In [25]:
# Show the category names in the order load_files assigned their integer labels.
label_names = twenty_train.target_names
print('标签类别:%s' % (label_names,))

标签类别:['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [26]:
# Peek at the first training document (the cell output shows it as a bytes literal).
twenty_train.data[0]

b'From: zyeh@caspian.usc.edu (zhenghao yeh)\nSubject: Ellipse Again\nOrganization: University of Southern California, Los Angeles, CA\nLines: 39\nDistribution: world\nNNTP-Posting-Host: caspian.usc.edu\nKeywords: ellipse\n\n\nHi! Everyone,\n\nBecause no one has touched the problem I posted last week, I guess\nmy question was not so clear. Now I\'d like to describe it in detail:\n\nThe offset of an ellipse is the locus of the center of a circle which\nrolls on the ellipse. In other words, the distance between the ellipse\nand its offset is same everywhere.\n\nThis problem comes from the geometric measurement when a probe is used.\nThe tip of the probe is a ball and the computer just outputs the\npositions of the ball\'s center. Is the offset of an ellipse still\nan ellipse? The answer is no! Ironically, DMIS - an American Indutrial\nStandard says it is ellipse. So almost all the software which was\nimplemented on the base of DMIS was wrong. The software was also sold\ninternationaly. Im

In [27]:
# Number of documents in the loaded training split.
len(twenty_train.data)

2257

In [28]:
# Show the first ten integer labels, then each label next to its category name.
first_labels = twenty_train.target[:10]
print(first_labels)
for label in first_labels:
    print(label, twenty_train.target_names[label])
    

[1 1 1 3 2 2 3 2 3 3]
1 comp.graphics
1 comp.graphics
1 comp.graphics
3 soc.religion.christian
2 sci.med
2 sci.med
3 soc.religion.christian
2 sci.med
3 soc.religion.christian
3 soc.religion.christian


#### 词袋模型
需要一个词典，大小为VOC

对于每一个文档，某词出现的总数作为其特征值，这样每一个文档的向量维度为1*VOC

整个数据集全部表示成这样，维度为 n_samples*VOC；若每个元素用4字节（如 float32）存储，则占用 4*n_samples*VOC 字节（例如有10000个文本、词典有100000个词时，约占用4GB）

注意到文档这样的表示向量是非常稀疏的

可以通过只在内存中保存特征向量中非0的部分节省大量内存，scipy.sparse矩阵正是能完成这种操作的数据结构



#### 分词

文本的预处理、分词以及去停用词都被包含在一个高级组件（CountVectorizer）中

该组件可以构建特征字典，并将文档转换成特征向量



In [39]:
from sklearn.feature_extraction.text import CountVectorizer

# A plain CountVectorizer() raised UnicodeDecodeError on this data (the documents
# are raw bytes), so undecodable bytes are ignored instead.
count_vect = CountVectorizer(decode_error='ignore')
# Learn the vocabulary and build the document-term count matrix in one step.
X_train_counts = count_vect.fit_transform(twenty_train.data)
train_shape = X_train_counts.shape
print('训练集维度：{}'.format(train_shape))

训练集维度：(2257, 35787)


In [40]:
count_vect.vocabulary_.get(u'algorithm') # feature (column) index of the term 'algorithm' in the count matrix, not its number of occurrences

4690

In [41]:
# Vocabulary size equals the number of columns of the count matrix.
vocab_size = len(count_vect.vocabulary_)
print('字典大小：%d' % vocab_size)

字典大小：35787


#### 从次数到频率
词频TF

逆文档频率IDF


In [42]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts into TF-IDF features.
tfidf_transformer = TfidfTransformer()
# Fit the IDF weights on the training counts, then apply them to the same matrix.
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(2257, 35787)

#### 训练分类器
**多分类任务**

先来一个朴素贝叶斯分类器（MultinomialNB）

In [48]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the TF-IDF features.
nb_model = MultinomialNB()
# fit() returns the estimator itself; assigning it keeps the cell output empty.
clf = nb_model.fit(X_train_tfidf, twenty_train.target)

<bound method BaseEstimator.get_params of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)>

In [50]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# Reuse the already-fitted vectorizer and TF-IDF transformer: transform only, never fit again.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Print each new document next to the name of its predicted category.
for idx, doc in enumerate(docs_new):
    category = predicted[idx]
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


#### 构建Pipeline

In [52]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> TF-IDF -> naive Bayes so raw documents can be fed in directly.
nb_steps = [
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)
# Last expression of the cell: the fitted pipeline is displayed as the cell output.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [53]:
import os

import numpy as np

# Load the test split with the same categories and shuffle seed as the training split.
# The path is built with os.path.join so the notebook also runs on non-Windows systems;
# on Windows this yields the same ".\20newsbydate\20news-bydate-test" as before.
test_path = os.path.join(".", "20newsbydate", "20news-bydate-test")
twenty_test = load_files(container_path=test_path,
    categories=categories, shuffle=True, random_state=940302)
docs_test = twenty_test.data
# Predict with the naive Bayes pipeline; the cell displays the accuracy
# (fraction of test documents whose predicted label matches the true label).
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.83488681757656458

#### 更换分类器 比较结果

In [54]:
from sklearn.linear_model import SGDClassifier

# Same feature steps as before, but the final classifier is a linear model
# trained with SGD on the hinge loss with an L2 penalty.
sgd_estimator = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
sgd_steps = [
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_estimator),
]
sgd_text_clf = Pipeline(sgd_steps)
sgd_text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy of the SGD pipeline on the test documents (displayed as the cell output).
sgd_predicted = sgd_text_clf.predict(docs_test)
np.mean(sgd_predicted == twenty_test.target)

0.91211717709720375

In [56]:
from sklearn import metrics

# Per-class precision / recall / F1: naive Bayes pipeline first, then the SGD pipeline.
true_labels = twenty_test.target
class_names = twenty_test.target_names
nb_report = metrics.classification_report(true_labels, predicted, target_names=class_names)
sgd_report = metrics.classification_report(true_labels, sgd_predicted, target_names=class_names)
print(nb_report)
print('---------------------------------------------------------------------------------')
print(sgd_report)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502

---------------------------------------------------------------------------------
                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.95      0.89      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [57]:
# Confusion matrices (true labels vs. predictions): naive Bayes first, then SGD.
nb_confusion = metrics.confusion_matrix(twenty_test.target, predicted)
sgd_confusion = metrics.confusion_matrix(twenty_test.target, sgd_predicted)
print(nb_confusion)
print('-------------------------------------------------------')
print(sgd_confusion)

[[192   2   6 119]
 [  2 347   4  36]
 [  2  11 322  61]
 [  2   2   1 393]]
-------------------------------------------------------
[[258  11  14  36]
 [  4 381   2   2]
 [  4  36 353   3]
 [  5  11   4 378]]


In [66]:
from sklearn.neural_network import MLPClassifier

# Same feature steps again, this time ending in a multi-layer perceptron classifier.
mlp_estimator = MLPClassifier(alpha=0.1, random_state=940302)
mlp_steps = [
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', mlp_estimator),
]
mlp_text_clf = Pipeline(mlp_steps)
mlp_text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy of the MLP pipeline on the test documents (displayed as the cell output).
mlp_predicted = mlp_text_clf.predict(docs_test)
np.mean(mlp_predicted == twenty_test.target)

0.91944074567243672

In [67]:
# Classification report followed by the confusion matrix for the MLP pipeline.
mlp_report = metrics.classification_report(
    twenty_test.target,
    mlp_predicted,
    target_names=twenty_test.target_names,
)
mlp_confusion = metrics.confusion_matrix(twenty_test.target, mlp_predicted)
print('------------------------MLP------------------------------')
print(mlp_report)
print('------------------------------------------------------')
print(mlp_confusion)

------------------------MLP------------------------------
                        precision    recall  f1-score   support

           alt.atheism       0.95      0.83      0.89       319
         comp.graphics       0.90      0.96      0.93       389
               sci.med       0.94      0.91      0.93       396
soc.religion.christian       0.90      0.96      0.93       398

           avg / total       0.92      0.92      0.92      1502

------------------------------------------------------
[[264   8  14  33]
 [  4 374   6   5]
 [  5  25 362   4]
 [  4  10   3 381]]


#### GridSearch
参数搜索

In [75]:
from sklearn.model_selection import GridSearchCV

# Grid keys address parameters of individual pipeline steps,
# written as '<step name>' + '__' + '<parameter name>'.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
nb_search = GridSearchCV(text_clf, parameters)
# Search on the first 400 documents only, to get results quickly.
gs_clf = nb_search.fit(twenty_train.data[:400], twenty_train.target[:400])
print(gs_clf.cv_results_)
print('==========================')
print('NB 最佳分数：{}'.format(gs_clf.best_score_))
# Report the winning value of every searched parameter, in alphabetical order.
best_nb_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_nb_params[param_name]))


{'mean_fit_time': array([ 0.11272264,  0.41321683,  0.09808366,  0.37499539,  0.1014092 ,
        0.33843939,  0.09970959,  0.34205898]), 'std_fit_time': array([ 0.03384394,  0.02657769,  0.00783809,  0.00646191,  0.00965604,
        0.02973199,  0.00454739,  0.02653395]), 'mean_score_time': array([ 0.03954943,  0.12366796,  0.03623621,  0.0777936 ,  0.03324254,
        0.07877556,  0.0378983 ,  0.07247313]), 'std_score_time': array([ 0.00589848,  0.04292752,  0.00204914,  0.00373202,  0.00248752,
        0.00666066,  0.00564296,  0.00308289]), 'param_clf__alpha': masked_array(data = [0.01 0.01 0.01 0.01 0.001 0.001 0.001 0.001],
             mask = [False False False False False False False False],
       fill_value = ?)
, 'param_tfidf__use_idf': masked_array(data = [True True False False True True False False],
             mask = [False False False False False False False False],
       fill_value = ?)
, 'param_vect__ngram_range': masked_array(data = [(1, 1) (1, 2) (1, 1) (1, 2) (1,

In [78]:
# Grid for the SGD pipeline: the shared feature options plus the classifier's
# regularisation strength, loss function and penalty type.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
    'clf__loss': ('hinge', 'modified_huber'),
    'clf__penalty': ('l1', 'l2'),
}
sgd_search = GridSearchCV(sgd_text_clf, parameters)
# Unlike the naive Bayes search, this one runs on the full training set.
gs_sgd_clf = sgd_search.fit(twenty_train.data, twenty_train.target)
print(gs_sgd_clf.cv_results_)
print('==========================')
print('SGD 最佳分数：{}'.format(gs_sgd_clf.best_score_))
# Report the winning value of every searched parameter, in alphabetical order.
best_sgd_params = gs_sgd_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_sgd_params[param_name]))

{'mean_fit_time': array([ 0.55257718,  1.97323767,  0.4883062 ,  1.8908004 ,  0.46863023,
        1.79583661,  0.47405513,  1.76170047,  0.51560243,  1.9744308 ,
        0.49653546,  1.9365259 ,  0.48456621,  1.81266443,  0.46165156,
        1.89812128,  0.53225549,  2.00030406,  0.52555895,  1.94912092,
        0.48234979,  1.8696537 ,  0.48069016,  1.79186384,  0.52889562,
        2.02457309,  0.52422865,  1.96006632,  0.48934587,  1.85501297,
        0.48036933,  1.97956546]), 'std_fit_time': array([ 0.04281144,  0.08071552,  0.02614348,  0.07771937,  0.02550959,
        0.07677181,  0.02669706,  0.05397623,  0.02195435,  0.09297362,
        0.02014624,  0.07601339,  0.01280263,  0.06625402,  0.02128854,
        0.08085693,  0.03622335,  0.0917609 ,  0.00746391,  0.09223856,
        0.0057797 ,  0.08221593,  0.01820333,  0.07249273,  0.02032907,
        0.06914565,  0.01968573,  0.08305312,  0.01823528,  0.06434113,
        0.02078503,  0.19599444]), 'mean_score_time': array([ 0.277

In [None]:
# Grid for the MLP pipeline: the shared feature options plus the network's
# L2 penalty (alpha), activation function and solver.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
    'clf__activation': ('relu', 'tanh'),
    'clf__solver': ('sgd', 'adam'),
}
mlp_search = GridSearchCV(mlp_text_clf, parameters)
# Runs on the full training set; this cell has no recorded output in the notebook.
gs_mlp_clf = mlp_search.fit(twenty_train.data, twenty_train.target)
print(gs_mlp_clf.cv_results_)
print('==========================')
print('MLP 最佳分数：{}'.format(gs_mlp_clf.best_score_))
# Report the winning value of every searched parameter, in alphabetical order.
best_mlp_params = gs_mlp_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_mlp_params[param_name]))

