# TextClassification.ipynb   文本分类

In [24]:
import numpy as np
import sklearn.datasets as sd
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.metrics as sm
import sklearn.feature_extraction.text as ft

In [25]:
# Load the text corpus from disk, decoding files as latin1 and shuffling
# with a fixed seed so the document order is reproducible.
data = sd.load_files(
    '../data/20news',
    encoding='latin1',
    shuffle=True,
    random_state=7,
)
# Raw documents and their labels; the later cells work on these two names.
samples = data.data
targets = data.target
# Peek at the first document and its label
# (data.target_names[label] gives the corresponding class name).
print(samples[0])
print(targets[0])

From: gene@theporch.raider.net (Gene Wright)
Subject: NASA Special Publications for Voyager Mission?
Organization: The MacInteresteds of Nashville, Tn.
Lines: 12

I have two books, both NASA Special Publications, on the Voyager 
Missions. One is titled "Voyages to Jupiter" the other "Voyage to Saturn" 
These were excellent books put together after the encounters with each 
planet. 

The question is: Did NASA ever put together a similar book for either the 
Uranus encounter or Neptune? If so, what SP number is it and where can it 
be obtained? If not, why didn't they?

--
  gene@theporch.raider.net (Gene Wright)
theporch.raider.net  615/297-7951 The MacInteresteds of Nashville

4


In [26]:
# Naive Bayes estimators; imported at the top of the cell so the dependency
# is visible before any code that uses it.
import sklearn.naive_bayes as nb

# 1. Build the TF-IDF matrix: raw term counts first, then IDF re-weighting.
# NOTE(review): both transformers are fitted on the whole corpus before the
# train/test split below, so the IDF weights have already seen the held-out
# documents. Fit them on the training documents only if a strictly unbiased
# estimate is needed.
cv = ft.CountVectorizer()
bow = cv.fit_transform(samples)
tt = ft.TfidfTransformer()
# Keep the matrix sparse. Densifying it with .toarray() would need roughly
# 1 GB as float64 for the (2968, 40605) shape shown in the saved output,
# and every call below accepts a scipy sparse matrix directly.
x = tt.fit_transform(bow)
print(x.shape)

# 2. Hold out 10% of the documents as a test set (fixed seed).
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, targets, test_size=0.1, random_state=7)

# 3. Classifier: multinomial naive Bayes.
# Alternatives previously tried in this cell:
#   lm.LogisticRegression()
#   se.RandomForestClassifier(max_depth=50, n_estimators=100,
#                             min_samples_split=10)  # import sklearn.ensemble as se
#   nb.GaussianNB()  # needs dense input, i.e. x.toarray()
model = nb.MultinomialNB()

# 5-fold cross-validation over the full matrix, scored by weighted F1.
score = ms.cross_val_score(model, x, targets, cv=5, scoring='f1_weighted')
print(score.mean())

# Fit on the training split and report per-class metrics on the hold-out set.
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
print(sm.confusion_matrix(test_y, pred_test_y))


(2968, 40605)
0.9446047813386164
             precision    recall  f1-score   support

          0       0.98      0.79      0.88        72
          1       0.93      0.98      0.95        53
          2       0.88      1.00      0.94        45
          3       0.93      1.00      0.96        66
          4       1.00      1.00      1.00        61

avg / total       0.95      0.95      0.94       297

[[57  4  6  5  0]
 [ 1 52  0  0  0]
 [ 0  0 45  0  0]
 [ 0  0  0 66  0]
 [ 0  0  0  0 61]]


In [27]:
# 4. Hand-written sentences for a quick qualitative check of the fitted model.
test_data = [
    'The curveballs of right handed pitchers tend to curve to the left',
    'Caesar cipher is an ancient form of encryption',
    'This two-wheeler is really good on slippery roads',
    "Harley heard it's cool, isn't it cheap?"]
# 5. Apply the model: push the new sentences through the already-fitted
# vectorizer and TF-IDF transformer (transform, not fit_transform) so they
# get the same feature columns as the training samples.
# The demo_* names are deliberately distinct from bow / test_x / pred_test_y,
# which the evaluation cell produced: overwriting those would leave test_x
# (4 rows) out of step with test_y (the hold-out labels) and break any
# re-run of the evaluation lines.
demo_bow = cv.transform(test_data)
demo_x = tt.transform(demo_bow).toarray()
print(demo_x.shape)
demo_pred_y = model.predict(demo_x)
# Array form of the class names so the predicted labels can index it directly.
target_names = np.array(data.target_names)
print(demo_pred_y, target_names[demo_pred_y])

(4, 40605)
[2 3 1 1] ['rec.sport.baseball' 'sci.crypt' 'rec.motorcycles' 'rec.motorcycles']
