# TextClassification.ipynb   文本分类

In [21]:
import numpy as np
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.linear_model as lm
import sklearn.metrics as sm
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb

In [22]:
# Load the text corpus from disk. The shuffle is seeded so that the document
# order (and therefore every later split) is reproducible.
data = sd.load_files(
    '../data/20news',
    encoding='latin1',
    shuffle=True,
    random_state=7,
)
# Raw documents and their integer class labels, used by the cells below.
samples = data.data
targets = data.target
# Sanity check: show the first document and its label.
print(samples[0])
print(targets[0])

the long kiss goodnight ( r ) meryl streep tried it and failed . 
even pamela anderson lee made an attempt but fell flat on her well-bared assets . 
however , geena davis could very well become the first bankable american female action star with the long kiss goodnight , a preposterous but incredibly fun action thriller directed by her husband , renny harlin . 
davis plays samantha caine , a mousy suburban school teacher and mother whose memories only go back eight years . 
with the help of ethically questionable private detective mitch hennessey ( samuel l . jackson ) , she slowly remembers--and reclaims--her past as charly baltimore , tough-as-nails cia operative . 
needless to say , with the reappearance of samantha/charly comes the appearance of an assortment of no-goodniks out to erase more than her memory . 
shane black netted a cool $4 mil for his script ; i'm not so sure if his prose is truly deserving of such a hefty price tag , but for what it's worth , it delivers the goods 

In [23]:
# 1. Build the TF-IDF feature matrix from the raw documents.
#    `cv` and `tt` stay in scope because later cells reuse the fitted
#    vocabulary / IDF weights to vectorize new text.
cv = ft.CountVectorizer()
bow = cv.fit_transform(samples)
tt = ft.TfidfTransformer()
# Keep the result sparse: the splitter, cross-validation and MultinomialNB all
# accept sparse input, whereas .toarray() would allocate a dense
# n_documents x vocabulary_size float array for no benefit.
x = tt.fit_transform(bow)
print(x.shape)

# NOTE(review): the vocabulary and IDF weights above are fitted on *all*
# documents, including the ones held out below, so the reported scores are
# slightly optimistic. Wrapping vectorizer + classifier in a Pipeline and
# evaluating on raw text would remove that leakage.

# 2. Hold out 10% of the documents for a final evaluation (seeded split).
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, targets, test_size=0.1, random_state=7)

# 3. Multinomial naive Bayes classifier on the TF-IDF features.
#    (A GaussianNB alternative would need dense input, i.e. x.toarray().)
model = nb.MultinomialNB()

# 5-fold cross-validated weighted F1 over the full feature matrix.
score = ms.cross_val_score(model, x, targets, cv=5, scoring='f1_weighted')
print(score.mean())

# Fit on the training split and report per-class metrics on the held-out split.
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
print(sm.confusion_matrix(test_y, pred_test_y))


(2000, 39659)
0.8115951117712289
             precision    recall  f1-score   support

          0       0.72      0.87      0.79        90
          1       0.87      0.73      0.79       110

avg / total       0.80      0.79      0.79       200

[[78 12]
 [30 80]]


In [15]:
# 4. Ad-hoc sentences to classify with the trained model.
test_data = [
    'The curveballs of right handed pitchers tend to curve to the left',
    'Caesar cipher is an ancient form of encryption',
    'This two-wheeler is really good on slippery roads',
    "Harley heard it's cool, isn't it cheap?"]
# 5. Vectorize the new text with the *already fitted* CountVectorizer and
#    TfidfTransformer (transform only, no refit) so the columns line up with
#    the features the model was trained on.
#    Demo-specific names are used so this cell does not overwrite the
#    held-out `bow` / `test_x` / `pred_test_y` produced by the evaluation cell.
demo_bow = cv.transform(test_data)
demo_x = tt.transform(demo_bow)  # kept sparse; predict() accepts sparse input
print(demo_x.shape)
demo_pred = model.predict(demo_x)
# Map predicted integer labels back to category names via fancy indexing.
target_names = np.array(data.target_names)
print(demo_pred, target_names[demo_pred])

(4, 40605)
[2 3 1 1] ['rec.sport.baseball' 'sci.crypt' 'rec.motorcycles' 'rec.motorcycles']
