In [1]:
# Load the text-classification dataset (20 newsgroups)
from sklearn.datasets import fetch_20newsgroups
import random

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Documents become the inputs, the `target` arrays the labels.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Show the first two training documents together with their labels.
print("sample several datas: ")
print("X_train: ", X_train[:2])
print("Y_train:", y_train[:2])

sample several datas: 
X_train:  ["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n", "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: U

In [2]:
# Extract TF-IDF features from the text
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """Convert raw documents into dense TF-IDF feature matrices.

    The vectorizer is fitted on ``X_train`` only and then reused, unchanged,
    to transform ``X_test``.

    Args:
        X_train: training documents, passed to ``TfidfVectorizer.fit_transform``.
        X_test: test documents, passed to ``TfidfVectorizer.transform``.
        MAX_NB_WORDS: forwarded to the vectorizer as ``max_features``.

    Returns:
        A tuple ``(X_train, X_test)`` holding the dense arrays produced by
        calling ``.toarray()`` on the vectorizer outputs.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    # Read the column count straight from the array: wrapping it in
    # np.array(...) would copy the whole dense matrix just to look up its shape.
    print("tf-idf with", str(X_train.shape[1]), "features")
    return X_train, X_test

# NOTE(review): this rebinds X_train / X_test from the raw documents to the
# matrices returned by TFIDF, so the cell presumably cannot be re-run without
# first re-running the data-loading cell — confirm.
X_train,  X_test = TFIDF(X_train, X_test)

tf-idf with 75000 features


In [None]:
# Reduce the dimensionality of the text features with PCA
from sklearn.decomposition import PCA

# The projection is fitted on the training features only and then applied,
# unchanged, to the test features.
pca = PCA(n_components=2000)
X_train_new = pca.fit_transform(X_train)
X_test_new = pca.transform(X_test)

# np.shape reads the dimensions without duplicating the data; np.array(...)
# would copy each matrix in full just to report its shape.
print("train with old features: ", np.shape(X_train))
print("train with new features:", np.shape(X_train_new))

print("test with old features: ", np.shape(X_test))
print("test with new features:", np.shape(X_test_new))

In [None]:
# Reduce the dimensionality of the data with LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

LDA = LinearDiscriminantAnalysis(n_components=15)
# Fit on the labelled training data. The fitted estimator is deliberately not
# bound to X_train_new, so that name only ever refers to transformed features.
LDA.fit(X_train, y_train)
X_train_new = LDA.transform(X_train)
X_test_new = LDA.transform(X_test)

# np.shape reads the dimensions without duplicating the data; np.array(...)
# would copy each matrix in full just to report its shape.
print("train with old features: ", np.shape(X_train))
print("train with new features:", np.shape(X_train_new))

print("test with old features: ", np.shape(X_test))
print("test with new features:", np.shape(X_test_new))

In [None]:
# Reduce the dimensionality of the data with NMF
from sklearn.decomposition import NMF

NMF_ = NMF(n_components=2000)
# Fit on the training features. The fitted estimator is deliberately not bound
# to X_train_new, so that name only ever refers to transformed features.
NMF_.fit(X_train)
X_train_new = NMF_.transform(X_train)
X_test_new = NMF_.transform(X_test)

# np.shape reads the dimensions without duplicating the data; np.array(...)
# would copy each matrix in full just to report its shape.
print("train with old features: ", np.shape(X_train))
print("train with new features:", np.shape(X_train_new))

print("test with old features: ", np.shape(X_test))
# Report the shape, like the other three lines, instead of printing the
# transformed test matrix itself.
print("test with new features:", np.shape(X_test_new))

In [None]:
# Reduce the dimensionality of the data with a random projection
from sklearn import random_projection

# The projection is fitted on the training features only and then applied,
# unchanged, to the test features.
RandomProjection = random_projection.GaussianRandomProjection(n_components=2000)
X_train_new = RandomProjection.fit_transform(X_train)
X_test_new = RandomProjection.transform(X_test)

# np.shape reads the dimensions without duplicating the data; np.array(...)
# would copy each matrix in full just to report its shape.
print("train with old features: ", np.shape(X_train))
print("train with new features:", np.shape(X_train_new))

print("test with old features: ", np.shape(X_test))
print("test with new features:", np.shape(X_test_new))


In [3]:
# about T-SNE
import numpy as np
from sklearn.manifold import TSNE
# Four toy samples with three features each.
X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# perplexity has to be smaller than the number of samples; the default (30)
# is rejected by recent scikit-learn releases for this 4-row input.
X_embedded = TSNE(n_components=2, perplexity=3).fit_transform(X)
print(X_embedded.shape)

(4, 2)


In [4]:
# Rocchio classification
# NearestCentroid is imported from the public `sklearn.neighbors` namespace;
# the private `sklearn.neighbors.nearest_centroid` module path was deprecated
# and is no longer importable in recent scikit-learn releases.
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Reload the raw documents and labels for both splits.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Token counts -> TF-IDF weighting -> nearest-centroid classifier.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', NearestCentroid()),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.75      0.49      0.60       319
          1       0.44      0.76      0.56       389
          2       0.75      0.68      0.71       394
          3       0.71      0.59      0.65       392
          4       0.81      0.71      0.76       385
          5       0.83      0.66      0.74       395
          6       0.49      0.88      0.63       390
          7       0.86      0.76      0.80       396
          8       0.91      0.86      0.89       398
          9       0.85      0.79      0.82       397
         10       0.95      0.80      0.87       399
         11       0.94      0.66      0.78       396
         12       0.40      0.70      0.51       393
         13       0.84      0.49      0.62       396
         14       0.89      0.72      0.80       394
         15       0.55      0.73      0.63       398
         16       0.68      0.76      0.71       364
         17       0.97      0.70      0.81   

In [5]:
# boosting classification
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> gradient boosting with 100 estimators.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', GradientBoostingClassifier(n_estimators=100)),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

  from numpy.core.umath_tests import inner1d


             precision    recall  f1-score   support

          0       0.81      0.66      0.73       319
          1       0.68      0.70      0.69       389
          2       0.71      0.70      0.70       394
          3       0.65      0.71      0.68       392
          4       0.79      0.79      0.79       385
          5       0.83      0.64      0.72       395
          6       0.81      0.85      0.83       390
          7       0.86      0.74      0.79       396
          8       0.90      0.86      0.88       398
          9       0.91      0.85      0.88       397
         10       0.93      0.86      0.90       399
         11       0.91      0.81      0.86       396
         12       0.33      0.68      0.44       393
         13       0.86      0.71      0.78       396
         14       0.86      0.84      0.85       394
         15       0.85      0.88      0.87       398
         16       0.65      0.79      0.71       364
         17       0.96      0.75      0.84   

In [6]:
# bagging classifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> bagging wrapped around a k-NN classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BaggingClassifier(KNeighborsClassifier())),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.59      0.71      0.64       319
          1       0.59      0.56      0.57       389
          2       0.59      0.57      0.58       394
          3       0.58      0.57      0.58       392
          4       0.60      0.55      0.57       385
          5       0.74      0.63      0.68       395
          6       0.60      0.47      0.53       390
          7       0.77      0.71      0.74       396
          8       0.84      0.82      0.83       398
          9       0.76      0.75      0.76       397
         10       0.82      0.88      0.85       399
         11       0.74      0.84      0.78       396
         12       0.67      0.53      0.59       393
         13       0.76      0.51      0.61       396
         14       0.78      0.79      0.78       394
         15       0.72      0.78      0.75       398
         16       0.71      0.76      0.74       364
         17       0.61      0.79      0.69   

In [7]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> multinomial naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.80      0.52      0.63       319
          1       0.81      0.65      0.72       389
          2       0.82      0.65      0.73       394
          3       0.67      0.78      0.72       392
          4       0.86      0.77      0.81       385
          5       0.89      0.75      0.82       395
          6       0.93      0.69      0.80       390
          7       0.85      0.92      0.88       396
          8       0.94      0.93      0.93       398
          9       0.92      0.90      0.91       397
         10       0.89      0.97      0.93       399
         11       0.59      0.97      0.74       396
         12       0.84      0.60      0.70       393
         13       0.92      0.74      0.82       396
         14       0.84      0.89      0.87       394
         15       0.44      0.98      0.61       398
         16       0.64      0.94      0.76       364
         17       0.93      0.91      0.92   

In [8]:
# K-nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> k-nearest-neighbours classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.43      0.76      0.55       319
          1       0.50      0.61      0.55       389
          2       0.56      0.57      0.57       394
          3       0.53      0.58      0.56       392
          4       0.59      0.56      0.57       385
          5       0.69      0.60      0.64       395
          6       0.58      0.45      0.51       390
          7       0.75      0.69      0.72       396
          8       0.84      0.81      0.82       398
          9       0.77      0.72      0.74       397
         10       0.85      0.84      0.84       399
         11       0.76      0.84      0.80       396
         12       0.70      0.50      0.58       393
         13       0.82      0.49      0.62       396
         14       0.79      0.76      0.78       394
         15       0.75      0.76      0.76       398
         16       0.70      0.73      0.72       364
         17       0.62      0.76      0.69   

In [9]:
# Support Vector Machine (SVM)
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> linear support vector classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.82      0.80      0.81       319
          1       0.76      0.80      0.78       389
          2       0.77      0.73      0.75       394
          3       0.71      0.76      0.74       392
          4       0.84      0.86      0.85       385
          5       0.87      0.76      0.81       395
          6       0.83      0.91      0.87       390
          7       0.92      0.91      0.91       396
          8       0.95      0.95      0.95       398
          9       0.92      0.95      0.93       397
         10       0.96      0.98      0.97       399
         11       0.93      0.94      0.93       396
         12       0.81      0.79      0.80       393
         13       0.90      0.87      0.88       396
         14       0.90      0.93      0.92       394
         15       0.84      0.93      0.88       398
         16       0.75      0.92      0.82       364
         17       0.97      0.89      0.93   

In [10]:
# Decision Tree
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> decision tree classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', tree.DecisionTreeClassifier()),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.49      0.49      0.49       319
          1       0.40      0.41      0.41       389
          2       0.50      0.56      0.53       394
          3       0.46      0.41      0.43       392
          4       0.52      0.57      0.54       385
          5       0.48      0.47      0.48       395
          6       0.68      0.72      0.70       390
          7       0.62      0.58      0.60       396
          8       0.72      0.76      0.74       398
          9       0.52      0.56      0.54       397
         10       0.66      0.66      0.66       399
         11       0.78      0.70      0.74       396
         12       0.34      0.35      0.35       393
         13       0.49      0.42      0.45       396
         14       0.66      0.62      0.64       394
         15       0.70      0.69      0.70       398
         16       0.47      0.61      0.53       364
         17       0.78      0.59      0.67   

In [11]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 'train' split first, then the 'test' split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Token counts -> TF-IDF weighting -> random forest with 100 trees.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=100)),
])

# Fit on the training split, then predict labels for the test split.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.72      0.65      0.68       319
          1       0.56      0.69      0.62       389
          2       0.64      0.78      0.70       394
          3       0.63      0.64      0.64       392
          4       0.77      0.74      0.76       385
          5       0.76      0.66      0.71       395
          6       0.75      0.92      0.83       390
          7       0.80      0.80      0.80       396
          8       0.89      0.90      0.89       398
          9       0.78      0.91      0.84       397
         10       0.91      0.92      0.92       399
         11       0.88      0.92      0.90       396
         12       0.68      0.48      0.57       393
         13       0.84      0.66      0.74       396
         14       0.82      0.90      0.86       394
         15       0.68      0.93      0.78       398
         16       0.68      0.87      0.76       364
         17       0.95      0.82      0.88   