## Data loading

In [17]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer

categories = None
remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

print('data loaded')

# order of labels in `target_names` can be different from `categories`
target_names = data_train.target_names


def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print()


Loading 20 newsgroups dataset for categories:
all
data loaded
11314 documents - 22.055MB (training set)
7532 documents - 13.801MB (test set)



### Feature extraction

In [20]:
from time import time

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()

vectorizer = HashingVectorizer(
    stop_words='english', non_negative=True,
    n_features=2**16
)
X_train = vectorizer.transform(data_train.data)

duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()


Extracting features from the training data using a sparse vectorizer
done in 2.946253s at 7.486MB/s
n_samples: 11314, n_features: 65536

Extracting features from the test data using the same vectorizer
done in 1.917940s at 7.196MB/s
n_samples: 7532, n_features: 65536



### Feature selection


In [27]:
from sklearn.feature_selection import SelectKBest, chi2

feature_names = None

# print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k=opts.select_chi2)
# X_train = ch2.fit_transform(X_train, y_train)
# X_test = ch2.transform(X_test)
    
# print("done in %fs" % (time() - t0))
# print()

### Benchmarks


In [28]:
from sklearn import metrics
from sklearn.utils.extmath import density

def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


In [29]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
  ('classification', LinearSVC())
])))

# make some plots

indices = np.arange(len(results))

results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='navy')
plt.barh(indices + .3, training_time, .2, label="training time",
         color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.show()

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='lsqr',
        tol=0.01)




train time: 6.754s
test time:  0.029s
accuracy:   0.837
dimensionality: 65536
density: 0.859970

classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.77      0.76      0.77       319
           comp.graphics       0.73      0.78      0.76       389
 comp.os.ms-windows.misc       0.73      0.74      0.74       394
comp.sys.ibm.pc.hardware       0.71      0.76      0.73       392
   comp.sys.mac.hardware       0.81      0.84      0.82       385
          comp.windows.x       0.87      0.73      0.80       395
            misc.forsale       0.83      0.89      0.86       390
               rec.autos       0.91      0.90      0.91       396
         rec.motorcycles       0.96      0.94      0.95       398
      rec.sport.baseball       0.88      0.93      0.91       397
        rec.sport.hockey       0.93      0.96      0.95       399
               sci.crypt       0.94      0.93      0.94       396
         sci.electron



train time: 3.943s
test time:  0.016s
accuracy:   0.834
dimensionality: 65536
density: 0.859970

classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.77      0.78       319
           comp.graphics       0.74      0.79      0.76       389
 comp.os.ms-windows.misc       0.74      0.72      0.73       394
comp.sys.ibm.pc.hardware       0.70      0.73      0.71       392
   comp.sys.mac.hardware       0.80      0.83      0.81       385
          comp.windows.x       0.86      0.74      0.80       395
            misc.forsale       0.80      0.89      0.84       390
               rec.autos       0.90      0.88      0.89       396
         rec.motorcycles       0.95      0.94      0.94       398
      rec.sport.baseball       0.89      0.92      0.90       397
        rec.sport.hockey       0.93      0.97      0.95       399
               sci.crypt       0.94      0.93      0.93       396
         sci.electron



train time: 11.551s
test time:  0.016s
accuracy:   0.799
dimensionality: 65536
density: 0.005753

classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.75      0.71      0.73       319
           comp.graphics       0.72      0.75      0.73       389
 comp.os.ms-windows.misc       0.72      0.71      0.71       394
comp.sys.ibm.pc.hardware       0.66      0.71      0.68       392
   comp.sys.mac.hardware       0.77      0.80      0.79       385
          comp.windows.x       0.84      0.71      0.77       395
            misc.forsale       0.80      0.86      0.82       390
               rec.autos       0.84      0.86      0.85       396
         rec.motorcycles       0.91      0.92      0.91       398
      rec.sport.baseball       0.86      0.89      0.87       397
        rec.sport.hockey       0.93      0.95      0.94       399
               sci.crypt       0.91      0.91      0.91       396
         sci.electro



train time: 12.852s
test time:  0.040s
accuracy:   0.808
classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.74      0.74      0.74       319
           comp.graphics       0.70      0.75      0.72       389
 comp.os.ms-windows.misc       0.74      0.72      0.73       394
comp.sys.ibm.pc.hardware       0.70      0.70      0.70       392
   comp.sys.mac.hardware       0.77      0.79      0.78       385
          comp.windows.x       0.83      0.72      0.77       395
            misc.forsale       0.78      0.88      0.83       390
               rec.autos       0.87      0.86      0.86       396
         rec.motorcycles       0.92      0.92      0.92       398
      rec.sport.baseball       0.88      0.89      0.88       397
        rec.sport.hockey       0.92      0.95      0.94       399
               sci.crypt       0.92      0.91      0.91       396
         sci.electronics       0.70      0.71      0.70      



NameError: name 'np' is not defined