In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [None]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test  = fetch_20newsgroups(subset='test')

In [None]:
newsgroups_train.target_names

In [None]:
newsgroups_train.target.size, newsgroups_test.target.size

In [None]:
print(newsgroups_train.data[3])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test  = vectorizer.transform(newsgroups_test.data)

In [None]:
y_train = newsgroups_train.target
y_test  = newsgroups_test.target

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with default hyper-parameters,
# trained directly on the (non-binarized) TF-IDF features.
clf = LogisticRegression()

In [None]:
# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Evaluate the fitted baseline on the test split.
clf.score(X_test, y_test)

In [None]:
X_train[0]

In [None]:
X_train_0 = X_train[0].toarray()
X_train_0

In [None]:
np.count_nonzero(X_train_0), X_train_0.shape[1]

In [None]:
X_train_0[np.nonzero(X_train_0)]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(np.sort(X_train_0[np.nonzero(X_train_0)]))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer

pipe = Pipeline([('bin', Binarizer()), 
                 ('clf', LogisticRegression())])

In [None]:
from sklearn.model_selection import GridSearchCV

param = {'bin__threshold': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4]}

gs1 = GridSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs1.fit(X_train, y_train)

In [None]:
gs1.best_params_, gs1.best_score_, gs1.best_estimator_

In [None]:
gs1.score(X_test, y_test)

In [None]:
# Thresholds tried by the grid search. cv_results_ stores them as a masked
# array; take the underlying data and convert it to plain floats so that
# matplotlib receives numeric x values.
thresholds = np.asarray(
    np.ma.getdata(gs1.cv_results_['param_bin__threshold']), dtype=float)

# One error-bar curve per score kind (mean +/- std across the CV folds).
# The '*_train_score' entries exist only when the search was run with
# return_train_score=True, so that curve is skipped rather than raising
# KeyError when they are absent.
for score_kind, curve_label in (('train', "training"), ('test', "test(val)")):
    mean_key = 'mean_%s_score' % score_kind
    if mean_key not in gs1.cv_results_:
        continue
    plt.errorbar(thresholds,
                 gs1.cv_results_[mean_key],
                 yerr=gs1.cv_results_['std_%s_score' % score_kind],
                 label=curve_label)

plt.ylim(0, 1.01)
plt.xlabel("threshold")
plt.ylabel("accuracy")
plt.legend(loc="best");

In [None]:
pipe = Pipeline([('bin', Binarizer()), 
                 ('clf', LogisticRegression())])

param = {'bin__threshold': [0.001, 0.01, 0.05],
         'clf__C': 10**np.arange(1.0, 10.0) }

from sklearn.model_selection import RandomizedSearchCV

gs11 = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs11.fit(X_train, y_train)

In [None]:
gs11.best_params_, gs11.best_score_, gs11.best_estimator_

In [None]:
gs11.score(X_test, y_test)

In [None]:
from sklearn.svm import LinearSVC

pipe = Pipeline([('bin', Binarizer()), 
                 ('clf', LinearSVC())])

param = {'bin__threshold': [0.001, 0.01, 0.05],
         'clf__C': 10**np.arange(1.0, 10.0) }

from sklearn.model_selection import RandomizedSearchCV

gs2 = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs2.fit(X_train, y_train)

In [None]:
gs2.best_params_, gs2.best_score_, gs2.best_estimator_

In [None]:
gs2.score(X_test, y_test)

In [None]:
from sklearn.linear_model import SGDClassifier

pipe = Pipeline([('bin', Binarizer()), 
                 ('clf', SGDClassifier(loss="hinge") )])

param = {'bin__threshold': [0.001, 0.01, 0.05],
         'clf__alpha': 10**np.arange(-10.0, -1.0) }

from sklearn.model_selection import RandomizedSearchCV

gs22 = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs22.fit(X_train, y_train)

In [None]:
gs22.best_params_, gs22.best_score_, gs22.best_estimator_

In [None]:
gs22.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

y_pred = gs22.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, digits=4))

In [None]:
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.gray()

plt.imshow(1- conf_mat / conf_mat.sum(axis=1),
           interpolation='nearest')

plt.yticks(range(20), newsgroups_train.target_names);
plt.xticks(range(20), newsgroups_train.target_names, rotation=90);