In [1]:
# First let's get the data 
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset = 'train')
dir(texts)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [11]:
# Define  a get mini batch that returns just a little of the data
BATCH_SIZE = 32
def get_batch(data_idx, texts):
    data_x, data_y = [],[]
    for t in range(BATCH_SIZE):
        if data_idx == len(texts.data):
            return [], [], 0
        data_x.append(texts.data[data_idx])
        data_y.append(texts.target[data_idx])
        data_idx += 1
    return data_x, data_y, data_idx
# start with index 0
data_idx = 0

In [29]:
import numpy as np

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB

vectorizer = HashingVectorizer(decode_error = 'ignore', n_features = 2 ** 18,alternate_sign=False)
all_classes = np.array(range(20))



In [6]:
# a try four different classifiers
partial_fit_classifiers = {
    'SGD' :SGDClassifier(),
    'Perception': Perceptron(),
    'NB Multinomial': MultinomialNB(alpha=0.01), #ignores non-occuring features
    'Passive-Aggreasive': PassiveAggressiveClassifier(), # for large-scale learnig
} 

In [15]:
# hold out a number of examples to estimate accuracy
n_test_documents = 3 * BATCH_SIZE
X_test_text = []
y_test = []

for i in range(3):
    _X_test_text, _y_test, data_idx = get_batch(data_idx, texts)
    X_test_text.extend(_X_test_text)
    y_test.extend((_y_test))
    
print(f"X_test_text: {X_test_text}")
print(f"y_test: {y_test}")
print(f"Final data_idx: {data_idx}")

y_test: [15, 6, 5, 10, 8, 4, 16, 17, 9, 8, 10, 17, 10, 11, 8, 15, 10, 4, 12, 0, 5, 4, 0, 18, 17, 2, 3, 0, 19, 19, 9, 19, 5, 13, 14, 10, 5, 11, 6, 6, 1, 13, 13, 18, 17, 2, 9, 15, 2, 8, 9, 2, 19, 14, 5, 14, 15, 9, 3, 18, 1, 18, 10, 13, 19, 18, 15, 9, 14, 4, 4, 0, 12, 18, 14, 14, 17, 13, 15, 5, 4, 18, 12, 9, 6, 1, 4, 6, 8, 0, 17, 3, 1, 18, 16, 18]
Final data_idx: 384


In [17]:
# Convert the test data into features
X_test = vectorizer.transform(X_test_text)
print(f"X_test: {X_test}")
print(f"y_test: {y_test}")
print(f"Final data_idx: {data_idx}")

X_test:   (0, 50)	0.030303030303030304
  (0, 1903)	-0.030303030303030304
  (0, 2695)	0.030303030303030304
  (0, 4412)	-0.06060606060606061
  (0, 5862)	0.030303030303030304
  (0, 6599)	-0.030303030303030304
  (0, 7190)	-0.06060606060606061
  (0, 11438)	0.030303030303030304
  (0, 12125)	0.030303030303030304
  (0, 12136)	-0.030303030303030304
  (0, 13346)	-0.030303030303030304
  (0, 14708)	0.030303030303030304
  (0, 14870)	-0.030303030303030304
  (0, 15860)	0.06060606060606061
  (0, 18899)	-0.030303030303030304
  (0, 20348)	0.030303030303030304
  (0, 22784)	-0.030303030303030304
  (0, 23579)	-0.06060606060606061
  (0, 24734)	-0.42424242424242425
  (0, 28331)	0.06060606060606061
  (0, 29457)	-0.030303030303030304
  (0, 29717)	0.030303030303030304
  (0, 32657)	-0.030303030303030304
  (0, 36057)	-0.030303030303030304
  (0, 36459)	-0.06060606060606061
  :	:
  (95, 194851)	-0.05783149319662402
  (95, 194939)	0.05783149319662402
  (95, 195726)	0.23132597278649608
  (95, 198838)	0.11566298639324

In [18]:
print("Sparse matrix shape:", X_test.shape)
print("Sparse matrix type:", type(X_test))
print("Sample data (dense representation):", X_test[0].todense())


Sparse matrix shape: (96, 262144)
Sparse matrix type: <class 'scipy.sparse._csr.csr_matrix'>
Sample data (dense representation): [[0. 0. 0. ... 0. 0. 0.]]


In [26]:
# Create some structure to store statistics
def progress(cls_name, stats):
    """Report progress information return a string."""
    s = "%20s classifier : \t" % cls_name
    s += "accuracy: %(accuracy).3f " % stats
    return s
cls_stats = {}
for cls_name in partial_fit_classifiers: 
    stats = {'n_train' : 0, 'n_train_pos' : 0,
                'accuracy' : 0.0, 'accuracy_history' : [(0, 0)]}
    cls_stats[cls_name] = stats

    loop_count = 0

In [30]:
# Main loop : iterate on mini-batche of examples
X_train_test, y_train, data_idx = get_batch(data_idx, texts)
while len(X_train_test) > 0:
    loop_count += 1
    X_train = vectorizer.transform(X_train_test)

    for cls_name, cls in partial_fit_classifiers.items():
        # update estimator with examples in the current mini-batch
        cls.partial_fit(X_train, y_train, classes = all_classes)

        # Accumulatae test accuracy states
        cls_stats[cls_name]['n_train'] += X_train.shape[0]
        cls_stats[cls_name]['n_train_pos'] += sum(y_train)
        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)
        acc_history = (cls_stats[cls_name]['accuracy'],
                        cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['accuracy_history'].append(acc_history)

        if loop_count % 30 == 0:
            print(progress(cls_name, cls_stats[cls_name]))
    if loop_count % 30 == 0:
        print('\n')
    X_train_text, y_train, data_idx = get_batch(data_idx, texts)

                 SGD classifier : 	accuracy: 0.115 
          Perception classifier : 	accuracy: 0.021 
      NB Multinomial classifier : 	accuracy: 0.062 
  Passive-Aggreasive classifier : 	accuracy: 0.010 


                 SGD classifier : 	accuracy: 0.021 
          Perception classifier : 	accuracy: 0.062 
      NB Multinomial classifier : 	accuracy: 0.031 
  Passive-Aggreasive classifier : 	accuracy: 0.094 


                 SGD classifier : 	accuracy: 0.104 
          Perception classifier : 	accuracy: 0.042 
      NB Multinomial classifier : 	accuracy: 0.052 
  Passive-Aggreasive classifier : 	accuracy: 0.062 


                 SGD classifier : 	accuracy: 0.031 
          Perception classifier : 	accuracy: 0.094 
      NB Multinomial classifier : 	accuracy: 0.010 
  Passive-Aggreasive classifier : 	accuracy: 0.062 


                 SGD classifier : 	accuracy: 0.052 
          Perception classifier : 	accuracy: 0.042 
      NB Multinomial classifier : 	accuracy: 0.010 
  Pa

KeyboardInterrupt: 

In [32]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [38]:
def plot_accuracy(x, y, x_legend):
    """Plot accuracy as a function of x."""
    x = np.array(x)
    y = np.array(y)
    plt.title('Classification accuracy as a function as a function of %s' % x_legend)
    plt.xlabel('%s' % x_legend)
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.plot(x, y)
    rcParams['legend.fontsize'] = 10
    cls_names = list(sorted(cls_stats.keys()))

    # plot accuracy evolution
    plt.figure()
    for _ , stats in sorted (cls_stats.items()):
        accuracy, n_examples = zip(*stats['accuracy_history'])
        plot_accuracy(n_examples, accuracy, "training examples (#)")
        ax = plt.gca()
        ax.set_ylim((0.8 ,1))
    plt.legend(cls_names, loc='best')