In [1]:
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [2]:
train_X = train.data
test_X = test.data
train_y = train.target
test_y = test.target

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

In [4]:
df_accuracy = pd.DataFrame(columns = ["counts", "tf", "tf-idf"], index = ["MultinomialNB", "LinearSVC", "SGDClassifier"])

# MultinomialNB

## counts

In [5]:
nb_count = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

param_grid = {
    'clf__alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'clf__fit_prior': [True, False]
}
nb_count_clf = GridSearchCV(nb_count, param_grid, cv=5, n_jobs=-1)
nb_count_clf = nb_count_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, nb_count_clf.best_params_[param_name]))
    
predicted = nb_count_clf.predict(test_X)
df_accuracy.loc['MultinomialNB', 'counts'] = np.mean(predicted == test_y)

clf__alpha: 0.1
clf__fit_prior: True


In [6]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.79      0.81      0.80       319
           comp.graphics       0.58      0.81      0.68       389
 comp.os.ms-windows.misc       0.50      0.01      0.02       394
comp.sys.ibm.pc.hardware       0.54      0.78      0.64       392
   comp.sys.mac.hardware       0.73      0.84      0.78       385
          comp.windows.x       0.81      0.75      0.78       395
            misc.forsale       0.83      0.84      0.83       390
               rec.autos       0.86      0.92      0.89       396
         rec.motorcycles       0.92      0.96      0.94       398
      rec.sport.baseball       0.94      0.93      0.94       397
        rec.sport.hockey       0.96      0.96      0.96       399
               sci.crypt       0.89      0.93      0.91       396
         sci.electronics       0.78      0.77      0.77       393
                 sci.med       0.90      0.84      0.87       396
         

## tf

In [7]:
>>> from sklearn.feature_extraction.text import TfidfTransformer
nb_tf = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB())
])

nb_tf_clf = GridSearchCV(nb_tf, param_grid, cv=5, n_jobs=-1)
nb_tf_clf = nb_tf_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, nb_tf_clf.best_params_[param_name]))

predicted = nb_tf_clf.predict(test_X)
df_accuracy.loc['MultinomialNB', 'tf'] = np.mean(predicted == test_y)

clf__alpha: 0.1
clf__fit_prior: False


In [8]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.62      0.70       319
           comp.graphics       0.74      0.73      0.73       389
 comp.os.ms-windows.misc       0.80      0.67      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.79      0.72       392
   comp.sys.mac.hardware       0.86      0.81      0.84       385
          comp.windows.x       0.87      0.78      0.83       395
            misc.forsale       0.89      0.82      0.85       390
               rec.autos       0.85      0.91      0.88       396
         rec.motorcycles       0.90      0.96      0.93       398
      rec.sport.baseball       0.92      0.92      0.92       397
        rec.sport.hockey       0.92      0.96      0.94       399
               sci.crypt       0.77      0.94      0.85       396
         sci.electronics       0.80      0.73      0.77       393
                 sci.med       0.88      0.84      0.86       396
         

## tf-idf

In [9]:
nb_tfidf = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

nb_tfidf_clf = GridSearchCV(nb_tfidf, param_grid, cv=5, n_jobs=-1)
nb_tfidf_clf = nb_tfidf_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, nb_tfidf_clf.best_params_[param_name]))
predicted = nb_tfidf_clf.predict(test_X)
df_accuracy.loc['MultinomialNB', 'tf-idf'] = np.mean(predicted == test_y)

clf__alpha: 0.1
clf__fit_prior: False


In [10]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.74      0.78       319
           comp.graphics       0.76      0.72      0.74       389
 comp.os.ms-windows.misc       0.78      0.66      0.71       394
comp.sys.ibm.pc.hardware       0.66      0.78      0.71       392
   comp.sys.mac.hardware       0.86      0.84      0.85       385
          comp.windows.x       0.86      0.79      0.83       395
            misc.forsale       0.89      0.76      0.82       390
               rec.autos       0.89      0.91      0.90       396
         rec.motorcycles       0.93      0.97      0.95       398
      rec.sport.baseball       0.94      0.93      0.94       397
        rec.sport.hockey       0.93      0.97      0.95       399
               sci.crypt       0.79      0.95      0.86       396
         sci.electronics       0.82      0.75      0.78       393
                 sci.med       0.90      0.84      0.87       396
         

# SGDClassifier

## count

In [11]:
sgd_count = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])

param_grid = {
    'clf__alpha': [0.001, 0.01]
}

sgd_count_clf = GridSearchCV(sgd_count, param_grid, cv=5, n_jobs=-1)
sgd_count_clf = sgd_count_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, sgd_count_clf.best_params_[param_name]))
    
predicted = sgd_count_clf.predict(test_X)
df_accuracy.loc['SGDClassifier', 'counts'] = np.mean(predicted == test_y)

clf__alpha: 0.001


In [12]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.65      0.70       319
           comp.graphics       0.60      0.75      0.67       389
 comp.os.ms-windows.misc       0.74      0.52      0.61       394
comp.sys.ibm.pc.hardware       0.61      0.69      0.65       392
   comp.sys.mac.hardware       0.84      0.65      0.73       385
          comp.windows.x       0.81      0.72      0.77       395
            misc.forsale       0.74      0.85      0.79       390
               rec.autos       0.87      0.77      0.82       396
         rec.motorcycles       0.81      0.91      0.86       398
      rec.sport.baseball       0.79      0.88      0.83       397
        rec.sport.hockey       0.89      0.93      0.91       399
               sci.crypt       0.83      0.90      0.87       396
         sci.electronics       0.71      0.67      0.69       393
                 sci.med       0.79      0.76      0.77       396
         

## tf

In [13]:
sgd_tf = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer(use_idf=False)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])

sgd_tf_clf = GridSearchCV(sgd_tf, param_grid, cv=5, n_jobs=-1)
sgd_tf_clf = sgd_tf_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, sgd_tf_clf.best_params_[param_name]))

sgd_tf_clf.fit(train_X, train_y)
predicted = sgd_tf_clf.predict(test_X)
df_accuracy.loc['SGDClassifier', 'tf'] = np.mean(predicted == test_y)

clf__alpha: 0.001


In [14]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.57      0.65       319
           comp.graphics       0.56      0.73      0.63       389
 comp.os.ms-windows.misc       0.69      0.73      0.71       394
comp.sys.ibm.pc.hardware       0.72      0.59      0.65       392
   comp.sys.mac.hardware       0.86      0.70      0.77       385
          comp.windows.x       0.74      0.70      0.72       395
            misc.forsale       0.72      0.92      0.81       390
               rec.autos       0.82      0.87      0.84       396
         rec.motorcycles       0.89      0.91      0.90       398
      rec.sport.baseball       0.82      0.87      0.85       397
        rec.sport.hockey       0.88      0.95      0.91       399
               sci.crypt       0.81      0.93      0.87       396
         sci.electronics       0.81      0.51      0.63       393
                 sci.med       0.79      0.75      0.77       396
         

## tf-idf

In [15]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])

sgd_tfidf_clf = GridSearchCV(sgd_tfidf, param_grid, cv=5, n_jobs=-1)
sgd_tfidf_clf = sgd_tfidf_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, sgd_tf_clf.best_params_[param_name]))
    
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
df_accuracy.loc['SGDClassifier', 'tf-idf'] = np.mean(predicted == test_y)

clf__alpha: 0.001


In [16]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.76      0.79       319
           comp.graphics       0.75      0.81      0.78       389
 comp.os.ms-windows.misc       0.76      0.72      0.74       394
comp.sys.ibm.pc.hardware       0.74      0.73      0.74       392
   comp.sys.mac.hardware       0.83      0.86      0.85       385
          comp.windows.x       0.88      0.76      0.82       395
            misc.forsale       0.84      0.90      0.87       390
               rec.autos       0.91      0.90      0.91       396
         rec.motorcycles       0.95      0.95      0.95       398
      rec.sport.baseball       0.90      0.96      0.93       397
        rec.sport.hockey       0.96      0.98      0.97       399
               sci.crypt       0.91      0.95      0.93       396
         sci.electronics       0.82      0.80      0.81       393
                 sci.med       0.89      0.88      0.89       396
         

# LinearSVC

## count

In [22]:
svc_count = Pipeline([
    ('vect', CountVectorizer()),
#     ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3, max_iter=50000))),
    ('classification', LinearSVC(penalty="l2", max_iter=50000))
])

param_grid = {
    'classification__C': [0.001, 0.01],
}

svc_count_clf = GridSearchCV(svc_count, param_grid, cv=5, n_jobs=-1)
svc_count_clf = svc_count_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, svc_count_clf.best_params_[param_name]))
    
predicted = svc_count_clf.predict(test_X)
df_accuracy.loc['LinearSVC', 'counts'] = np.mean(predicted == test_y)

classification__C: 0.01


In [23]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.76      0.76      0.76       319
           comp.graphics       0.69      0.77      0.73       389
 comp.os.ms-windows.misc       0.74      0.69      0.72       394
comp.sys.ibm.pc.hardware       0.69      0.70      0.69       392
   comp.sys.mac.hardware       0.77      0.82      0.79       385
          comp.windows.x       0.84      0.72      0.77       395
            misc.forsale       0.80      0.91      0.85       390
               rec.autos       0.87      0.86      0.87       396
         rec.motorcycles       0.94      0.94      0.94       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.93      0.96      0.94       399
               sci.crypt       0.92      0.91      0.92       396
         sci.electronics       0.73      0.75      0.74       393
                 sci.med       0.88      0.80      0.84       396
         

## tf

In [24]:
svm_tf = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer(use_idf=False)),
#         ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3, max_iter=50000))),
    ('classification', LinearSVC(penalty="l2", max_iter=50000))
])

svc_tf_clf = GridSearchCV(svm_tf, param_grid, cv=5, n_jobs=-1)
svc_tf_clf = svc_tf_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, svc_tf_clf.best_params_[param_name]))
    
predicted = svc_tf_clf.predict(test_X)
df_accuracy.loc['LinearSVC', 'tf'] = np.mean(predicted == test_y)

classification__C: 0.01


In [25]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.54      0.42      0.48       319
           comp.graphics       0.60      0.51      0.55       389
 comp.os.ms-windows.misc       0.66      0.64      0.65       394
comp.sys.ibm.pc.hardware       0.67      0.55      0.60       392
   comp.sys.mac.hardware       0.72      0.56      0.63       385
          comp.windows.x       0.64      0.63      0.63       395
            misc.forsale       0.42      0.92      0.58       390
               rec.autos       0.82      0.66      0.73       396
         rec.motorcycles       0.70      0.81      0.75       398
      rec.sport.baseball       0.58      0.72      0.64       397
        rec.sport.hockey       0.71      0.84      0.77       399
               sci.crypt       0.71      0.80      0.75       396
         sci.electronics       0.73      0.25      0.37       393
                 sci.med       0.62      0.49      0.55       396
         

## tf-idf

In [29]:
svm_tfidf = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer()),
#         ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3, max_iter=50000))),
    ('classification', LinearSVC(penalty="l2", max_iter=50000))
])
svc_tfidf_clf = GridSearchCV(svm_tfidf, param_grid, cv=5, n_jobs=-1)
svc_tfidf_clf = svc_tfidf_clf.fit(train_X, train_y)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, svc_tfidf_clf.best_params_[param_name]))
predicted = svc_tfidf_clf.predict(test_X)
df_accuracy.loc['LinearSVC', 'tf-idf'] = np.mean(predicted == test_y)

classification__C: 0.01


In [30]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.74      0.61      0.67       319
           comp.graphics       0.73      0.71      0.72       389
 comp.os.ms-windows.misc       0.70      0.80      0.75       394
comp.sys.ibm.pc.hardware       0.72      0.69      0.71       392
   comp.sys.mac.hardware       0.82      0.74      0.77       385
          comp.windows.x       0.78      0.73      0.76       395
            misc.forsale       0.58      0.92      0.71       390
               rec.autos       0.90      0.85      0.88       396
         rec.motorcycles       0.91      0.94      0.92       398
      rec.sport.baseball       0.87      0.88      0.87       397
        rec.sport.hockey       0.88      0.97      0.92       399
               sci.crypt       0.86      0.91      0.88       396
         sci.electronics       0.83      0.53      0.65       393
                 sci.med       0.90      0.72      0.80       396
         

In [31]:
df_accuracy

Unnamed: 0,counts,tf,tf-idf
MultinomialNB,0.80616,0.807886,0.828332
LinearSVC,0.817047,0.62573,0.782926
SGDClassifier,0.752124,0.769782,0.853027


# SGDClassifier with tf-idf

## Lowercasing  == true , stop_words=None is defalut

## Lowercasing == false 

In [32]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(lowercase=False)),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.822623473181094

In [33]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.74      0.73      0.74       319
           comp.graphics       0.76      0.73      0.75       389
 comp.os.ms-windows.misc       0.74      0.75      0.75       394
comp.sys.ibm.pc.hardware       0.72      0.69      0.71       392
   comp.sys.mac.hardware       0.83      0.81      0.82       385
          comp.windows.x       0.85      0.78      0.81       395
            misc.forsale       0.82      0.88      0.85       390
               rec.autos       0.89      0.91      0.90       396
         rec.motorcycles       0.93      0.96      0.94       398
      rec.sport.baseball       0.86      0.90      0.88       397
        rec.sport.hockey       0.86      0.99      0.92       399
               sci.crypt       0.83      0.96      0.89       396
         sci.electronics       0.83      0.61      0.70       393
                 sci.med       0.88      0.85      0.87       396
         

## stop_words

In [34]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.8224907063197026

In [35]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.70      0.71       319
           comp.graphics       0.79      0.70      0.74       389
 comp.os.ms-windows.misc       0.73      0.79      0.76       394
comp.sys.ibm.pc.hardware       0.72      0.68      0.70       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.86      0.76      0.81       395
            misc.forsale       0.82      0.87      0.84       390
               rec.autos       0.92      0.89      0.91       396
         rec.motorcycles       0.93      0.97      0.95       398
      rec.sport.baseball       0.89      0.91      0.90       397
        rec.sport.hockey       0.87      0.98      0.92       399
               sci.crypt       0.85      0.96      0.90       396
         sci.electronics       0.81      0.61      0.69       393
                 sci.med       0.90      0.86      0.88       396
         

In [36]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2), analyzer='word')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.8321826872012745

In [37]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.75      0.75       319
           comp.graphics       0.79      0.71      0.75       389
 comp.os.ms-windows.misc       0.75      0.80      0.77       394
comp.sys.ibm.pc.hardware       0.74      0.69      0.71       392
   comp.sys.mac.hardware       0.84      0.82      0.83       385
          comp.windows.x       0.87      0.78      0.82       395
            misc.forsale       0.85      0.86      0.86       390
               rec.autos       0.93      0.89      0.91       396
         rec.motorcycles       0.93      0.97      0.95       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.86      0.99      0.92       399
               sci.crypt       0.81      0.97      0.88       396
         sci.electronics       0.82      0.65      0.72       393
                 sci.med       0.90      0.83      0.86       396
         

In [38]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3), analyzer='word')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.829660116834838

In [39]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.76      0.75       319
           comp.graphics       0.79      0.72      0.75       389
 comp.os.ms-windows.misc       0.77      0.79      0.78       394
comp.sys.ibm.pc.hardware       0.71      0.69      0.70       392
   comp.sys.mac.hardware       0.83      0.81      0.82       385
          comp.windows.x       0.85      0.77      0.81       395
            misc.forsale       0.85      0.85      0.85       390
               rec.autos       0.92      0.89      0.91       396
         rec.motorcycles       0.94      0.97      0.95       398
      rec.sport.baseball       0.88      0.89      0.88       397
        rec.sport.hockey       0.84      0.98      0.91       399
               sci.crypt       0.80      0.97      0.88       396
         sci.electronics       0.84      0.62      0.71       393
                 sci.med       0.89      0.82      0.85       396
         

In [40]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1), analyzer='char')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.13502389803505044

In [41]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.14      0.03      0.05       319
           comp.graphics       0.06      0.79      0.11       389
 comp.os.ms-windows.misc       0.13      0.07      0.09       394
comp.sys.ibm.pc.hardware       0.43      0.14      0.21       392
   comp.sys.mac.hardware       0.16      0.02      0.04       385
          comp.windows.x       0.57      0.19      0.28       395
            misc.forsale       0.62      0.56      0.59       390
               rec.autos       0.00      0.00      0.00       396
         rec.motorcycles       0.31      0.10      0.15       398
      rec.sport.baseball       0.34      0.06      0.10       397
        rec.sport.hockey       0.25      0.21      0.22       399
               sci.crypt       0.55      0.03      0.06       396
         sci.electronics       1.00      0.00      0.00       393
                 sci.med       0.08      0.02      0.03       396
         

In [42]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2, 2), analyzer='char')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.6216144450345193

In [43]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.64      0.50      0.56       319
           comp.graphics       0.27      0.68      0.38       389
 comp.os.ms-windows.misc       0.70      0.50      0.58       394
comp.sys.ibm.pc.hardware       0.63      0.51      0.56       392
   comp.sys.mac.hardware       0.72      0.51      0.60       385
          comp.windows.x       0.65      0.90      0.75       395
            misc.forsale       0.69      0.81      0.74       390
               rec.autos       0.56      0.59      0.57       396
         rec.motorcycles       0.81      0.74      0.77       398
      rec.sport.baseball       0.61      0.75      0.67       397
        rec.sport.hockey       0.76      0.80      0.78       399
               sci.crypt       0.65      0.85      0.73       396
         sci.electronics       0.67      0.24      0.35       393
                 sci.med       0.69      0.35      0.47       396
         

In [44]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3, 3), analyzer='char')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.781465746149761

In [45]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.68      0.65      0.67       319
           comp.graphics       0.64      0.76      0.70       389
 comp.os.ms-windows.misc       0.80      0.70      0.74       394
comp.sys.ibm.pc.hardware       0.71      0.65      0.68       392
   comp.sys.mac.hardware       0.80      0.77      0.79       385
          comp.windows.x       0.82      0.85      0.83       395
            misc.forsale       0.81      0.86      0.83       390
               rec.autos       0.85      0.78      0.81       396
         rec.motorcycles       0.91      0.89      0.90       398
      rec.sport.baseball       0.75      0.84      0.80       397
        rec.sport.hockey       0.81      0.96      0.88       399
               sci.crypt       0.78      0.94      0.85       396
         sci.electronics       0.83      0.51      0.63       393
                 sci.med       0.86      0.72      0.79       396
         

In [46]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(4, 4), analyzer='char')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.8139936271906533

In [47]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.70      0.71      0.71       319
           comp.graphics       0.67      0.77      0.72       389
 comp.os.ms-windows.misc       0.80      0.73      0.76       394
comp.sys.ibm.pc.hardware       0.80      0.67      0.72       392
   comp.sys.mac.hardware       0.86      0.81      0.84       385
          comp.windows.x       0.84      0.82      0.83       395
            misc.forsale       0.82      0.89      0.85       390
               rec.autos       0.92      0.84      0.88       396
         rec.motorcycles       0.92      0.92      0.92       398
      rec.sport.baseball       0.82      0.89      0.86       397
        rec.sport.hockey       0.86      0.97      0.91       399
               sci.crypt       0.79      0.96      0.86       396
         sci.electronics       0.84      0.59      0.69       393
                 sci.med       0.87      0.81      0.84       396
         

In [48]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3, 4), analyzer='char')),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.8098778544875199

In [49]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.70      0.71       319
           comp.graphics       0.69      0.78      0.73       389
 comp.os.ms-windows.misc       0.80      0.72      0.76       394
comp.sys.ibm.pc.hardware       0.73      0.68      0.71       392
   comp.sys.mac.hardware       0.87      0.80      0.83       385
          comp.windows.x       0.83      0.85      0.84       395
            misc.forsale       0.82      0.88      0.85       390
               rec.autos       0.94      0.80      0.87       396
         rec.motorcycles       0.92      0.92      0.92       398
      rec.sport.baseball       0.81      0.88      0.84       397
        rec.sport.hockey       0.85      0.98      0.91       399
               sci.crypt       0.77      0.95      0.85       396
         sci.electronics       0.85      0.60      0.70       393
                 sci.med       0.88      0.79      0.83       396
         

In [50]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(max_features = 1000)),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.6479022835900159

In [51]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.59      0.51      0.55       319
           comp.graphics       0.53      0.58      0.55       389
 comp.os.ms-windows.misc       0.60      0.67      0.63       394
comp.sys.ibm.pc.hardware       0.53      0.45      0.49       392
   comp.sys.mac.hardware       0.59      0.61      0.60       385
          comp.windows.x       0.65      0.60      0.62       395
            misc.forsale       0.70      0.85      0.77       390
               rec.autos       0.72      0.70      0.71       396
         rec.motorcycles       0.74      0.80      0.77       398
      rec.sport.baseball       0.66      0.72      0.69       397
        rec.sport.hockey       0.73      0.84      0.78       399
               sci.crypt       0.70      0.83      0.76       396
         sci.electronics       0.53      0.30      0.39       393
                 sci.med       0.66      0.57      0.61       396
         

In [53]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(max_features = 5000)),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.7789431757833245

In [54]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.68      0.65      0.66       319
           comp.graphics       0.70      0.65      0.67       389
 comp.os.ms-windows.misc       0.69      0.75      0.72       394
comp.sys.ibm.pc.hardware       0.69      0.60      0.64       392
   comp.sys.mac.hardware       0.75      0.79      0.77       385
          comp.windows.x       0.78      0.74      0.76       395
            misc.forsale       0.79      0.88      0.84       390
               rec.autos       0.85      0.84      0.84       396
         rec.motorcycles       0.87      0.91      0.89       398
      rec.sport.baseball       0.85      0.88      0.86       397
        rec.sport.hockey       0.86      0.97      0.91       399
               sci.crypt       0.85      0.95      0.90       396
         sci.electronics       0.76      0.52      0.62       393
                 sci.med       0.83      0.74      0.78       396
         

In [55]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000)),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.8056293149229952

In [56]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.70      0.67      0.69       319
           comp.graphics       0.76      0.71      0.73       389
 comp.os.ms-windows.misc       0.71      0.76      0.73       394
comp.sys.ibm.pc.hardware       0.73      0.63      0.67       392
   comp.sys.mac.hardware       0.78      0.81      0.80       385
          comp.windows.x       0.81      0.75      0.78       395
            misc.forsale       0.81      0.90      0.86       390
               rec.autos       0.89      0.87      0.88       396
         rec.motorcycles       0.89      0.94      0.92       398
      rec.sport.baseball       0.88      0.89      0.89       397
        rec.sport.hockey       0.87      0.99      0.93       399
               sci.crypt       0.84      0.95      0.89       396
         sci.electronics       0.80      0.56      0.66       393
                 sci.med       0.85      0.84      0.84       396
         

In [57]:
sgd_tfidf = Pipeline([
    ('vect', CountVectorizer(max_features = 20000)),
    ('tf', TfidfTransformer()),
    ('clf', SGDClassifier(alpha=.001,loss='hinge', penalty='l2', random_state=42,max_iter=5, tol=None))
])
sgd_tfidf.fit(train_X, train_y)
predicted = sgd_tfidf.predict(test_X)
np.mean(predicted == test_y)

0.8159851301115242

In [58]:
>>> print(metrics.classification_report(test_y, predicted,
...     target_names=test.target_names, zero_division=1))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.69      0.70       319
           comp.graphics       0.76      0.71      0.74       389
 comp.os.ms-windows.misc       0.72      0.76      0.74       394
comp.sys.ibm.pc.hardware       0.74      0.66      0.70       392
   comp.sys.mac.hardware       0.79      0.82      0.81       385
          comp.windows.x       0.83      0.75      0.79       395
            misc.forsale       0.83      0.90      0.86       390
               rec.autos       0.89      0.88      0.88       396
         rec.motorcycles       0.91      0.96      0.93       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.88      0.98      0.93       399
               sci.crypt       0.83      0.96      0.89       396
         sci.electronics       0.83      0.59      0.69       393
                 sci.med       0.88      0.85      0.86       396
         