In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups dataset, shuffled.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [4]:
# Names of the newsgroup categories present in the dataset.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Inspect the raw text of one training post (headers, quoted reply and signature included).
print(twenty_train.data[3])

From: jgreen@amber (Joe Green)
Subject: Re: Weitek P9000 ?
Organization: Harris Computer Systems Division
Lines: 14
Distribution: world
NNTP-Posting-Host: amber.ssd.csd.harris.com
X-Newsreader: TIN [version 1.1 PL9]

Robert J.C. Kyanko (rob@rjck.UUCP) wrote:
> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:
> > Anyone know about the Weitek P9000 graphics chip?
> As far as the low-level stuff goes, it looks pretty nice.  It's got this
> quadrilateral fill command that requires just the four points.

Do you have Weitek's address/phone number?  I'd like to get some information
about this chip.

--
Joe Green				Harris Corporation
jgreen@csd.harris.com			Computer Systems Division
"The only thing that really scares me is a person with no sense of humor."
						-- Jonathan Winters



In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Converts a collection of text documents into a matrix of token counts.
count_vect = CountVectorizer()
# Learn the vocabulary and count how often each token occurs in each document.
# These are raw counts, not yet normalised into term frequencies.
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
# output: (n_samples, n_features)

(11314, 130107)

In [7]:
# TF-IDF (term frequency - inverse document frequency): re-weights the raw counts so
# that words appearing in many documents (e.g. 'the') contribute less.
# Common words are down-weighted, not removed, so the feature count is unchanged.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [8]:
# Build a multinomial Naive Bayes classifier (clf) and train it on the
# TF-IDF features and the integer class labels of the training split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [9]:
# Pipeline: raw text -> token counts (English stop words dropped) -> TF-IDF -> Naive Bayes.
from sklearn.pipeline import Pipeline
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)  # fit() returns the fitted pipeline itself

In [10]:
# Evaluate the Naive Bayes pipeline on the held-out test split.
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
# Accuracy: fraction of test documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

0.8169144981412639

In [10]:
#77% accuracy without stop-word removal; 81% with stop_words='english' applied

In [11]:
# Support vector machine: a linear SVM (hinge loss) trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        # The `n_iter` argument was removed from SGDClassifier; max_iter=5 together
        # with tol=None runs a fixed 5 epochs, which is what n_iter=5 used to do.
        ('clf-svm', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5, tol=None, random_state=42)),
    ])
# `_ =` keeps the fitted pipeline's repr out of the cell output.
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
# Test-set accuracy of the SVM pipeline.
np.mean(predicted_svm == twenty_test.target)

0.8224907063197026

In [12]:
from sklearn.model_selection import GridSearchCV

# Search grid for the Naive Bayes pipeline. Each key is "<step name>__<parameter>",
# so it addresses a parameter of the pipeline step with that name.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [13]:
# Cross-validated grid search over `parameters` for the Naive Bayes pipeline,
# run with n_jobs=-1. fit() returns the fitted search object itself.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1).fit(
    twenty_train.data, twenty_train.target
)

In [29]:
# Mean cross-validated score of the best parameter combination. It is computed on
# the training split only, so it is not directly comparable to the test accuracy above.
gs_clf.best_score_

0.9057804490012374

In [30]:
# Parameter combination that achieved the best cross-validated score.
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}

In [14]:
# Tuning parameters for the SVM pipeline with GridSearchCV.
# Keys follow "<step name>__<parameter>"; the classifier step of text_clf_svm is
# named 'clf-svm', hence 'clf-svm__alpha'.
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3),
                 }

In [15]:
# Cross-validated grid search over `parameters_svm` for the SVM pipeline,
# run with n_jobs=-1. fit() returns the fitted search object itself.
gs_clf_svm = GridSearchCV(estimator=text_clf_svm, param_grid=parameters_svm, n_jobs=-1).fit(
    twenty_train.data, twenty_train.target
)

In [16]:
# Mean cross-validated score of the best SVM parameter combination (training split only).
gs_clf_svm.best_score_

0.8954392787696659

In [17]:
# Parameter combination that achieved the best cross-validated score for the SVM pipeline.
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [18]:
import nltk

In [19]:
# NOTE(review): called without arguments this opens NLTK's interactive downloader,
# which blocks an unattended "Restart & Run All". The stemmer created later with
# ignore_stopwords=True presumably only needs the 'stopwords' corpus -- consider
# nltk.download('stopwords') once confirmed.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [20]:
from nltk.stem.snowball import SnowballStemmer

In [21]:
# English Snowball stemmer; with ignore_stopwords=True stop words are left unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


In [22]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer additionally stems every token.

    Stemming is delegated to the module-level ``stemmer`` object, which is
    looked up each time a document is analysed and must therefore be defined
    before the vectorizer is used.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # Run the standard analyzer first, then stem each token it produced.
            return list(map(stemmer.stem, base_analyzer(doc)))

        return stemmed_analyzer
# Naive Bayes pipeline on stemmed tokens:
# stemmed counts (English stop words dropped) -> TF-IDF -> MultinomialNB with fit_prior=False.
stemmed_count_vec = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vec),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

In [23]:
# Fit the stemmed Naive Bayes pipeline in place (fit() returns the same object),
# then measure accuracy on the held-out test split.
text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
(predicted_mnb_stemmed == twenty_test.target).mean()

0.8167817312798725

In [24]:
# Linear-SVM (hinge loss, SGD-trained) pipeline on stemmed tokens.
text_svm_stemmed = Pipeline(
    [
        # Use a dedicated vectorizer instance: a Pipeline fits its steps in place, so
        # sharing one instance with another pipeline would refit that pipeline's
        # vectorizer as a side effect.
        ('vect', StemmedCountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        # The `n_iter` argument was removed from SGDClassifier; max_iter=5 together
        # with tol=None runs a fixed 5 epochs, which is what n_iter=5 used to do.
        ('msvm', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5, tol=None, random_state=42)),
    ]
)

In [25]:
# Fit the stemmed SVM pipeline in place (fit() returns the same object),
# then measure accuracy on the held-out test split.
text_svm_stemmed.fit(twenty_train.data, twenty_train.target)
predicted_svm_stemmed = text_svm_stemmed.predict(twenty_test.data)
(predicted_svm_stemmed == twenty_test.target).mean()

0.8194370685077005

In [26]:
from sklearn.metrics import classification_report
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

In [29]:
# Per-class precision / recall / F1 for the stemmed SVM pipeline.
# "support" is the number of occurrences of each class in y_true.
y_true, y_pred = twenty_test.target, predicted_svm_stemmed
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.70      0.71      0.71       319
          1       0.79      0.73      0.76       389
          2       0.75      0.74      0.74       394
          3       0.72      0.67      0.70       392
          4       0.80      0.82      0.81       385
          5       0.88      0.78      0.82       395
          6       0.81      0.84      0.83       390
          7       0.89      0.89      0.89       396
          8       0.91      0.97      0.94       398
          9       0.91      0.91      0.91       397
         10       0.86      0.99      0.92       399
         11       0.83      0.96      0.89       396
         12       0.81      0.63      0.71       393
         13       0.88      0.87      0.87       396
         14       0.85      0.95      0.90       394
         15       0.73      0.94      0.82       398
         16       0.69      0.93      0.79       364
         17       0.92      0.91      0.92   

In [30]:
# Per-class precision / recall / F1 for the stemmed Naive Bayes pipeline.
y_true, y_pred = twenty_test.target, predicted_mnb_stemmed
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.80      0.70      0.75       319
          1       0.78      0.72      0.75       389
          2       0.82      0.69      0.75       394
          3       0.69      0.79      0.74       392
          4       0.85      0.83      0.84       385
          5       0.86      0.79      0.83       395
          6       0.88      0.75      0.81       390
          7       0.88      0.92      0.90       396
          8       0.93      0.96      0.94       398
          9       0.93      0.92      0.92       397
         10       0.91      0.98      0.94       399
         11       0.72      0.97      0.83       396
         12       0.83      0.64      0.72       393
         13       0.92      0.78      0.84       396
         14       0.83      0.94      0.88       394
         15       0.64      0.96      0.77       398
         16       0.65      0.95      0.77       364
         17       0.93      0.95      0.94   