# Text classification with sklearn

## Load data

In [1]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


In [2]:
twenty_train.target_names # all the names of those group docs, there are 20 news groups

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [69]:
len(twenty_train)

6

In [144]:
print(len(twenty_train.data))
print(type(twenty_train.data))

11314
<class 'list'>


In [71]:
len(twenty_train.filenames) # 11314 docs in total

11314

In [79]:
print(type(twenty_train.data[0])) # is a string
print("\n".join(twenty_train.data[0].split("\n")))

<class 'str'>
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [7]:
print(twenty_train.target_names[twenty_train.target[0]]) # the first string belongs to rec.autos group

rec.autos


In [81]:
print(twenty_train.target[:20]) #labels from 0 to 20
print(len(set(twenty_train.target))) # there are 20 different lables for 11314 docs

[ 7  4  4  1 14 16 13  3  2  4  8 19  4 14  6  0  1  7 12  5]
20


## Extracting features from text files

### Bags of words

#### The most intuitive way to do so is the bags of words representation:
1. assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices).
2. for each document #i, count the number of occurrences of each word w and store it in X[i, j] as the value of feature #j where j is the index of word w in the dictionary

In [92]:
#Tokenizing text with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
#(11314, 130107) meaning: there are 11314 docs, and 130107 words in total

(11314, 130107)

In [93]:
count_vect.vocabulary_.get(u'algorithm') #the id of the word algorithm is 27366

27366

In [94]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts) # fit our estimator to the data
X_train_tf = tf_transformer.transform(X_train_counts) #transform our count-matrix to a tf-idf representation.
X_train_tf.shape

(11314, 130107)

In [95]:
# These above two steps can be combined to achieve the same end result faster by skipping redundant processing.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
(2257, 35788)

(2257, 35788)

In [96]:
print(type(X_train_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [101]:
print(X_train_tfidf[:5][:5])

  (0, 56979)	0.0574701540749
  (0, 75358)	0.353835013497
  (0, 123162)	0.259709024574
  (0, 118280)	0.211868072083
  (0, 50527)	0.0546142865886
  (0, 124031)	0.107987951542
  (0, 85354)	0.0369697850882
  (0, 114688)	0.0621407098631
  (0, 111322)	0.019156718025
  (0, 123984)	0.0368542926346
  (0, 37780)	0.381338912595
  (0, 68532)	0.0732581234213
  (0, 114731)	0.144472755128
  (0, 87620)	0.0356718631408
  (0, 95162)	0.0344713840933
  (0, 64095)	0.0354209242713
  (0, 98949)	0.160686060554
  (0, 90379)	0.0199288599566
  (0, 118983)	0.0370859780506
  (0, 89362)	0.065211743063
  (0, 79666)	0.109364012524
  (0, 40998)	0.0780136819692
  (0, 92081)	0.0991327449391
  (0, 76032)	0.0192194630522
  (0, 4605)	0.0633260395248
  :	:
  (4, 112674)	0.0467698334522
  (4, 108677)	0.0522405941446
  (4, 114692)	0.0484096661848
  (4, 39174)	0.082942690814
  (4, 101990)	0.0437467629308
  (4, 120616)	0.0781804227528
  (4, 128096)	0.054893702428
  (4, 32422)	0.0412946622535
  (4, 114646)	0.0575592668342
  (4, 

## Training a classifier

In [102]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

### Apply the classifier on user-defined doc

In [108]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


In [109]:
print(X_new_counts)

  (0, 59626)	1
  (0, 68532)	1
  (0, 76876)	1
  (1, 54467)	1
  (1, 59961)	1
  (1, 68532)	1
  (1, 89860)	1
  (1, 90045)	1
  (1, 114455)	1


In [110]:
print(X_new_tfidf)

  (0, 76876)	0.738377245684
  (0, 68532)	0.224074346259
  (0, 59626)	0.636073683156
  (1, 114455)	0.0829102764942
  (1, 90045)	0.631035912274
  (1, 89860)	0.115700544729
  (1, 68532)	0.0988601661347
  (1, 59961)	0.67662150823
  (1, 54467)	0.33755436536


## Building a pipeline

In [111]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])

In [113]:
text_clf.fit(twenty_train.data, twenty_train.target)  

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Evaluation of the performance on the test set

In [145]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.82381837493361654

In [149]:
print(type(docs_test[0]))
print(len(docs_test))

<class 'str'>
7532


## Use SVD method 

In [121]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42)),])
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))])

In [122]:
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.96659006540569203

### check the detailed performance

In [123]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.95      0.96      0.95       480
           comp.graphics       0.98      0.94      0.96       584
 comp.os.ms-windows.misc       0.95      0.97      0.96       591
comp.sys.ibm.pc.hardware       0.94      0.94      0.94       590
   comp.sys.mac.hardware       0.99      0.97      0.98       578
          comp.windows.x       0.98      0.97      0.98       593
            misc.forsale       0.92      0.96      0.94       585
               rec.autos       0.98      0.98      0.98       594
         rec.motorcycles       0.98      0.99      0.99       598
      rec.sport.baseball       1.00      0.98      0.99       597
        rec.sport.hockey       0.97      1.00      0.98       600
               sci.crypt       0.98      1.00      0.99       595
         sci.electronics       0.99      0.94      0.97       591
                 sci.med       0.99      0.99      0.99       594
         

In [124]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[461,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   1,  12,   0,   2,   0,   4],
       [  1, 549,  11,   7,   2,   4,   4,   0,   0,   0,   1,   0,   0,
          1,   4,   0,   0,   0,   0,   0],
       [  0,   2, 575,   7,   0,   4,   2,   0,   0,   0,   0,   0,   0,
          0,   0,   1,   0,   0,   0,   0],
       [  0,   3,  12, 555,   2,   1,  12,   0,   0,   0,   1,   3,   0,
          0,   0,   0,   1,   0,   0,   0],
       [  0,   1,   0,   9, 558,   1,   5,   0,   0,   1,   0,   1,   0,
          1,   0,   1,   0,   0,   0,   0],
       [  0,   5,   5,   2,   0, 577,   0,   0,   0,   0,   0,   1,   0,
          0,   3,   0,   0,   0,   0,   0],
       [  0,   0,   1,   4,   0,   0, 563,   4,   4,   0,   4,   1,   3,
          0,   0,   1,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   6, 581,   4,   0,   0,   0,   0,
          0,   2,   0,   1,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0,   5,   1, 591,   0,   0,  

## Parameter tuning using grid search

In [125]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),}

In [126]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [127]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [128]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [129]:
gs_clf.best_score_ 

0.60250000000000004

In [130]:
for param_name in sorted(parameters.keys()):print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [132]:
help(GridSearchCV)

Help on class GridSearchCV in module sklearn.model_selection._search:

class GridSearchCV(BaseSearchCV)
 |  Exhaustive search over specified parameter values for an estimator.
 |  
 |  Important members are fit, predict.
 |  
 |  GridSearchCV implements a "fit" and a "score" method.
 |  It also implements "predict", "predict_proba", "decision_function",
 |  "transform" and "inverse_transform" if they are implemented in the
 |  estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated grid-search over a parameter grid.
 |  
 |  Read more in the :ref:`User Guide <grid_search>`.
 |  
 |  Parameters
 |  ----------
 |  estimator : estimator object.
 |      This is assumed to implement the scikit-learn estimator interface.
 |      Either estimator needs to provide a ``score`` function,
 |      or ``scoring`` must be passed.
 |  
 |  param_grid : dict or list of dictionaries
 |      Dictionary with parameters names (string) as ke

### Build another classifier by removing stop words

In [133]:
text_clf_1 = Pipeline([('vect', CountVectorizer(stop_words='english')),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])

In [134]:
text_clf_1.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [135]:
predicted = text_clf_1.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.95739791408873964

In [140]:
import nltk
from nltk.stem.snowball import SnowballStemmer

In [141]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [142]:
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False)),])
text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
np.mean(predicted_mnb_stemmed == twenty_test.target)

0.95545342054092275