## Working with Text Data

In [1]:
# The four newsgroups that are requested from the 20 newsgroups dataset.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Training split restricted to `categories`; the fixed random_state keeps the
# shuffled order repeatable between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42
)

In [4]:
help(fetch_20newsgroups)

Help on function fetch_20newsgroups in module sklearn.datasets.twenty_newsgroups:

fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)
    Load the filenames and data from the 20 newsgroups dataset.
    
    Read more in the :ref:`User Guide <20newsgroups>`.
    
    Parameters
    ----------
    subset: 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.
    
    data_home: optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    
    categories: None or collection of string or unicode
        If None (default), load all the categories.
        If not None, list of category names to load (other categories
        ignored).
    
    shuffle: bool, optional
        Wh

In [5]:
type(twenty_train)

sklearn.datasets.base.Bunch

In [6]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [8]:
len(twenty_train.data)

2257

In [9]:
twenty_train.data[0]

u'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [10]:
# Print (instead of echoing the repr) so the newlines in the post are rendered.
# The parenthesised form behaves the same on Python 2 and Python 3.
print(twenty_train.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [11]:
# Integer label of the first document; it is an index into
# twenty_train.target_names. Parenthesised print works on Python 2 and 3.
print(twenty_train.target[0])

1


In [13]:
twenty_train.target_names[twenty_train.target[0]]

'comp.graphics'

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Bag-of-words vectorizer with its default settings.
count_vect = CountVectorizer()

In [18]:
# Fit the vectorizer on the training posts and build the document-term
# count matrix in a single call.
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [19]:
X_train_counts.shape

(2257, 35788)

In [25]:
# Raw count of the token 'to' in the first training document
# (row 0, column = vocabulary index of 'to').
X_train_counts[0,count_vect.vocabulary_.get('to')]

4

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

In [28]:
# use_idf=False switches off the inverse-document-frequency weighting, so the
# transformer produces normalised term frequencies only.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

In [29]:
# Apply the fitted transformer to the training counts.
X_train_tf = tf_transformer.transform(X_train_counts)

In [30]:
# Same matrix cell as the raw-count lookup for 'to', now expressed as a
# normalised term frequency.
X_train_tf[0,count_vect.vocabulary_.get('to')]

0.30151134457776363

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# Default settings this time, so IDF weighting is applied
# (contrast with the use_idf=False transformer created earlier).
tfidf_transformer = TfidfTransformer()

In [34]:
# fit_transform fits and transforms in one call, instead of the separate
# fit / transform steps used for tf_transformer.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [35]:
from sklearn.naive_bayes import MultinomialNB

In [36]:
# Train a multinomial naive Bayes classifier on the tf-idf features and the
# integer class labels.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [37]:
# Two short, unseen documents used to try out the trained classifier.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

In [38]:
# transform (not fit_transform): reuse the vocabulary already learned from the
# training set rather than fitting a new one on these documents.
X_new_counts = count_vect.transform(docs_new)

In [39]:
# Likewise reuse the already-fitted tf-idf transformer.
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [40]:
# Integer class indices, one per entry in docs_new.
predicted = clf.predict(X_new_tfidf)

In [45]:
# Predicted class indices; look them up in twenty_train.target_names to get
# the newsgroup names. Parenthesised print works on Python 2 and 3.
print(predicted)

[3 1]


In [44]:
# Newsgroup names in label order, for decoding the predicted indices.
# Parenthesised print works on Python 2 and 3.
print(twenty_train.target_names)

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


#### Building a pipeline

In [46]:
from sklearn.pipeline import Pipeline

In [50]:
# Chain vectorizer -> tf-idf -> classifier so the three steps are fitted and
# applied together as a single estimator.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [51]:
# The pipeline is fitted on the raw text; vectorizing and tf-idf weighting
# happen inside it.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

#### Evaluation of the performance on the test set

In [52]:
# Held-out test split for the same categories, loaded with the same seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42
)

In [53]:
docs_test = twenty_test.data

In [54]:
# NOTE: this rebinds `predicted`, which previously held the predictions for
# docs_new; from here on it refers to the test-set predictions.
predicted = text_clf.predict(docs_test)

In [58]:
len(docs_test)

1502

In [60]:
import numpy as np
# Accuracy: fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.83488681757656458

#### Using a linear SVM (SGDClassifier with hinge loss)

In [61]:
from sklearn.linear_model import SGDClassifier

In [63]:
# Same three-step pipeline as before, but the final step is a linear model
# trained with SGD on the hinge loss.
# NOTE: this rebinds text_clf, replacing the naive Bayes pipeline.
# NOTE(review): n_iter was superseded by max_iter/tol in later scikit-learn
# releases -- confirm against the installed version before upgrading.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=42)),
])

In [64]:
# fit returns the pipeline itself, so this cell echoes the fitted pipeline's repr.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))])

In [65]:
# Rebinds `predicted` again, now with the SGD pipeline's test-set predictions.
predicted = text_clf.predict(docs_test)

In [66]:
# Accuracy of the SGD pipeline on the same test set.
np.mean(predicted == twenty_test.target)

0.9127829560585885

#### Performance analysis of the result

In [67]:
from sklearn import metrics

In [68]:
# Per-class precision, recall and F1 for the current `predicted` array.
print(metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [69]:
# Rows are the true classes and columns the predicted classes, both in
# target_names order (each row sums to that class's support in the report).
metrics.confusion_matrix(twenty_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [71]:
# Recall for alt.atheism by hand: 258 correctly labelled out of the 319 true
# alt.atheism posts (the first row of the confusion matrix sums to 319).
258/(319.0)

0.8087774294670846

In [73]:
# Precision for alt.atheism by hand: 258 correct out of the 272 posts predicted
# as alt.atheism (first column of the confusion matrix: 258 + 4 + 5 + 5).
258/(272.0)

0.9485294117647058

#### Parameter tuning using grid search

In [74]:
# Search space for the grid search. Each key is '<pipeline step>__<parameter>';
# each value lists the candidates to try for that parameter.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [76]:
# GridSearchCV lives in sklearn.model_selection from scikit-learn 0.18 onward,
# and the old sklearn.grid_search module was removed in later releases.
# Try the current location first and fall back for older installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Exhaustive search over `parameters` for the text_clf pipeline;
# n_jobs=-1 asks for all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [77]:
# Fit on only the first 400 training documents -- presumably to keep the
# search quick; the reported scores therefore come from this subset.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [80]:
# predict returns an array even for a single document. Take element [0] so the
# list is indexed with a scalar; indexing a list with a one-element array
# triggers a numpy deprecation warning and fails on newer numpy.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

  if __name__ == '__main__':


'soc.religion.christian'

In [84]:
# Mean and std of the cross-validation score for every parameter combination.
# NOTE(review): grid_scores_ was replaced by cv_results_ in later scikit-learn
# releases -- confirm against the installed version.
gs_clf.grid_scores_

[mean: 0.87750, std: 0.01499, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.01},
 mean: 0.87500, std: 0.02844, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.01},
 mean: 0.76500, std: 0.05115, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'clf__alpha': 0.01},
 mean: 0.78000, std: 0.05599, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01},
 mean: 0.90000, std: 0.03086, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.001},
 mean: 0.89000, std: 0.02544, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.001},
 mean: 0.76750, std: 0.04626, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'clf__alpha': 0.001},
 mean: 0.81000, std: 0.04243, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.001}]

In [85]:
# GridSearchCV already records the winning combination and its mean
# cross-validation score, so read them directly instead of re-scanning
# grid_scores_. This also avoids rebinding `_`, which IPython uses for the
# previous cell's output.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_

In [86]:
best_parameters

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [87]:
# Report the selected value of every tuned parameter, in alphabetical order.
# Iterating a dict yields its keys, so sorted(parameters) sorts the names.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
