In [1]:
from sklearn.datasets import fetch_20newsgroups
'''Twenty Newsgroups datasets is a collection of ~20K newsgroup
documents, evenly across 20 different newsgroup'''

'Twenty Newsgroups datasets is a collection of ~20K newsgroup\ndocuments, evenly across 20 different newsgroup'

In [2]:
###select categories to work on 
categories = ['alt.atheism','soc.religion.christian','comp.graphics','sci.med']

In [3]:
##load matching categories, returned data is a holder object that can be accessed as python dict keys or object
twenty_train = fetch_20newsgroups(subset= 'train',categories=categories, shuffle=True, random_state=42)

In [4]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
###fetched files are locaded in memory
len(twenty_train.data)

2257

In [6]:
len(twenty_train.filenames)

2257

In [7]:
##print first few lines of the first loaded file
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [8]:
###conversion between label and level
for t in twenty_train.target[:5]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian


In [10]:
###extracting features from text
##bags of words ---get occurences of each word in each document --high-dim sparse datasets
##us
##tokenizing text , CountVectorizer support counts of N-grams of words or consequective characters
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [11]:
##after fitted, the vectorizer has built a dictionary of feature indices:
##index value of a word is linked to its frequency in the whole training corpus
count_vect.vocabulary_.get(u'algorithm')

4690

In [12]:
##from occurences to frequencies -- 1. divide the # of occurences by the total # of words in the document --Term
##frequencies (tf), downscale weight for words that occur in many documents that are less informative -Term 
##Frequency times Inverse Document Frequency (tf-idf)
from sklearn.feature_extraction.text import TfidfTransformer
##use fit() to fit estimator to the data
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
##use transform() to transform countmatrix to a tf-idf representation
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [13]:
##a faster way is using fit_transform()
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [14]:
##training a classifier using naive Bayes classifier, most suitable for word counts is multinomial variant
from sklearn.naive_bayes import MultinomialNB
##alpha is a smoothing parameter
clf = MultinomialNB().fit(X_train_tfidf,twenty_train.target)
##predict
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
##since they've already been fit to the training sets, here using transform rather than fit_transform
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [15]:
##check result
for doc,category in zip(docs_new,predicted):
    print('%r -> %s' % (doc,twenty_train.target_names[category]))

'God is love' -> soc.religion.christian
'OpenGL on the GPU is fast' -> comp.graphics


In [19]:
##build a pipeline for vectorizer -> transformer -> classifier
##names vect, tfidf, clf are arbitrary
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('clf',MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data,twenty_train.target)
predict2 = text_clf.predict(docs_new)
for doc,category1,category2 in zip(docs_new,predicted,predict2):
    print('%r -> %s -> %s' % (doc,twenty_train.target_names[category1],twenty_train.target_names[category2]))

'God is love' -> soc.religion.christian -> soc.religion.christian
'OpenGL on the GPU is fast' -> comp.graphics -> comp.graphics


In [20]:
###evaluation predictive accuracy
import numpy as np
twenty_test = fetch_20newsgroups(subset= 'test',categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.83488681757656458

In [26]:
####using support vector machine 
####alpha is a penalty parameter
from sklearn.linear_model import SGDClassifier
text_clf2 = Pipeline([('vect',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('clf',SGDClassifier(loss='hinge',penalty='l2',alpha=1e-3,n_iter=5,random_state=42))])
text_clf2 = text_clf2.fit(twenty_train.data,twenty_train.target)
predicted2 = text_clf2.predict(docs_test)
np.mean(predicted2 == twenty_test.target)

0.9127829560585885

In [27]:
###more utilities for performance analysis
from sklearn import metrics
print(metrics.classification_report(twenty_test.target,predicted,target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



In [28]:
metrics.confusion_matrix(twenty_test.target,predicted)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])

In [29]:
print(metrics.classification_report(twenty_test.target,predicted2,target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [30]:
metrics.confusion_matrix(twenty_test.target,predicted2)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [69]:
###search for best parameters on a grid of possible values
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3)}
text_clf3 = Pipeline([('vect',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('clf',SGDClassifier(loss='hinge',penalty='l2',n_iter=5,random_state=42))])
##tell grid searcher to try parameters in parallel with n_jobs = -1, tetect how many cores are installed and use all
gs_clf = GridSearchCV(text_clf3,parameters,n_jobs = -1)
##using subset
text_clf3.get_params().keys()

dict_keys(['vect__ngram_range', 'vect__analyzer', 'vect__token_pattern', 'clf__learning_rate', 'clf__shuffle', 'vect__min_df', 'clf__l1_ratio', 'vect__binary', 'tfidf', 'clf__power_t', 'vect__lowercase', 'vect__strip_accents', 'vect__decode_error', 'vect__input', 'vect__max_df', 'tfidf__norm', 'clf__loss', 'clf__n_iter', 'clf__fit_intercept', 'clf', 'vect__preprocessor', 'clf__verbose', 'vect__dtype', 'clf__n_jobs', 'vect__encoding', 'vect__tokenizer', 'clf__epsilon', 'clf__penalty', 'tfidf__smooth_idf', 'clf__average', 'clf__warm_start', 'clf__alpha', 'vect__max_features', 'clf__class_weight', 'vect', 'tfidf__use_idf', 'steps', 'clf__eta0', 'vect__vocabulary', 'vect__stop_words', 'clf__random_state', 'tfidf__sublinear_tf'])

In [70]:
gs_clf_pd = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [71]:
##prediction
twenty_train.target_names[gs_clf_pd.predict(['God is love'])]



'soc.religion.christian'

In [73]:
##get optimal parameters by inspecting object's grid_scores_ attribute
best_parameters,score,_ = max(gs_clf.grid_scores_,key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name,best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [74]:
score

0.90000000000000002