# Text analysis tutorial

In [69]:
# Newsgroups used throughout this notebook. To include another group, move
# its name from the "currently excluded" comment below into the list.
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.med',
    'sci.space',
]
# Currently excluded:
#   comp.sys.mac.hardware, rec.motorcycles, sci.electronics,
#   talk.politics.guns, comp.windows.x, rec.sport.baseball,
#   talk.politics.mideast, comp.os.ms-windows.misc, misc.forsale,
#   rec.sport.hockey, talk.politics.misc, comp.sys.ibm.pc.hardware,
#   rec.autos, sci.crypt, soc.religion.christian, talk.religion.misc

In [70]:
from sklearn.datasets import load_files

# Read the training split from the local copy of the dataset, restricted to
# the selected `categories`. The shuffle is seeded (random_state=42).
# Alternative kept for reference:
#   fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_train = load_files(
    "../data/20news-bydate-train/",
    categories=categories,
    shuffle=True,
    random_state=42,
    encoding="iso-8859-1",
)

In [71]:
# List the fields available on the loaded dataset object.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [72]:
# Show the first three lines of the first training document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

Subject: Space FAQ 03/15 - Data Sources
From: leech@cs.unc.edu (Jon Leech)
Expires: 6 May 1993 19:55:35 GMT


In [73]:
# Translate the first document's integer label into its category name.
print(twenty_train.target_names[twenty_train.target[0]])

sci.space


In [74]:
from sklearn.feature_extraction.text import CountVectorizer
# Turn the raw documents into a document-by-term count matrix.
count_vect = CountVectorizer(max_features=20000) # limit the vocabulary to `max_features` terms
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display (number of documents, vocabulary size).
X_train_counts.shape

(2251, 20000)

In [75]:
# Look up the index that the fitted vocabulary assigns to the word 'algorithm'.
count_vect.vocabulary_.get(u'algorithm')

2063

# Term Frequencies

In [76]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term frequencies only: IDF weighting is switched off (use_idf=False).
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(2251, 20000)

In [77]:
# Invert the vectorizer's word -> index mapping so that an index can be
# translated back into the word it stands for.
int2word = {
    index: word
    for word, index in count_vect.vocabulary_.items()
}
print(len(int2word))

20000


In [78]:
# Fit a TF-IDF transformer (default settings) on the raw counts and
# transform them in the same step.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2251, 20000)

In [79]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the TF-IDF features,
# using the integer category labels as targets.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [80]:
# Re-display the dimensions of the training count matrix.
X_train_counts.shape

(2251, 20000)

In [81]:
# A few unseen snippets to classify with the naive Bayes model.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Do you have cancer?',
]

# New text goes through the already-fitted vectorizer and TF-IDF
# transformer: `transform` only, nothing is refitted here.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Report each snippet next to the name of its predicted category.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => alt.atheism
'OpenGL on the GPU is fast' => comp.graphics
'Do you have cancer?' => sci.med


In [82]:
# Class probabilities for the same snippets, one row per snippet.
# NOTE(review): columns presumably follow `twenty_train.target_names` order —
# this matches the predictions printed above, but confirm via `clf.classes_`.
clf.predict_proba(X_new_tfidf)

array([[0.67368547, 0.12550557, 0.10976496, 0.091044  ],
       [0.1480633 , 0.42162331, 0.21316603, 0.21714735],
       [0.14029191, 0.11734306, 0.62045435, 0.12191068]])

In [83]:
# Category names; the integer labels in `twenty_train.target` index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'sci.space']

# Building a pipeline

In [84]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> TF-IDF -> classifier so that raw text can be passed
# straight to `fit` / `predict`. Unlike the stand-alone vectorizer used
# earlier, this CountVectorizer has no `max_features` cap.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [85]:
# Train the whole pipeline directly on the raw training documents.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

# Evaluation of performance

In [86]:
import numpy as np

# Load the held-out test split with the same categories and settings as the
# training split.
# Alternative kept for reference:
#   fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
twenty_test = load_files(
    "../data/20news-bydate-test/",
    categories=categories,
    shuffle=True,
    random_state=42,
    encoding="iso-8859-1",
)
docs_test = twenty_test.data

# Accuracy: the fraction of test documents whose predicted label matches
# the true label.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.9292389853137517

In [87]:
from sklearn.linear_model import SGDClassifier
# Rebuild the pipeline with an SGD-trained linear classifier as the final
# step. This rebinds `text_clf`, so every later cell uses this pipeline
# rather than the naive Bayes one.
# NOTE(review): `loss='log'` was renamed to `'log_loss'` in newer
# scikit-learn releases — check the installed version before upgrading.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Train on the raw training documents.
text_clf.fit(twenty_train.data, twenty_train.target)  

# Test-set accuracy; `predicted` is overwritten with this pipeline's output.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target) 

0.8978638184245661

In [88]:
from sklearn import metrics
# Per-class precision / recall / F1 for the most recent test predictions.
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))
                                        
# Confusion matrix for the same predictions; true labels are passed first,
# predicted labels second.
metrics.confusion_matrix(twenty_test.target, predicted)

               precision    recall  f1-score   support

  alt.atheism       0.95      0.88      0.92       319
comp.graphics       0.81      0.95      0.88       389
      sci.med       0.92      0.83      0.87       396
    sci.space       0.94      0.93      0.93       394

    micro avg       0.90      0.90      0.90      1498
    macro avg       0.91      0.90      0.90      1498
 weighted avg       0.90      0.90      0.90      1498



array([[281,  16,  15,   7],
       [  3, 371,   4,  11],
       [  8,  54, 328,   6],
       [  3,  17,   9, 365]])

# Tuning

In [89]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the search. Each key has the form
# `<pipeline step>__<parameter>`, matching the step names 'vect', 'tfidf'
# and 'clf' of the pipeline.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 5e-3, 1e-3),
)

In [90]:
# 5-fold cross-validated grid search over `parameters`, run in parallel (n_jobs=-1).
# NOTE(review): the `iid` argument was removed in newer scikit-learn
# releases — drop it when upgrading; verify against the installed version.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [91]:
# Run the search on the first 2000 training documents only, not the full set.
gs_clf = gs_clf.fit(twenty_train.data[:2000], twenty_train.target[:2000])

In [92]:
# The fitted search object predicts directly on raw text; map the predicted
# label of the single query back to its category name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'alt.atheism'

In [93]:
# Class probabilities for one query, listed from most to least likely.
prob = gs_clf.predict_proba(['Are you a republican'])[0]
ind = np.argsort(prob)  # indices in ascending order of probability
for i in reversed(ind):
    print("%20s : %5f " % (twenty_train.target_names[i], prob[i]))

       comp.graphics : 0.316495 
             sci.med : 0.249613 
           sci.space : 0.220432 
         alt.atheism : 0.213461 


In [94]:
# Score of the best parameter combination found by the search. It is printed
# explicitly because a notebook only echoes a cell's *final* expression; as a
# bare expression in the middle of the cell its value was silently discarded.
print(gs_clf.best_score_)

# Best value found for each searched parameter, in alphabetical key order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [95]:
# The winning parameter combination as a dict.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

# Create the model with the best parameters

In [96]:
# `set_params` only changes hyper-parameters; it does not retrain anything.
# Refit on the full training set so the fitted state actually reflects the
# selected settings rather than whatever `text_clf` was trained with before.
# Note: `set_params` and `fit` both return the pipeline itself, so `best_clf`
# and `text_clf` are the same object after this cell.
best_clf = text_clf.set_params(**gs_clf.best_params_).fit(
    twenty_train.data, twenty_train.target
)

In [97]:
# Test-set accuracy of `best_clf`: the fraction of test documents whose
# predicted label equals the true label.
predicted = best_clf.predict(docs_test)
np.mean(predicted == twenty_test.target) 

0.8978638184245661

# Saving the model

See the scikit-learn guide on model persistence: https://scikit-learn.org/stable/modules/model_persistence.html

In [99]:
import joblib

In [100]:
# Persist the pipeline to the relative path 'best_clf.joblib'.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(best_clf, 'best_clf.joblib') 

['best_clf.joblib']

In [36]:
# Scratch helper: split a whitespace-separated listing of the newsgroup
# names into a Python list.
str.split(
    "alt.atheism               comp.sys.mac.hardware  rec.motorcycles     sci.electronics         talk.politics.guns comp.graphics             comp.windows.x         rec.sport.baseball  sci.med                 talk.politics.mideast comp.os.ms-windows.misc   misc.forsale           rec.sport.hockey    sci.space               talk.politics.misc comp.sys.ibm.pc.hardware  rec.autos              sci.crypt           soc.religion.christian  talk.religion.misc"
)

['alt.atheism',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'sci.electronics',
 'talk.politics.guns',
 'comp.graphics',
 'comp.windows.x',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'comp.os.ms-windows.misc',
 'misc.forsale',
 'rec.sport.hockey',
 'sci.space',
 'talk.politics.misc',
 'comp.sys.ibm.pc.hardware',
 'rec.autos',
 'sci.crypt',
 'soc.religion.christian',
 'talk.religion.misc']