# Text analysis tutorial

In [1]:
# The twenty newsgroup names to load, five per row (order is preserved).
categories = [
    'alt.atheism', 'comp.sys.mac.hardware', 'rec.motorcycles',
    'sci.electronics', 'talk.politics.guns',
    'comp.graphics', 'comp.windows.x', 'rec.sport.baseball',
    'sci.med', 'talk.politics.mideast',
    'comp.os.ms-windows.misc', 'misc.forsale', 'rec.sport.hockey',
    'sci.space', 'talk.politics.misc',
    'comp.sys.ibm.pc.hardware', 'rec.autos', 'sci.crypt',
    'soc.religion.christian', 'talk.religion.misc',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
from sklearn.datasets import load_files

# Read the training split from a local copy of the corpus instead of
# calling fetch_20newsgroups(subset='train', ...).
twenty_train = load_files(
    container_path="../data/twenty_newsgroups/20news-bydate-train/",
    categories=categories,
    encoding="iso-8859-1",
    shuffle=True,
    random_state=42,
)

In [4]:
# List the fields exposed by the loaded dataset container.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
# Show only the first three lines of the first training document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: wtm@uhura.neoucom.edu (Bill Mayhew)
Subject: Re: How to the disks copy protected.
Organization: Northeastern Ohio Universities College of Medicine


In [6]:
# Translate the first document's integer target into its newsgroup name.
print(twenty_train.target_names[twenty_train.target[0]])

sci.electronics


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts; `max_features` caps the vocabulary at 10,000 terms.
count_vect = CountVectorizer(max_features=10000)
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Displayed as the cell output: (number of documents, vocabulary size).
X_train_counts.shape

(11314, 10000)

In [8]:
# Column index assigned to the token "algorithm" (None if it is not in the vocabulary).
count_vect.vocabulary_.get('algorithm')

895

# Term Frequencies

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False: term frequencies only, without inverse-document-frequency weighting.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

X_train_tf.shape

(11314, 10000)

In [11]:
# Invert the fitted vocabulary (word -> column index) into column index -> word.
int2word = dict(
    (index, word) for word, index in count_vect.vocabulary_.items()
)
print(len(int2word))

10000


In [12]:
# Learn the idf weights from the count matrix, then apply them to the same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

X_train_tfidf.shape

(11314, 10000)

In [13]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the tf-idf features of the training set.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target);  # trailing ';' keeps the cell output empty

In [14]:
# Re-display the shape of the count matrix built by count_vect.fit_transform.
X_train_counts.shape

(11314, 10000)

In [15]:
docs_new = ['God is love', 'OpenGL on the GPU is fast', 'Do you have cancer?']

# Push the unseen texts through the already-fitted vectorizer and tf-idf
# transformer (transform only, no re-fitting) before classifying them.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Report each text next to the name of its predicted newsgroup.
for text, label in zip(docs_new, predicted):
    print('%r => %s' % (text, twenty_train.target_names[label]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos
'Do you have cancer?' => sci.med


In [16]:
# Class-probability estimates for the new documents: one row per document,
# one column per class.
clf.predict_proba(X_new_tfidf)

array([[0.10498179, 0.01888494, 0.01804034, 0.02425729, 0.01223267,
        0.01444002, 0.01629914, 0.0251331 , 0.02808182, 0.01970591,
        0.01531814, 0.01247666, 0.01132002, 0.01552304, 0.01265812,
        0.46757863, 0.02596826, 0.02229762, 0.02684049, 0.10796201],
       [0.02174818, 0.09954744, 0.0572469 , 0.09007862, 0.06895243,
        0.05487108, 0.02386554, 0.12598259, 0.08188731, 0.02901914,
        0.02864597, 0.05735737, 0.04856215, 0.0393438 , 0.03743056,
        0.03732151, 0.03308784, 0.02481849, 0.02453403, 0.01569905],
       [0.0388627 , 0.0382329 , 0.03964696, 0.04597281, 0.04388186,
        0.03833946, 0.02785334, 0.04341352, 0.04147131, 0.0379089 ,
        0.03462979, 0.04480947, 0.03977167, 0.24674547, 0.03649542,
        0.05117107, 0.04230305, 0.04090467, 0.03998116, 0.02760447]])

In [17]:
# Newsgroup names in the order used to index them by integer target.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# Building a pipeline

In [18]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier so raw text can be fitted and
# predicted with a single estimator.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [19]:
# Fit every pipeline step on the raw training texts; the fitted pipeline's
# repr is shown as the cell output.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

# Evaluation of performance

In [20]:
import numpy as np

# Held-out test split, restricted to the same categories as the training data.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.7738980350504514

In [37]:
from sklearn.linear_model import SGDClassifier

# Same feature pipeline as before, but with an SGD-trained logistic-loss
# classifier as the final step.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' and
# reject 'log' — confirm against the installed version before upgrading.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss='log',
                penalty='l2',
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)

# Accuracy on the test split.
np.mean(predicted == twenty_test.target)

0.7498672331386086

In [38]:
from sklearn import metrics

# Per-class precision / recall / F1, labelled with the newsgroup names.
print(
    metrics.classification_report(
        y_true=twenty_test.target,
        y_pred=predicted,
        target_names=twenty_test.target_names,
    )
)

# Confusion matrix of true labels against predicted labels (the cell's displayed value).
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.48      0.59       319
           comp.graphics       0.77      0.66      0.71       389
 comp.os.ms-windows.misc       0.71      0.77      0.74       394
comp.sys.ibm.pc.hardware       0.69      0.69      0.69       392
   comp.sys.mac.hardware       0.84      0.72      0.77       385
          comp.windows.x       0.80      0.72      0.76       395
            misc.forsale       0.58      0.89      0.70       390
               rec.autos       0.86      0.87      0.87       396
         rec.motorcycles       0.91      0.92      0.92       398
      rec.sport.baseball       0.83      0.84      0.83       397
        rec.sport.hockey       0.87      0.96      0.91       399
               sci.crypt       0.78      0.89      0.83       396
         sci.electronics       0.85      0.49      0.62       393
                 sci.med       0.88      0.69      0.77       396
         

array([[153,   1,   0,   0,   0,   0,   8,   1,   3,   5,   0,   1,   1,
          8,   1, 115,   1,  16,   0,   5],
       [  1, 257,  19,  12,   7,  27,  22,   1,   1,   3,   2,  11,   1,
          1,   9,  12,   2,   1,   0,   0],
       [  2,  10, 302,  24,   5,   8,   7,   1,   1,   5,   0,   7,   1,
          1,   4,  13,   2,   1,   0,   0],
       [  1,   4,  34, 272,  13,   4,  20,   7,   1,   2,   1,   8,  13,
          0,   6,   5,   1,   0,   0,   0],
       [  0,   3,  12,  37, 277,   3,  24,   1,   3,   5,   1,   4,   5,
          0,   3,   5,   1,   1,   0,   0],
       [  0,  27,  44,   4,   3, 284,  11,   1,   0,   3,   0,   7,   0,
          0,   6,   3,   2,   0,   0,   0],
       [  0,   0,   1,  18,   5,   0, 349,   5,   1,   1,   2,   0,   2,
          1,   1,   4,   0,   0,   0,   0],
       [  1,   1,   1,   2,   1,   2,  20, 346,   4,   2,   0,   0,   6,
          0,   5,   0,   4,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,   8,  10, 368,   2,   0,  

# Tuning

In [39]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid; keys use the pipeline's `<step name>__<parameter>` form.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),         # with / without idf weighting
    clf__alpha=(1e-2, 5e-3, 1e-3),        # classifier's alpha values to try
)

In [40]:
# 5-fold cross-validated grid search over `parameters`; n_jobs=-1 runs it in parallel.
# NOTE(review): the `iid` argument no longer exists in recent scikit-learn
# releases — drop it when upgrading.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    iid=False,
    n_jobs=-1,
)

In [41]:
# Run the search on the first 2000 training documents only
# (presumably to keep the search fast — the full set is not used here).
gs_clf = gs_clf.fit(twenty_train.data[:2000], twenty_train.target[:2000])

In [42]:
# Classify a single sentence with the fitted grid search and show the
# predicted newsgroup's name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [56]:
# Class probabilities for one sentence, printed from most to least likely.
prob = gs_clf.predict_proba(['I am selling this crap.'])[0]
ind = np.argsort(prob)

# Several class names are longer than 20 characters, so a fixed "%20s"
# column breaks the alignment; pad to the longest name instead (never
# narrower than the original 20 columns).
name_width = max([20] + [len(name) for name in twenty_train.target_names])

for i in ind[::-1]:
    print("%*s : %5f " % (name_width, twenty_train.target_names[i], prob[i]))

        misc.forsale : 0.074285 
      comp.windows.x : 0.057950 
           rec.autos : 0.057457 
comp.os.ms-windows.misc : 0.055891 
       comp.graphics : 0.055244 
comp.sys.ibm.pc.hardware : 0.054361 
    rec.sport.hockey : 0.054028 
  rec.sport.baseball : 0.051734 
             sci.med : 0.051517 
     rec.motorcycles : 0.050990 
     sci.electronics : 0.050805 
           sci.space : 0.049349 
comp.sys.mac.hardware : 0.048766 
  talk.politics.guns : 0.044031 
soc.religion.christian : 0.043190 
           sci.crypt : 0.043150 
         alt.atheism : 0.040467 
  talk.politics.misc : 0.040148 
talk.politics.mideast : 0.038906 
  talk.religion.misc : 0.037731 


In [57]:
# A bare expression is only displayed when it is the LAST statement of a
# cell, so the score has to be printed explicitly to show up in the output.
print("best score: %r" % gs_clf.best_score_)

# Parameter combination that achieved that score, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [58]:
# Best parameter combination found by the grid search, as a dict.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

# Create the model with the best parameters

In [59]:
from sklearn.base import clone

# set_params() only rewrites hyper-parameters on the SAME object and does not
# refit, so calling it on the already-fitted `text_clf` would alias it and
# leave its fitted state out of sync with the new settings.
# Build a fresh, unfitted copy with the best parameters and train it on the
# full training set. (gs_clf.best_estimator_ was fitted only on the data
# passed to gs_clf.fit.)
best_clf = clone(text_clf).set_params(**gs_clf.best_params_)
best_clf.fit(twenty_train.data, twenty_train.target);  # ';' suppresses the repr output

In [60]:
# Accuracy of the tuned pipeline on the test split.
predicted = best_clf.predict(docs_test)
np.mean(twenty_test.target == predicted)

0.7498672331386086

# Saving the model

See the scikit-learn guide on model persistence: https://scikit-learn.org/stable/modules/model_persistence.html

In [31]:
import joblib

In [32]:
# Serialise the pipeline to 'best_clf.joblib' (relative path, so it lands in
# the current working directory).
joblib.dump(best_clf, 'best_clf.joblib') 

['best_clf.joblib']

In [36]:
# Scratch cell: split a whitespace-separated table of newsgroup names into a
# Python list (yields the same names, in the same order, as `categories`).
"alt.atheism               comp.sys.mac.hardware  rec.motorcycles     sci.electronics         talk.politics.guns comp.graphics             comp.windows.x         rec.sport.baseball  sci.med                 talk.politics.mideast comp.os.ms-windows.misc   misc.forsale           rec.sport.hockey    sci.space               talk.politics.misc comp.sys.ibm.pc.hardware  rec.autos              sci.crypt           soc.religion.christian  talk.religion.misc".split() 

['alt.atheism',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'sci.electronics',
 'talk.politics.guns',
 'comp.graphics',
 'comp.windows.x',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'comp.os.ms-windows.misc',
 'misc.forsale',
 'rec.sport.hockey',
 'sci.space',
 'talk.politics.misc',
 'comp.sys.ibm.pc.hardware',
 'rec.autos',
 'sci.crypt',
 'soc.religion.christian',
 'talk.religion.misc']