# Pipeline

                                Data Collection
                                        |
                                        |
                                CountVectorizer
                                        |
                                        |
                                TfidfTransformer
                                        |
                                        |
                        SGDClassifier and MultinomialNB
                                        |
                                        |
                                  GridSearchCV   
                                        |
                                        |
                               Model Performance


## 1. Data Collection

In [2]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset = 'train', shuffle = True)

In [3]:
twenty_train.target_names #prints all the categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
print("\n".join(twenty_train.data[0].split("\n")[:3])) #prints first line of the first data file

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


## 2. Data Preprocessing
### 2.1 CountVectorizer


Convert a collection of text documents to a matrix of token counts.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 130107)

In [6]:
count_vect.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '00000000',
 '0000000004',
 '0000000005',
 '00000000b',
 '00000001',
 '00000001b',
 '0000000667',
 '00000010',
 '00000010b',
 '00000011',
 '00000011b',
 '0000001200',
 '00000074',
 '00000093',
 '000000e5',
 '00000100',
 '00000100b',
 '00000101',
 '00000101b',
 '00000110',
 '00000110b',
 '00000111',
 '00000111b',
 '00000315',
 '000005102000',
 '00000510200001',
 '000007',
 '00000ee5',
 '00001000',
 '00001000b',
 '00001001',
 '00001001b',
 '00001010',
 '00001010b',
 '00001011',
 '00001011b',
 '000010af',
 '00001100',
 '00001100b',
 '00001101',
 '00001101b',
 '00001110',
 '00001110b',
 '00001111',
 '00001111b',
 '000021',
 '000042',
 '000062david42',
 '000094',
 '0000vec',
 '0001',
 '00010000',
 '00010000b',
 '00010001',
 '00010001b',
 '00010010',
 '00010010b',
 '00010011',
 '00010011b',
 '000100255pixel',
 '00010100',
 '00010100b',
 '00010101',
 '00010101b',
 '00010110',
 '00010110b',
 '00010111',
 '00010111b',
 '00011000',
 '00011000b',
 '00

In [7]:
iter(X_train_counts)

<generator object __iter__ at 0x7fc14634ef10>

In [8]:
print(X_train_counts.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### 2.2 TfidfTransformer

Transform a count matrix to a normalized tf or tf-idf representation

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

## 3. Text Classification

### 3.1 MultinomialNB

In [10]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [11]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('clf',MultinomialNB())])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [12]:
import numpy as np

twenty_test = fetch_20newsgroups(subset = 'test', shuffle = True)
predicted = text_clf.predict(twenty_test.data)

np.mean(predicted == twenty_test.target)

0.7738980350504514

### 3.2 SGDClassifier

In [13]:
from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline([('vect', CountVectorizer()),('tfidf',TfidfTransformer()),
                         ('clf-svm',SGDClassifier(loss = 'hinge', 
                                                  penalty = 'l2', 
                                                  alpha = 1e-3,max_iter = 5,
                                                 random_state = 42))])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)



0.8248805098247477

## 4. GridSearch

Find the best parameters

### 4.1 GridSearch for MultinomialNB

In [14]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range':[(1,1),(1,2)],
              'tfidf__use_idf':(True,False),
              'clf__alpha':(1e-2,1e-3)}

In [15]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs = -1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)



In [16]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)
# Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.

0.9157684864695698
{'clf__alpha': 0.001, 'vect__ngram_range': (1, 2), 'tfidf__use_idf': True}


### 4.2 GridSearch for MultinomialNB

In [17]:
from sklearn.model_selection import GridSearchCV

parameters_svm = {'vect__ngram_range':[(1,1),(1,2)],
                 'tfidf__use_idf':(True,False),
                 'clf-svm__alpha':(1e-2,1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs = -1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.9047198366213406
{'clf-svm__alpha': 0.001, 'vect__ngram_range': (1, 2), 'tfidf__use_idf': True}




In [18]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect',CountVectorizer(stop_words = 'english')),
                    ('tfidf', TfidfTransformer()),
                    ('clf',MultinomialNB())])


In [19]:
# Try NLTK

import nltk
nltk.download()

# Removing stop words
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords = True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc:([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words = 'english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                            ('tfidf',TfidfTransformer()),
                            ('mnb',MultinomialNB(fit_prior = False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

np.mean(predicted_mnb_stemmed == twenty_test.target)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


0.8167817312798725

## 5 Model Performance 

### 5.1 Cross Validation

In [20]:
from sklearn.model_selection import cross_val_score

cross_val_score(gs_clf,twenty_train.data, twenty_train.target, cv=3, scoring = "accuracy" )

array([0.90562036, 0.9032087 , 0.90559533])

All accuracy larger than 90%.

### 5.2 Confusion Matrix

In [21]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_train_pred = cross_val_predict(gs_clf,twenty_train.data, twenty_train.target, cv=3)


confusion_matrix(twenty_train.target, y_train_pred)

array([[442,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,  19,   2,   1,   1,  14],
       [  0, 487,  18,  25,   8,  18,  11,   3,   1,   0,   0,   1,   1,
          2,   4,   1,   2,   1,   0,   1],
       [  0,  25, 487,  41,   7,  13,   7,   0,   0,   1,   0,   3,   6,
          0,   1,   0,   0,   0,   0,   0],
       [  0,  16,  28, 489,  27,   1,   9,   1,   0,   0,   0,   3,  13,
          1,   1,   0,   0,   0,   0,   1],
       [  1,   6,   7,  30, 501,   7,   8,   1,   3,   2,   0,   5,   6,
          1,   0,   0,   0,   0,   0,   0],
       [  0,  25,   7,   7,   4, 533,   3,   2,   2,   1,   1,   4,   1,
          1,   1,   1,   0,   0,   0,   0],
       [  0,   7,   9,  29,  10,   3, 464,  23,   7,   5,   3,   2,  13,
          1,   4,   1,   3,   0,   0,   1],
       [  1,   4,   0,   2,   3,   1,  10, 535,  19,   3,   0,   0,   9,
          0,   4,   0,   2,   0,   1,   0],
       [  0,   2,   0,   2,   1,   1,   6,   6, 571,   0,   1,  

* row represents the actual category
* col represents the predicted category
