## Importing the 20newsgroup training data and Other Libraries


In [4]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [5]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [6]:
#Printing all the categories 

from pprint import pprint
pprint(list(twenty_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


## Extracting features from text files
#### In order to perform machine learning on text documents, need to turn the text into numerical feature vectors.

### Tokenizing Text wit Scikit - Learn

In [7]:
#CountVectorizer: It includes Tokenizing and filtering of stopwords
#Builds a dictionary of features and transforms documents to feature vectors

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 130107)

### TF-IDF Vectors

In [8]:
#TfidfTransformer(): Computes both Term frequency and Inverse document frequency
#fit() method to fit estimator to the data and transform() method to transform count-matrix to a tf-idf representation

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [9]:
#Transforming text to feature vectors that can be used as input to estimator using TfidfVectorizer()

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(twenty_train.data)
vectors.shape

(11314, 130107)

###  Training a Classifier 
#### MultinomialNB classifier showing accuracy of 93.26% and F1 score of 0.75 when Hyperparameter is 1.0.


In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
twenty_test = fetch_20newsgroups(subset='test',
                                     )
vectors_test = vectorizer.transform(twenty_test.data)
clf = MultinomialNB(alpha=1.0)
clf.fit(vectors, twenty_train.target)
pred = clf.predict(vectors_test)
acc_MultinomialNB = round(clf.score(vectors, twenty_train.target)* 100, 2)
print("MultinomialNB accuracy is:", acc_MultinomialNB)
F1_acc_MultinomialNB = (metrics.f1_score(twenty_test.target, pred, average='macro'))
print("F1 score for MultinomialNB is:", F1_acc_MultinomialNB )

MultinomialNB accuracy is: 93.26
F1 score for MultinomialNB is: 0.7557542971333199


### Building a pipeline

In [18]:
#Pipeline class behaves like a compound Classifier

text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [19]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [22]:
#Evaluating the predictive accuracy of the model

import numpy as np
twenty_test = fetch_20newsgroups(subset='test')
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8248805098247477

## Q1) a) Apply SVM algorithm and see how accuracy changes: 
### The accuracy with SVM algorithm is 82.49%

In [23]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)
round(np.mean(predicted == twenty_test.target)*100,2)

82.49

## Classification_report for SGDClassifier

In [24]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))


                          precision    recall  f1-score   support

             alt.atheism       0.73      0.71      0.72       319
           comp.graphics       0.78      0.72      0.75       389
 comp.os.ms-windows.misc       0.73      0.78      0.75       394
comp.sys.ibm.pc.hardware       0.74      0.67      0.70       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.84      0.76      0.80       395
            misc.forsale       0.84      0.90      0.87       390
               rec.autos       0.91      0.90      0.90       396
         rec.motorcycles       0.93      0.96      0.95       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.88      0.99      0.93       399
               sci.crypt       0.84      0.96      0.90       396
         sci.electronics       0.83      0.62      0.71       393
                 sci.med       0.87      0.86      0.87       396
         

 ## b) Apply KNeighborsClassifier and see how accuracy changes.
 ### The accuracy with KNeighborsClassifier is 65.79%, (n_neighbors = 3)

In [71]:
from sklearn.neighbors import KNeighborsClassifier

text_knn = Pipeline([print 
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors = 3)),
])

text_knn.fit(twenty_train.data, twenty_train.target)

predicted = text_knn.predict(docs_test)
round(np.mean(predicted == twenty_test.target)*100,2)



65.79

## Classification_report for KNeighborsClassifier

In [72]:
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.43      0.75      0.54       319
           comp.graphics       0.45      0.65      0.53       389
 comp.os.ms-windows.misc       0.52      0.56      0.54       394
comp.sys.ibm.pc.hardware       0.52      0.62      0.57       392
   comp.sys.mac.hardware       0.55      0.58      0.56       385
          comp.windows.x       0.69      0.59      0.64       395
            misc.forsale       0.59      0.48      0.53       390
               rec.autos       0.75      0.68      0.71       396
         rec.motorcycles       0.83      0.80      0.82       398
      rec.sport.baseball       0.77      0.75      0.76       397
        rec.sport.hockey       0.88      0.82      0.85       399
               sci.crypt       0.74      0.83      0.78       396
         sci.electronics       0.72      0.47      0.57       393
                 sci.med       0.78      0.51      0.62       396
         

### Accuracy with KNeighborsClassifier when n_neighbors = 10, is 65.37% and the Classification_report

In [73]:
from sklearn.neighbors import KNeighborsClassifier

text_knn = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors = 10)),
])

text_knn.fit(twenty_train.data, twenty_train.target)

predicted = text_knn.predict(docs_test)
round(np.mean(predicted == twenty_test.target)*100,2)


65.37

In [74]:
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.45      0.76      0.56       319
           comp.graphics       0.57      0.57      0.57       389
 comp.os.ms-windows.misc       0.63      0.60      0.61       394
comp.sys.ibm.pc.hardware       0.56      0.59      0.57       392
   comp.sys.mac.hardware       0.59      0.50      0.54       385
          comp.windows.x       0.69      0.62      0.65       395
            misc.forsale       0.64      0.43      0.51       390
               rec.autos       0.75      0.69      0.72       396
         rec.motorcycles       0.81      0.78      0.79       398
      rec.sport.baseball       0.71      0.73      0.72       397
        rec.sport.hockey       0.82      0.85      0.83       399
               sci.crypt       0.73      0.83      0.78       396
         sci.electronics       0.66      0.45      0.53       393
                 sci.med       0.82      0.46      0.59       396
         

## d) Set the tfidf vectorizer parameter to use bigram and see how the accuracy changes.
#### Classifier choosen: MultinomialNB(Accuracy = 98.42%) and KNeighborsClassifier(n_neighbors = 3) (Accuracy = 89.82%)

In [103]:
# Performance of NB Classifier with ngram_range = (2,2)
text_clf = Pipeline([
        ('vect', TfidfVectorizer(ngram_range = (2,2))), 
        ('clf', MultinomialNB())])  

text_clf.fit(twenty_train.data, twenty_train.target)
round(text_clf.score(twenty_train.data, twenty_train.target)*100, 2)

98.42

In [105]:
# Performance of KNeighborsClassifier with ngram_range = (2,2)
text_clf = Pipeline([
        ('vect', TfidfVectorizer(ngram_range = (2,2))), 
        ('clf',KNeighborsClassifier(n_neighbors = 3))])  
text_clf.fit(twenty_train.data, twenty_train.target)
round(text_clf.score(twenty_train.data, twenty_train.target)*100, 2)

89.82

### e) Set tfidf vectorizer argument to use stop_words='english' and see how accuracy changes: KNeighborsClassifier(n_neighbors = 3) Accuracy = 92.35%

In [102]:
text_clf = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english', binary = True, sublinear_tf=True)), 
        ('clf',KNeighborsClassifier(n_neighbors = 3))])  

text_clf.fit(twenty_train.data, twenty_train.target)
round(text_clf.score(twenty_train.data, twenty_train.target)*100, 2)


92.35