Loading the dataset - training data

In [2]:
from sklearn.datasets import fetch_20newsgroups

ModuleNotFoundError: No module named 'numpy.testing.decorators'

In [2]:
# Fetch the training split of the 20 Newsgroups dataset, with shuffling enabled.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

Check the target names (categories) and some data files

In [3]:
# Display the category (newsgroup) names of the loaded dataset.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Preview the first three lines of the first training document.
first_doc_lines = twenty_train.data[0].split("\n")
print(*first_doc_lines[:3], sep="\n")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


In [7]:
# Side note: str.join places the separator between the items of an iterable;
# iterating over a string yields its characters one by one.
print(":".join("Python"))

P:y:t:h:o:n


In [8]:
# Same preview as cell In [4]: the first three lines of the first training document.
header_lines = twenty_train.data[0].split("\n")[:3]
print("\n".join(header_lines))

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


In [9]:
# Extracting features from text files
# CountVectorizer converts the raw documents into a document-term matrix.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# fit_transform builds the vocabulary from the training documents and returns their matrix.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Shape is (number of documents, number of vocabulary terms).
X_train_counts.shape

(11314, 130107)

In [10]:
# TF-IDF
# Re-weight the raw counts in X_train_counts with a TF-IDF transformer.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The transformation keeps the matrix shape unchanged (same documents, same terms).
X_train_tfidf.shape

(11314, 130107)

In [11]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
# Inputs are the TF-IDF features; labels are the integer targets of the training split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [12]:
# Building a pipeline: we can write less code and do all of the above by building a pipeline as
# follows.
# The names 'vect', 'tfidf', 'clf' are arbitrary but will be used later
# We will be using the 'text_clf' going forward.

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
# Chain vectorizer -> TF-IDF transformer -> Naive Bayes classifier into one estimator.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [15]:
# Fit the whole pipeline directly on the raw training documents and their labels.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [16]:
# Performance of NB Classifier
import numpy as np
# Load the test split and predict its labels with the fitted NB pipeline.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
# Accuracy: fraction of test documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

0.7738980350504514

In [17]:
# Training Support Vector Machines - SVM and calculating its performance
from sklearn.linear_model import SGDClassifier
# SGDClassifier with hinge loss and an L2 penalty is used as the SVM step.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, where passing it
# raises a TypeError. `max_iter=5` together with `tol=None` (no early stopping) runs
# the same fixed five passes over the training data.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge',
                                                   penalty='l2',
                                                   alpha=1e-3,
                                                   max_iter=5,
                                                   tol=None,
                                                   random_state=42))])

In [18]:
# Fit the SVM pipeline in place (fit returns the same estimator), then score the test split.
text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
# Accuracy: fraction of test documents whose predicted label equals the true label.
(predicted_svm == twenty_test.target).mean()



0.8238183749336165

Grid Search
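Create a dictionary of the parameters we would like to tune.
Every parameter name starts with the name of its pipeline step (remember the arbitrary names we gave: 'vect', 'tfidf', 'clf'), followed by a double underscore and the parameter's own name, e.g. 'vect__ngram_range'.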

In [19]:
from sklearn.model_selection import GridSearchCV
# Search grid for the NB pipeline; each key is '<pipeline step>__<parameter name>'.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [None]:
# Grid search over the NB pipeline using the `parameters` grid.
# n_jobs=-1 tells it to use multiple cores from the user's machine.
# fit() returns the search object itself, so construction and fitting can be chained.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)

In [None]:
# To see the best mean score and the params, run the following code
# (requires the grid-search cell above to have been executed so that gs_clf is fitted).
gs_clf.best_score_

In [None]:
# Parameter combination that produced the best score in the NB grid search.
gs_clf.best_params_

------------

In [25]:
# Similarly doing Grid Search for SVM
from sklearn.model_selection import GridSearchCV
# Keys are '<pipeline step>__<parameter name>'; the classifier step of text_clf_svm
# is named 'clf-svm', hence the 'clf-svm__alpha' key.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}
# fit() returns the search object itself, so construction and fitting can be chained.
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)



In [26]:
# Best mean score found by the SVM grid search.
gs_clf_svm.best_score_

0.8979140887396146

In [27]:
# Parameter combination that produced the best score in the SVM grid search.
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

----------

In [28]:
# NLTK
# Removing stop words
# The stop-word filtering here is done by CountVectorizer itself (stop_words='english').
# NOTE(review): this rebinds `text_clf` to a new, unfitted pipeline, replacing the
# fitted one created earlier in the notebook.
from sklearn.pipeline import Pipeline
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [29]:
# Stemming Code
import nltk
# Download only the 'stopwords' corpus, which SnowballStemmer(ignore_stopwords=True) needs.
# A bare nltk.download() opens the interactive downloader and blocks a Restart & Run All.
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [30]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer, configured with ignore_stopwords=True.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

In [34]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    Relies on the module-level ``stemmer`` object, which is looked up each
    time the returned analyzer is called.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to the list of its stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stem each token yielded by the inherited analyzer.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [35]:
# Stemming vectorizer instance, created with the 'english' stop-word setting.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [36]:
# NB pipeline built on the stemming vectorizer; the classifier is created with fit_prior=False.
text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

In [37]:
# Fit the stemmed NB pipeline on the raw training documents and their labels.
text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

In [39]:
# Predict labels for the test split with the fitted stemmed NB pipeline.
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

In [40]:
# Accuracy: fraction of test documents whose predicted label equals the true label.
(predicted_mnb_stemmed == twenty_test.target).mean()

0.8167817312798725