In [0]:
# Load the training split of the 20 Newsgroups corpus.
import numpy as np
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)

In [0]:
# Print the dataset description so its line breaks render as text;
# a bare expression would display the escaped repr of one long string.
print(twenty_train.DESCR)

In [39]:
# Category (newsgroup) names in the dataset: these are the class labels we will predict.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [40]:
# Inspect a single observation: the first training document, split on newlines for readability.
# As the output shows, a document is a whole post, header lines (From, Subject, ...) included.
twenty_train.data[0].split('\n')

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu',
 'Organization: University of Maryland, College Park',
 'Lines: 15',
 '',
 ' I was wondering if anyone out there could enlighten me on this car I saw',
 'the other day. It was a 2-door sports car, looked to be from the late 60s/',
 'early 70s. It was called a Bricklin. The doors were really small. In addition,',
 'the front bumper was separate from the rest of the body. This is ',
 'all I know. If anyone can tellme a model name, engine specs, years',
 'of production, where this car is made, history, or whatever info you',
 'have on this funky looking car, please e-mail.',
 '',
 'Thanks,',
 '- IL',
 '   ---- brought to you by your neighborhood Lerxst ----',
 '',
 '',
 '',
 '',
 '']

In [41]:
# Sanity check: there must be exactly one label per document.
print('X_train length', len(twenty_train.data))
print('y_train length', len(twenty_train.target))

# Both lengths are 11314, so the training split holds 11314 datapoints in total.

X_train length 11314
y_train length 11314


In [42]:
# CountVectorizer creates one vector per datapoint (here: per document, not per sentence);
# each position in the vector holds the count of one vocabulary word in that document.


# Extracting features from the text documents
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(twenty_train.data)
X_train_counts.shape

# The shape means we have 11314 vectors in total, each of length 130107 (the vocabulary size).
# How it works: all documents are split into words and every distinct word is assigned a unique id, i.e. {'word': id}.
# Each document's vector then stores, at each id, how often that word occurs in the document: 1 if it occurs once, 2 if twice, and so on (0 if absent).


(11314, 130107)

In [43]:
# Show the count vector of the first document.
# Select the row while the matrix is still sparse and densify only that row:
# calling .toarray() on the full (11314, 130107) matrix first would allocate a
# dense array of roughly 1.47 billion entries just to display one of its rows.
print(X_train_counts[0].toarray().ravel())

[0 0 0 ... 0 0 0]


In [44]:
# TF-IDF: re-weight the raw word counts by term frequency / inverse document frequency

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()   
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)      # the input must be the count matrix produced by CountVectorizer
X_train_tfidf.shape

# The shape of this sparse matrix is the same as X_train_counts; only the values change: they are normalized and lie between 0 and 1.

(11314, 130107)

In [45]:
# Show the TF-IDF vector of the first document.
# Select the row while the matrix is still sparse and densify only that row:
# calling .toarray() on the full (11314, 130107) matrix first would allocate a
# dense float array of roughly 1.47 billion entries just to display one row.
print(X_train_tfidf[0].toarray().ravel())

[0. 0. 0. ... 0. 0. 0.]


In [0]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [47]:
# Load the test split and check the model's performance on it.
#
# The test documents must go through the vectorizer and TF-IDF transformer that
# were already fitted on the training data, so use .transform here and not
# .fit_transform: the vocabulary is fixed by the training set (the vocabulary of
# the test set cannot be known at training time, after all).
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test_count = count_vec.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_count)
predictions = clf.predict(X_test_tfidf)

# Accuracy: fraction of test documents whose predicted label equals the true label.
(predictions == twenty_test.target).mean()

0.7738980350504514

In [48]:
# Print the shapes of both test feature matrices and the number of test documents, one per line.
print(X_test_count.shape, X_test_tfidf.shape, len(twenty_test.data), sep='\n')

# So the testing set contains 7532 documents / datapoints.

(7532, 130107)
(7532, 130107)
7532


In [72]:
# Number of category names in the training data
len(twenty_train.target_names)

# So there are 20 different categories the model can output.

20

In [49]:
# Building a pipeline: all of the steps above (count vectors -> TF-IDF -> classifier)
# can be chained into a single estimator, which needs far less code.
# The step names 'vect', 'tfidf' and 'clf' are arbitrary, but they are used again
# later to address each step's parameters.
# 'text_clf' is the object used going forward.

from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# Performance of the NB classifier on the test split
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.7738980350504514

In [54]:
# Training with an SVM classifier.
# Note: in scikit-learn, SGDClassifier with loss='hinge' fits a linear SVM
# using stochastic gradient descent.

from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',      # hinge loss is the loss used for max-margin classifiers
        penalty='l2',      # L2 (ridge-style) regularization
        alpha=1e-3,        # regularization strength
        max_iter=5,        # number of passes over the training data; replaces `n_iter`, which was removed from scikit-learn
        tol=None,          # no early stopping, so exactly max_iter passes run (the old n_iter behaviour)
        random_state=42,   # fixed seed so the result is reproducible
    )),
])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# performance evaluation on the test split
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)
       



0.8238183749336165

In [70]:
# Peek at the SVM predictions: one integer class label per test document.
predicted_svm

array([ 7,  1,  0, ...,  9,  3, 15])

**We can see that test accuracy improved from about 77.4% to about 82.4% after moving from Naive Bayes to a linear SVM.**

In [63]:
# Using grid search to optimize the model: Naive Bayes

# Here we list the parameter values we would like to try during tuning.
# Every parameter name starts with the name of the pipeline step it belongs to
# (remember the arbitrary names we gave), followed by two underscores.
# E.g. vect__ngram_range: try unigrams only as well as unigrams + bigrams, and keep whichever is better.

# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # CountVectorizer
    'tfidf__use_idf': (True, False),         # TfidfTransformer
    'clf__alpha': (1e-2, 1e-3),              # MultinomialNB
}

# Next, create the grid search from the pipeline and the parameter grid;
# n_jobs=-1 tells it to use all available cores of the machine.
# Note: text_clf was already fitted in a previous step. That does not matter here:
# GridSearchCV clones the estimator and fits fresh copies, so an unfitted
# Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
# could be passed just as well.
gs_clf = GridSearchCV(text_clf, parameter_grid, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

print(gs_clf.best_params_)
print(gs_clf.best_score_)

# best_score_ is the mean cross-validated accuracy on the *training* data (about 90.6% in the
# recorded run). It is not directly comparable with the test-set accuracy computed earlier.

{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
0.9067526957751458


In [0]:
# Grid search over the SVM pipeline; the classifier step is named 'clf-svm',
# so its parameters are addressed as 'clf-svm__<param>'.

parameter_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameter_grid, n_jobs=-1).fit(
    twenty_train.data,
    twenty_train.target,
)

print(gs_clf_svm.best_params_)
print(gs_clf_svm.best_score_)

# An earlier run reported a best score of 89.79% here.

**Step 6: Useful tips and a touch of NLTK.**
1. Removing stop words (the, then, etc.) from the data. You should do this only when stop words are not useful for the underlying problem. In most text classification problems, stop words are indeed not useful. Let's see if removing them increases the accuracy. Update the code that creates the CountVectorizer object as follows:

In [68]:
# Removing English stop words in the Naive Bayes pipeline, then re-running the grid search
from sklearn.pipeline import Pipeline
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV


text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),   # drop common English stop words
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameter_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(text_clf, parameter_grid, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

# best_score_ is the mean cross-validated accuracy on the training data.
print(gs_clf.best_params_)
print(gs_clf.best_score_)

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
0.9057804490012374


In [73]:
# Removing English stop words in the SVM pipeline, then re-running the grid search

from sklearn.pipeline import Pipeline
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV


text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),   # drop common English stop words
    ('tfidf', TfidfTransformer()),
    # SGD shuffles the training data, so fix the seed (same value as the earlier
    # SVM pipeline) to make the search reproducible from run to run.
    ('clf-svm', SGDClassifier(random_state=42)),
])

parameter_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameter_grid, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)

print(gs_clf_svm.best_params_)
print(gs_clf_svm.best_score_)

# best_score_ is the mean cross-validated accuracy on the training data;
# the recorded (unseeded) run reported 89.62%.



{'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
0.8962347534028637
