In [17]:
#import 20newsgroups datasets from sklearn
import numpy
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',remove=(['headers','footers', 'quotes']))

In [18]:
#list of categories(labels)
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
#counts the occurence of each word. 
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
# count_vect.vocabulary_.get(u'algorithm')

(11314, 101631)

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer
#transform from occurrences to tf-idf 
#tf-idf: 1. Decide the number of occurrences of each word in a document by total number of words in document. (Term Frequences tf)
#        2. It does another refinement. Downscaling weights for words that occurring in many documents.
tfidf_transformer = TfidfTransformer()

#use X_train_tfidf to train the MODEL
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [21]:
# Do the normalization of X_train_counts, and it could be used later on.
from sklearn.preprocessing import Normalizer

normalizer_train = Normalizer().fit(X=X_train_counts)
Xtraincounts_normalized = normalizer_train.transform(X_train_counts)

In [22]:
 #   Training a classifier example:
 #   from sklearn.naive_bayes import MultinomialNB
 #   clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    
 # Here we use decision trees
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_tfidf, twenty_train.target)


In [23]:
# Building a pipeline that behaves like a compound classifier
from sklearn import tree
from sklearn.pipeline import Pipeline
#text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('dt', tree.DecisionTreeClassifier())])
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('norm', Normalizer()), ('dt', tree.DecisionTreeClassifier())])

In [28]:
# Hyperparameter tuning using Randomized search

import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
X = twenty_train.data
Y = twenty_train.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
# Create parameter for TfidfTransformer
use_idf = (True, False)
# Create lists of parameter for Decision Tree Classifier
criterion = ['gini', 'entropy']
max_depth = [45,65,95,125]
#max_depth = [4,8,12]

n = len(X)
idx1 = numpy.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf,
                      dt__criterion=criterion,
                      dt__max_depth=max_depth);
# Call RandomizedSearchCV
rs_clf = RandomizedSearchCV(text_clf, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, 
                            n_jobs=-1)
rs_clf.fit(X, Y)
# View Best Parameters
print('Best n-gram range:', rs_clf.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf.best_estimator_.get_params()['tfidf__use_idf'])
print('Best criterion:', rs_clf.best_estimator_.get_params()['dt__criterion'])
print('Best max_depth:', rs_clf.best_estimator_.get_params()['dt__max_depth'])
print(); print(rs_clf.best_estimator_.get_params()['dt'])






Best n-gram range: (1, 2)
Best use_idf: False
Best criterion: gini
Best max_depth: 95

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=95, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [None]:
#  Predict an outcome on new document:
#  Doc_examples = ['God is love', 'GPU is fast']

#  convert documents into tf-idf.  
#  X_new_counts = count_vect.transform(Doc_examples) 
#  X_new_tfidf = tfidf_transformer.transform(X_new_counts)


#  predicted = clf.predict(X_new_tfidf)   (name_of_your_model_object)
#  it stores the prediction of doc_examples.

# Use Cross Validation To Evaluate Model
#CV_Result = cross_val_score(rs_clf, X, Y, cv=4, n_jobs=-1)
#print(); print('Cross validation result:',CV_Result)
#print(); print('Mean of cross validation result:',CV_Result.mean())
#print(); print('STDEV of cross validation result:',CV_Result.std())


In [29]:
# Let's see the prediction of the test set

import numpy as np
from sklearn import metrics
twenty_test = fetch_20newsgroups(subset='test', remove=(['headers','footers', 'quotes']), shuffle=True)
data_test = twenty_test.data
predicted = rs_clf.predict(data_test)
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))
#text_clf.fit(X,Y)
#predicted = text_clf.predict(data_test)
#print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))



                          precision    recall  f1-score   support

             alt.atheism       0.30      0.18      0.23       319
           comp.graphics       0.34      0.42      0.38       389
 comp.os.ms-windows.misc       0.54      0.41      0.47       394
comp.sys.ibm.pc.hardware       0.44      0.29      0.35       392
   comp.sys.mac.hardware       0.56      0.36      0.44       385
          comp.windows.x       0.65      0.39      0.49       395
            misc.forsale       0.61      0.52      0.56       390
               rec.autos       0.15      0.67      0.24       396
         rec.motorcycles       0.79      0.43      0.56       398
      rec.sport.baseball       0.57      0.40      0.47       397
        rec.sport.hockey       0.65      0.54      0.59       399
               sci.crypt       0.69      0.43      0.53       396
         sci.electronics       0.38      0.19      0.26       393
                 sci.med       0.22      0.56      0.32       396
         