# 20 Newspaper dataset

In [4]:
#import 20newsgroups datasets from sklearn
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',remove=(['headers','footers', 'quotes']))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
#list of categories(labels)
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

#counts the occurence of each word. 
count_vect = CountVectorizer()

#second one exclude stopwords, like 'the','of'..
count_vect2= CountVectorizer(stop_words='english') 

X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts2 = count_vect2.fit_transform(twenty_train.data)

print(X_train_counts.shape)
print(X_train_counts2.shape)

# count_vect.vocabulary_.get(u'algorithm')

(11314, 101631)
(11314, 101322)


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
#transform from occurrences to tf-idf 
#tf-idf: 1. Decide the number of occurrences of each word in a document by total number of words in document. (Term Frequences tf)
#        2. It does another refinement. Downscaling weights for words that occurring in many documents.
tfidf_transformer = TfidfTransformer()
tfidf_transformer2 = TfidfTransformer()

#use X_train_tfidf to train the MODEL
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

X_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
docs_test = X_test.data
#use docs_test to fit.
print(X_train_tfidf.shape)
print(X_train_tfidf2.shape)

(11314, 101631)
(11314, 101322)


In [0]:
import pandas as pd

# get the first document
first_vector=X_train_tfidf[13]
first_vector2=X_train_tfidf2[13]
 
# show the TF-IDF scores , compare with/without stopwords
df = pd.DataFrame(first_vector.T.todense(), index=count_vect.get_feature_names(), columns=["tfidf"])
df2 = pd.DataFrame(first_vector2.T.todense(), index=count_vect2.get_feature_names(), columns=["tfidf_no_stopwords"])

In [9]:
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
ssf,0.356347
flights,0.298625
option,0.242772
capability,0.242602
the,0.216754
...,...
discern,0.000000
discarded,0.000000
discard,0.000000
discarcina,0.000000


In [10]:
df2.sort_values(by=["tfidf_no_stopwords"],ascending=False)

Unnamed: 0,tfidf_no_stopwords
ssf,0.383239
flights,0.321161
option,0.261093
capability,0.260910
module,0.224627
...,...
disappoint,0.000000
disappering,0.000000
disappears,0.000000
disappearing,0.000000


# Models

## Logistic Regression

In [0]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_lr', LogisticRegression(multi_class = 'multinomial')),])
text_clf_lr.fit(twenty_train.data, twenty_train.target)
# clf_lr = LogisticRegression().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,


### Hyperparameter Tuning

In [0]:
# Hyperparameter tuning using Randomized search
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
X = twenty_train.data
Y = twenty_train.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
# Create parameter for TfidfTransformer
use_idf = (True, False)

# Create lists of parameter for Logistic Regression Classifier:
penalty = ['l1', 'l2', 'elasticnet']
C = [0.001, 0.01, 0.1, 1, 10, 100]
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
class_weight.append(None)
solver = ['sag', 'saga']


n = len(X)
idx1 = np.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_lr__penalty = penalty, clf_lr__C = C, clf_lr__class_weight = class_weight, clf_lr__solver = solver);
# Call RandomizedSearchCV
rs_clf_lr = RandomizedSearchCV(text_clf_lr, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_lr.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_lr.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_lr.best_estimator_.get_params()['tfidf__use_idf'])
print('Best penalty:', rs_clf_lr.best_estimator_.get_params()['clf_lr__penalty'])
print('Best C:', rs_clf_lr.best_estimator_.get_params()['clf_lr__C'])
print('Best class_weight:', rs_clf_lr.best_estimator_.get_params()['clf_lr__class_weight'])
print('Best solver:', rs_clf_lr.best_estimator_.get_params()['clf_lr__solver'])
print(); print(rs_clf_lr.best_estimator_.get_params()['clf_lr'])



Best n-gram range: (1, 1)
Best use_idf: True
Best penalty: l2
Best C: 1
Best class_weight: None
Best solver: sag

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)


## SVM


In [0]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_svm', SGDClassifier()),])
text_clf_svm.fit(twenty_train.data, twenty_train.target)
# clf_svm = SGDClassifier().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

### Hyperparameter Tuning

In [0]:
# Create lists of parameter for SVM Classifier:
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1]
learning_rate = ['optimal', 'constant', 'adaptive']
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
class_weight.append(None)
eta0 = [0.01, 0.1, 0.5, 1, 100] 

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_svm__alpha = alpha, clf_svm__learning_rate = learning_rate, clf_svm__class_weight = class_weight, clf_svm__eta0 = eta0);
# Call RandomizedSearchCV
rs_clf_svm = RandomizedSearchCV(text_clf_svm, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_svm.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_svm.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_svm.best_estimator_.get_params()['tfidf__use_idf'])
print('Best alpha:', rs_clf_svm.best_estimator_.get_params()['clf_svm__alpha'])
print('Best learning_rate:', rs_clf_svm.best_estimator_.get_params()['clf_svm__learning_rate'])
print('Best class_weight:', rs_clf_svm.best_estimator_.get_params()['clf_svm__class_weight'])
print('Best eta0:', rs_clf_svm.best_estimator_.get_params()['clf_svm__eta0'])
print(); print(rs_clf_svm.best_estimator_.get_params()['clf_svm'])



Best n-gram range: (1, 2)
Best use_idf: True
Best alpha: 1e-05
Best learning_rate: constant
Best class_weight: None
Best eta0: 0.5

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.5, fit_intercept=True,
              l1_ratio=0.15, learning_rate='constant', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


## Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_rf', RandomForestClassifier()),])
text_clf_rf.fit(twenty_train.data, twenty_train.target)
# clf_rf = RandomForestClassifier().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Hyperparameter Tuning

In [0]:
# Create lists of parameter for Random Forest Classifier:
# Number of trees in random forest
n_estimators = [10, 100, 500, 750]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [45, 65, 95, 125]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range = ngram_range,
                  tfidf__use_idf = use_idf, clf_rf__n_estimators = n_estimators, clf_rf__max_features = max_features, clf_rf__max_depth=max_depth, clf_rf__min_samples_split = min_samples_split, clf_rf__min_samples_leaf = min_samples_leaf, clf_rf__bootstrap = bootstrap);
# Call RandomizedSearchCV
rs_clf_rf = RandomizedSearchCV(text_clf_rf, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, 
                            n_jobs=-1)
rs_clf_rf.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_rf.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_rf.best_estimator_.get_params()['tfidf__use_idf'])
print('Best n_estimators:', rs_clf_rf.best_estimator_.get_params()['clf_rf__n_estimators'])
print('Best max_features:', rs_clf_rf.best_estimator_.get_params()['clf_rf__max_features'])
print('Best max_depth:', rs_clf_rf.best_estimator_.get_params()['clf_rf__max_depth'])
print('Best min_samples_split:', rs_clf_rf.best_estimator_.get_params()['clf_rf__min_samples_split'])
print('Best min_samples_leaf:', rs_clf_rf.best_estimator_.get_params()['clf_rf__min_samples_leaf'])
print('Best bootstrap:', rs_clf_rf.best_estimator_.get_params()['clf_rf__bootstrap'])
print(); print(rs_clf_rf.best_estimator_.get_params()['clf_rf'])



Best n-gram range: (1, 2)
Best use_idf: False
Best n_estimators: 750
Best max_features: sqrt
Best max_depth: None
Best min_samples_split: 2
Best min_samples_leaf: 4
Best bootstrap: True

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


## AdaBoost

In [14]:
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

twenty_test = fetch_20newsgroups(subset='test', remove=(['headers','footers', 'quotes']))
docs_test = twenty_test.data

from sklearn.ensemble import AdaBoostClassifier
text_clf_ada = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf_ada', AdaBoostClassifier())])
text_clf_ada.fit(twenty_train.data, twenty_train.target)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_ada',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=50,

### Hyperparameter tuning

In [13]:
# Hyperparameter tuning using Randomized search
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
X = twenty_train.data
Y = twenty_train.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
stop_words=[]
# Create parameter for TfidfTransformer
use_idf = (True, False)

n = len(X)
idx1 = np.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

#The maximum number of estimators at which boosting is terminated. 
n_estimators=[50, 100, 150, 200]

learning_rate=[0.01,0.1,0.3,1]

#clf_ada__base_estimator = base_estimator
# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_ada__n_estimators = n_estimators, clf_ada__learning_rate = learning_rate);
# Call RandomizedSearchCV
rs_clf_ada = RandomizedSearchCV(text_clf_ada, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_ada.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_ada.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_ada.best_estimator_.get_params()['tfidf__use_idf'])
print('Best n_estimators:', rs_clf_ada.best_estimator_.get_params()['clf_ada__n_estimators'])
print('Best learning_rate:', rs_clf_ada.best_estimator_.get_params()['clf_ada__learning_rate'])
print(); print(rs_clf_ada.best_estimator_.get_params()['clf_ada'])


Best n-gram range: (1, 2)
Best use_idf: False
Best n_estimators: 150
Best learning_rate: 0.3

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.3,
                   n_estimators=150, random_state=None)


## Decision Tree

In [0]:
# Building a pipeline that behaves like a compound classifier
from sklearn import tree
# Do the normalization of X_train_counts, and it could be used later on.
from sklearn.preprocessing import Normalizer

#text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('dt', tree.DecisionTreeClassifier())])
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('norm', Normalizer()), ('dt', tree.DecisionTreeClassifier())])

### Hyperparameter Tuning

In [19]:
# Hyperparameter tuning using Randomized search
import numpy
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
X = twenty_train.data
Y = twenty_train.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
# Create parameter for TfidfTransformer
use_idf = (True, False)
# Create lists of parameter for Decision Tree Classifier
criterion = ['gini', 'entropy']
max_depth = [45,65,95,125]
#max_depth = [4,8,12]

n = len(X)
idx1 = numpy.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf,
                      dt__criterion=criterion,
                      dt__max_depth=max_depth);
# Call RandomizedSearchCV
rs_clf = RandomizedSearchCV(text_clf, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, 
                            n_jobs=-1)
rs_clf.fit(X, Y)
# View Best Parameters
print('Best n-gram range:', rs_clf.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf.best_estimator_.get_params()['tfidf__use_idf'])
print('Best criterion:', rs_clf.best_estimator_.get_params()['dt__criterion'])
print('Best max_depth:', rs_clf.best_estimator_.get_params()['dt__max_depth'])
print(); print(rs_clf.best_estimator_.get_params()['dt'])

Best n-gram range: (1, 2)
Best use_idf: False
Best criterion: gini
Best max_depth: 95

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=95, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


## Evaluation:

*   Logistic Regression
*   SVM
*   Random Forest
*   Adaboost
*   Decision Tree







In [0]:
twenty_test = fetch_20newsgroups(subset='test', remove=(['headers','footers', 'quotes']))
docs_test = twenty_test.data

print("Logistic Regresssion:")
predicted_lr = rs_clf_lr.predict(docs_test)
print(np.mean(predicted_lr == twenty_test.target))
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted_lr, target_names=twenty_test.target_names))

print("SVM:")
predicted_svm = rs_clf_svm.predict(docs_test)
print(np.mean(predicted_svm == twenty_test.target))
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted_svm, target_names=twenty_test.target_names))

print("Random Forest:")
predicted_rf = rs_clf_rf.predict(docs_test)
print(np.mean(predicted_rf == twenty_test.target))
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted_rf, target_names=twenty_test.target_names))

Logistic Regresssion:
0.6909187466808284
                          precision    recall  f1-score   support

             alt.atheism       0.49      0.45      0.47       319
           comp.graphics       0.63      0.71      0.67       389
 comp.os.ms-windows.misc       0.66      0.63      0.64       394
comp.sys.ibm.pc.hardware       0.68      0.65      0.66       392
   comp.sys.mac.hardware       0.75      0.69      0.72       385
          comp.windows.x       0.83      0.72      0.77       395
            misc.forsale       0.76      0.79      0.77       390
               rec.autos       0.75      0.71      0.73       396
         rec.motorcycles       0.48      0.81      0.61       398
      rec.sport.baseball       0.81      0.82      0.82       397
        rec.sport.hockey       0.90      0.86      0.88       399
               sci.crypt       0.89      0.67      0.76       396
         sci.electronics       0.56      0.61      0.59       393
                 sci.med       0.7

Adaboost

In [15]:
predicted_ada = rs_clf_ada.predict(twenty_test.data)
print("Adaboost")
print(np.mean(predicted_ada == twenty_test.target))
print(metrics.classification_report(twenty_test.target, predicted_ada, target_names=twenty_test.target_names))


Adaboost
0.4349442379182156
                          precision    recall  f1-score   support

             alt.atheism       0.33      0.29      0.31       319
           comp.graphics       0.50      0.46      0.48       389
 comp.os.ms-windows.misc       0.58      0.35      0.43       394
comp.sys.ibm.pc.hardware       0.53      0.35      0.42       392
   comp.sys.mac.hardware       0.72      0.41      0.52       385
          comp.windows.x       0.74      0.44      0.55       395
            misc.forsale       0.82      0.40      0.54       390
               rec.autos       0.20      0.63      0.30       396
         rec.motorcycles       0.88      0.44      0.58       398
      rec.sport.baseball       0.46      0.56      0.51       397
        rec.sport.hockey       0.85      0.38      0.53       399
               sci.crypt       0.88      0.48      0.62       396
         sci.electronics       0.18      0.50      0.26       393
                 sci.med       0.30      0.51  

Decision Tree

In [20]:
print("Decision Tree:")
predicted_dt = rs_clf.predict(twenty_test.data)
print(np.mean(predicted_dt == twenty_test.target))
print(metrics.classification_report(twenty_test.target, predicted_dt, target_names=twenty_test.target_names))

Decision Tree:
0.39312267657992567
                          precision    recall  f1-score   support

             alt.atheism       0.35      0.18      0.24       319
           comp.graphics       0.35      0.43      0.39       389
 comp.os.ms-windows.misc       0.55      0.42      0.48       394
comp.sys.ibm.pc.hardware       0.43      0.26      0.33       392
   comp.sys.mac.hardware       0.57      0.36      0.44       385
          comp.windows.x       0.61      0.39      0.48       395
            misc.forsale       0.63      0.52      0.57       390
               rec.autos       0.14      0.66      0.24       396
         rec.motorcycles       0.65      0.45      0.53       398
      rec.sport.baseball       0.58      0.39      0.47       397
        rec.sport.hockey       0.66      0.55      0.60       399
               sci.crypt       0.72      0.42      0.53       396
         sci.electronics       0.36      0.20      0.25       393
                 sci.med       0.22     

# Exploration: Data Preprocessing: 
*  Remove tags
*  Lemmatization
*  Stopwords

In [11]:
#unpreprocessed data
print(twenty_train.data[1])

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.


## Re: regex
In Python, a regular expression is denoted as RE (REs, regexes or regex pattern) are embedded through re module.
"re" module included with Python primarily used for string searching and manipulation.
Also used frequently for webpage "Scraping" (extract large amount of data from websites).

We use this package to remove the blank spaces, and br tags of the articles.

In [0]:
import re

Remove_space = re.compile("[.;:!\'?,\"()\[\]]")
Remove_br_tags = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess(articles):
    
    #convert all words into lower cases. Remove 
    articles = [Remove_space.sub("", line.lower()) for line in articles]
    articles = [Remove_br_tags.sub(" ", line) for line in articles]   
    return articles


In [17]:
#preprocessing the train/test data.
twenty_train_clean = preprocess(twenty_train.data)
twenty_test_clean = preprocess(twenty_test.data)

print("Preprocessed Data:")
#print one preprocessed instance
print(twenty_train_clean[1])
print()
print()
print("Original Data:")
#unpreprocessed data
print(twenty_train.data[1])


Preprocessed Data:
a fair number of brave souls who upgraded their si clock oscillator have
shared their experiences for this poll please send a brief message detailing
your experiences with the procedure top speed attained cpu rated speed
add on cards and adapters heat sinks hour of usage per day floppy disk
functionality with 800 and 14 m floppies are especially requested

i will be summarizing in the next two days so please add to the network
knowledge base if you have done the clock upgrade and havent answered this
poll thanks


Original Data:
A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowl

## Lemmanization
Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. We would use the lemmatizer method from the nltk package.

In [18]:
#download packages.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
#lemmenized text.
def Lemmanization(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_twenty_train = Lemmanization(twenty_train_clean)
lemmatized_twenty_test = Lemmanization(twenty_test_clean)

In [26]:
print(lemmatized_twenty_train[1])
print()
print(twenty_train_clean[1])

a fair number of brave soul who upgraded their si clock oscillator have shared their experience for this poll please send a brief message detailing your experience with the procedure top speed attained cpu rated speed add on card and adapter heat sink hour of usage per day floppy disk functionality with 800 and 14 m floppy are especially requested i will be summarizing in the next two day so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll thanks

a fair number of brave souls who upgraded their si clock oscillator have
shared their experiences for this poll please send a brief message detailing
your experiences with the procedure top speed attained cpu rated speed
add on cards and adapters heat sinks hour of usage per day floppy disk
functionality with 800 and 14 m floppies are especially requested

i will be summarizing in the next two days so please add to the network
knowledge base if you have done the clock upgrade and havent

## Preprocessed data + Lemmanization Performance
Now we will use the processed train/test data to see if it can improve the test accuracy.
Instead of using GridSearch/RandomisedSearch CV, we would just use the hyperparameters trained in previous models.

### Logistic Regression

evaluation with processed data/original data


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


text_clf_lr2 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_lr2', LogisticRegression())])
text_clf_lr2.fit(lemmatized_twenty_train, twenty_train.target)

text_clf_lr3 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_lr3', LogisticRegression())])
text_clf_lr3.fit(twenty_train.data, twenty_train.target)

print("LR:")
#Predict on clean and preprocess, remove tags, lowering words.  
print("After preprocessing")  
predicted_lr2 = text_clf_lr2.predict(lemmatized_twenty_test)
print(np.mean(predicted_lr2 == twenty_test.target))

#Predict on original test sets. Twenty train  
print("No preprocessing(default)") 
predicted_lr3 = text_clf_lr3.predict(twenty_test.data)
print(np.mean(predicted_lr3 == twenty_test.target))

LR:
After preprocessing
0.6830855018587361
No preprocessing(default)
0.6909187466808284


### Adaboost
evaluation with processed data/original data

In [34]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


text_clf_ada2 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_ada2', AdaBoostClassifier())])
text_clf_ada2.fit(lemmatized_twenty_train, twenty_train.target)

text_clf_ada3 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_ada3', AdaBoostClassifier())])
text_clf_ada3.fit(twenty_train.data, twenty_train.target)

print("Adaboost:")
#Predict on clean and preprocess, remove tags, lowering words.  
print("After preprocessing")  
predicted_ada2 = text_clf_ada2.predict(lemmatized_twenty_test)
print(np.mean(predicted_ada2 == twenty_test.target))

#Predict on original test sets. Twenty train  
print("No preprocessing(default)") 
predicted_ada3 = text_clf_ada3.predict(twenty_test.data)
print(np.mean(predicted_ada3 == twenty_test.target))




Adaboost:
After preprocessing
0.37958045671800317
No preprocessing(default)
0.36563993627190655


### Random Forest Classifier
evaluation with processed data/original data

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


text_clf_rf2 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_rf2', RandomForestClassifier())])
text_clf_rf2.fit(lemmatized_twenty_train, twenty_train.target)

text_clf_rf3 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_rf3', RandomForestClassifier())])
text_clf_rf3.fit(twenty_train.data, twenty_train.target)

print("Random Forest Classifier:")
#Predict on clean and preprocess, remove tags, lowering words.  
print("After preprocessing")  
predicted_rf2 = text_clf_rf2.predict(lemmatized_twenty_test)
print(np.mean(predicted_rf2 == twenty_test.target))

#Predict on original test sets. Twenty train  
print("No preprocessing(default)") 
predicted_rf3 = text_clf_rf3.predict(twenty_test.data)
print(np.mean(predicted_rf3 == twenty_test.target))


Random Forest Classifier:
After preprocessing
0.6257302177376527
No preprocessing(default)
0.6263940520446096


### SGDClassifier
evaluation with processed data/original data

In [40]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

text_clf_sgd2 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_sgd2', SGDClassifier())])
text_clf_sgd2.fit(lemmatized_twenty_train, twenty_train.target)

text_clf_sgd3 = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer(use_idf=True)), ('clf_sgd3', SGDClassifier())])
text_clf_sgd3.fit(twenty_train.data, twenty_train.target)

print("SGDClassifier:")
#Predict on clean and preprocess, remove tags, lowering words.  
print("After preprocessing")  
predicted_sgd2 = text_clf_sgd2.predict(lemmatized_twenty_test)
print(np.mean(predicted_sgd2 == twenty_test.target))

#Predict on original test sets. Twenty train  
print("No preprocessing(default)") 
predicted_sgd3 = text_clf_sgd3.predict(twenty_test.data)
print(np.mean(predicted_sgd3 == twenty_test.target))


SGDClassifier:
After preprocessing
0.6964949548592672
No preprocessing(default)
0.6970260223048327
