In [2]:
from sklearn.datasets import load_files

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# Import from the public package path: the private submodule
# sklearn.neighbors.nearest_centroid was deprecated in scikit-learn 0.22 and
# removed in 0.24, so the old path raises an import error on current releases.
from sklearn.neighbors import NearestCentroid

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

### Loading 20_newsgroup files

Load the given text files with categories as subfolder names. The folder names are used as supervised signal label names.

In [None]:
# Read the corpus from the "20_newsgroup" directory; each subfolder name is
# used as the category label. Documents are decoded as Latin-1 and shuffled
# with a fixed random_state.
full_dataSet = load_files(
    container_path="20_newsgroup",
    shuffle=True,
    random_state=10,
    encoding='Latin-1',
)

### To evaluate each classifier, the metrics below are computed as cross-validation scores

In [2]:
# Scorer names passed to cross_validate for every classifier below.
scoring = [
    'precision_macro',
    'recall_macro',
    'accuracy',
    'f1_macro',
]

### Classifier 1

In [22]:
# Classifier 1: token counts -> TF-IDF re-weighting -> multinomial Naive Bayes.
text_clf1 = Pipeline(
    [
        ('countVect', CountVectorizer()),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('NBclf', MultinomialNB()),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf1 = cross_validate(
    text_clf1,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [23]:
# Print the mean over folds of every entry in scores_clf1: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf1[metric].mean())

train_accuracy= 0.9418690535913855
test_accuracy= 0.8871832416208104
train_f1_macro= 0.9408203122437462
test_f1_macro= 0.8852408402480488
train_precision_macro= 0.9416763040952443
test_precision_macro= 0.8881976237769103
train_recall_macro= 0.9418776291755329
test_recall_macro= 0.8872
fit_time= 9.35672061443329
score_time= 3.4392364025115967


### Classifier 2

In [24]:
# Classifier 2: token counts fed straight into multinomial Naive Bayes
# (no TF-IDF step).
text_clf2 = Pipeline(
    [
        ('countVect', CountVectorizer()),
        ('NBclf', MultinomialNB()),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf2 = cross_validate(
    text_clf2,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [25]:
# Print the mean over folds of every entry in scores_clf2: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf2[metric].mean())

train_accuracy= 0.9397187504605832
test_accuracy= 0.8743814407203601
train_f1_macro= 0.9375412315230023
test_f1_macro= 0.8693842667829548
train_precision_macro= 0.9419030694676357
test_precision_macro= 0.8820766485560754
train_recall_macro= 0.9397276291755328
test_recall_macro= 0.8744
fit_time= 8.606685876846313
score_time= 3.5033013105392454


### Classifier 3

In [28]:
# Classifier 3: word-level TfidfVectorizer followed by multinomial Naive Bayes.
text_clf3 = Pipeline(
    [
        ('tfidfvec', TfidfVectorizer(analyzer='word')),
        ('NBclf', MultinomialNB()),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf3 = cross_validate(
    text_clf3,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [29]:
# Print the mean over folds of every entry in scores_clf3: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf3[metric].mean())

train_accuracy= 0.9418690535913855
test_accuracy= 0.8871832416208104
train_f1_macro= 0.9408203122437462
test_f1_macro= 0.8852408402480488
train_precision_macro= 0.9416763040952443
test_precision_macro= 0.8881976237769103
train_recall_macro= 0.9418776291755329
test_recall_macro= 0.8872
fit_time= 8.81294276714325
score_time= 3.5501816272735596


### Classifier 4

In [30]:
# Classifier 4: TfidfVectorizer with default settings feeding a LinearSVC.
text_clf4 = Pipeline(
    [
        ('tfidfVect', TfidfVectorizer()),
        ('linearSvc', LinearSVC()),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf4 = cross_validate(
    text_clf4,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [31]:
# Print the mean over folds of every entry in scores_clf4: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf4[metric].mean())

train_accuracy= 0.9759352784550904
test_accuracy= 0.9370403701850926
train_f1_macro= 0.9759836515805113
test_f1_macro= 0.9369180772790484
train_precision_macro= 0.9764299582512532
test_precision_macro= 0.9371976738693663
train_recall_macro= 0.9759388888888887
test_recall_macro= 0.937049494949495
fit_time= 16.019557404518128
score_time= 3.7236266136169434


### Classifier 5

In [9]:
# Classifier 5: character-level TfidfVectorizer followed by multinomial
# Naive Bayes.
# NOTE(review): no ngram_range is given, so the vectorizer's default applies;
# with analyzer='char' that presumably means single-character features.
# Confirm this is intended.
text_clf5 = Pipeline(
    [
        ('tfidfvec', TfidfVectorizer(analyzer='char')),
        ('NBclf', MultinomialNB()),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf5 = cross_validate(
    text_clf5,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [32]:
# Print the mean over folds of every entry in scores_clf5: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf5[metric].mean())

train_accuracy= 0.19597383601666202
test_accuracy= 0.18897838919459728
train_f1_macro= 0.16936532680263378
test_f1_macro= 0.16160248825482093
train_precision_macro= 0.21635136459945636
test_precision_macro= 0.20493725417735206
train_recall_macro= 0.19604833922197146
test_recall_macro= 0.18905656565656567
fit_time= 11.989854454994202
score_time= 5.331757664680481


### Classifier 6

In [34]:
# Classifier 6: HashingVectorizer with default settings feeding a LinearSVC.
text_clf6 = Pipeline(
    [
        ('hashing', HashingVectorizer()),
        ('linearSvc', LinearSVC()),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf6 = cross_validate(
    text_clf6,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [35]:
# Print the mean over folds of every entry in scores_clf6: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf6[metric].mean())

train_accuracy= 0.9730848527723468
test_accuracy= 0.9446918459229614
train_f1_macro= 0.9731729550177823
test_f1_macro= 0.9446844920405553
train_precision_macro= 0.9739308867445523
test_precision_macro= 0.9452997958651478
train_recall_macro= 0.973088888888889
test_recall_macro= 0.9446989898989899
fit_time= 19.661931252479555
score_time= 3.9251967668533325


### Classifier 7

In [36]:
# Classifier 7: HashingVectorizer features classified by NearestCentroid
# using the Euclidean metric.
text_clf7 = Pipeline(
    [
        ('hashvec', HashingVectorizer()),
        ('centroid', NearestCentroid(metric='euclidean')),
    ]
)
# Cross-validate with cv=10 on the full corpus; train-fold scores are kept
# alongside the test-fold scores.
scores_clf7 = cross_validate(
    text_clf7,
    full_dataSet.data,
    full_dataSet.target,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [37]:
# Print the mean over folds of every entry in scores_clf7: train before test
# for each metric, then the timing entries.
for metric in (
    'train_accuracy', 'test_accuracy',
    'train_f1_macro', 'test_f1_macro',
    'train_precision_macro', 'test_precision_macro',
    'train_recall_macro', 'test_recall_macro',
    'fit_time', 'score_time',
):
    # The printed label is the result key followed by "=".
    print(metric + "=", scores_clf7[metric].mean())

train_accuracy= 0.5687019832292554
test_accuracy= 0.5537827163581792
train_f1_macro= 0.5863026697447025
test_f1_macro= 0.5702005176723838
train_precision_macro= 0.6552573055248448
test_precision_macro= 0.6410146078654272
train_recall_macro= 0.5687521961351003
test_recall_macro= 0.5538363636363636
fit_time= 8.434804129600526
score_time= 5.5612159967422485
