In [44]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from mlxtend.evaluate import paired_ttest_5x2cv
from pprint import pprint

In [45]:
# The four newsgroups used in this experiment.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Parts of each post to strip; passed as `remove=` when the corpus is fetched.
remove = ('headers', 'footers', 'quotes')

In [46]:
# Fetch the selected newsgroups (subset='all'), shuffled with a fixed seed and
# with the parts listed in `remove` stripped from every post.
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=remove,
    shuffle=True,
    random_state=42,
)

# Raw documents and their integer class labels.
X, y = dataset.data, dataset.target

In [47]:
# Bag-of-words term counts with English stop words removed.
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split performed later; wrap the vectorizer in a Pipeline if a
# leakage-free estimate is required.
vectorizer = CountVectorizer(max_df=1.0, stop_words='english')

# Vectorize from dataset.data rather than from X: X is rebound to the count
# matrix here, so reading from X would make this cell fail when re-run.
X = vectorizer.fit_transform(dataset.data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is kept as a plain list of str, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [48]:
# Hold out 33% of the documents for testing, stratified on the class labels
# and shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [54]:
# L2-penalised logistic regression.
# (Alternative kept for reference: penalty="l1" with solver='liblinear'.)
clf_lr = LogisticRegression(penalty="l2", max_iter=500, random_state=42).fit(X_train, y_train)

# Accuracy on the split the model was fitted on.
train_score = accuracy_score(y_train, clf_lr.predict(X_train))
print("Train accuracy:   %0.5f" % (train_score,))

# Accuracy on the held-out split.
pred = clf_lr.predict(X_test)
score = accuracy_score(y_test, pred)
print("Test accuracy:   %0.5f" % (score,))

# Per-class metrics, then the confusion matrix, for the held-out split.
print("classification report:", classification_report(y_test, pred), sep="\n")
print("confusion matrix:", confusion_matrix(y_test, pred), sep="\n")

Train accuracy:   0.98193
Test accuracy:   0.80501
classification report:
              precision    recall  f1-score   support

           0       0.74      0.72      0.73       264
           1       0.89      0.91      0.90       321
           2       0.82      0.88      0.85       326
           3       0.72      0.63      0.67       207

    accuracy                           0.81      1118
   macro avg       0.79      0.79      0.79      1118
weighted avg       0.80      0.81      0.80      1118

confusion matrix:
[[190  12  22  40]
 [  7 292  19   3]
 [ 16  15 288   7]
 [ 45   9  23 130]]


In [55]:
# Decision tree using the entropy split criterion, fitted on the same split.
clf_dt = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# Accuracy on the split the model was fitted on.
train_score = accuracy_score(y_train, clf_dt.predict(X_train))
print("Train accuracy:   %0.5f" % (train_score,))

# Accuracy on the held-out split.
pred = clf_dt.predict(X_test)
score = accuracy_score(y_test, pred)
print("Test accuracy:   %0.5f" % (score,))

# Per-class metrics, then the confusion matrix, for the held-out split.
print("classification report:", classification_report(y_test, pred), sep="\n")
print("confusion matrix:", confusion_matrix(y_test, pred), sep="\n")

Train accuracy:   0.98237
Test accuracy:   0.62433
classification report:
              precision    recall  f1-score   support

           0       0.51      0.48      0.50       264
           1       0.76      0.76      0.76       321
           2       0.62      0.70      0.66       326
           3       0.54      0.48      0.51       207

    accuracy                           0.62      1118
   macro avg       0.61      0.60      0.61      1118
weighted avg       0.62      0.62      0.62      1118

confusion matrix:
[[126  28  48  62]
 [ 17 245  52   7]
 [ 48  34 228  16]
 [ 54  15  39  99]]


In [56]:
# Paired 5x2cv t-test between the logistic regression and the decision tree,
# run on the full vectorized corpus with a fixed seed.
t, p = paired_ttest_5x2cv(
    estimator1=clf_lr,
    estimator2=clf_dt,
    X=X,
    y=y,
    random_seed=42,
)

print('t statistic: %.9f' % t, 'p value: %.9f' % p, sep='\n')

t statistic: 10.365098985
p value: 0.000143904
