In [29]:
import numpy as np
import statsmodels.api as sm
from mlxtend.evaluate import paired_ttest_5x2cv

from sklearn.base import clone
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [30]:
# Candidate four-newsgroup subset.
# NOTE: the fetch_20newsgroups call further down passes categories=None,
# so this list is defined but not applied to the loaded data.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Parts of each post to strip; passed as `remove=` to fetch_20newsgroups.
remove = ('headers', 'footers', 'quotes')

In [32]:
# Load the 20 Newsgroups posts (subset='all') with the parts listed in
# `remove` stripped from every document.
# categories=None: the `categories` list defined earlier in the notebook is
# NOT applied here; the classification reports below show more than four
# classes, which is consistent with no category filter being used.
dataset = fetch_20newsgroups(
    subset='all',
    categories=None,
    remove=remove,
    shuffle=True,
    random_state=42,
)

# X: raw documents, y: integer class labels.
X, y = dataset.data, dataset.target

In [33]:
# Bag-of-words term counts with English stop words removed.
vectorizer = CountVectorizer(max_df=1.0, stop_words='english')

# Vectorise from `dataset.data` rather than from `X`: this cell rebinds `X`
# to the resulting matrix, so reading the input from `X` would make a second
# run of the cell feed the matrix back into the vectorizer and fail.
X = vectorizer.fit_transform(dataset.data)

# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; prefer the new method when available
# and fall back for older installations. `feature_names` stays a plain list
# of strings either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [34]:
# Hold out 33% of the documents for testing; the split is shuffled with a
# fixed seed and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [35]:
# L2-penalised logistic regression (liblinear solver) fitted on the training split.
clf_lr = LogisticRegression(
    penalty="l2",
    solver='liblinear',
    max_iter=500,
    random_state=42,
)
clf_lr.fit(X_train, y_train)

# Accuracy on the data the model was fitted on, then predictions and
# accuracy on the held-out split.
train_score = accuracy_score(y_train, clf_lr.predict(X_train))
pred = clf_lr.predict(X_test)
score = accuracy_score(y_test, pred)

print('LOGISTIC REGRESSION: ')
print()
print("Train accuracy:   %0.5f" % train_score)
print("Test accuracy:   %0.5f" % score)

# Per-class precision / recall / F1 on the held-out split.
print("classification report:")
print(classification_report(y_test, pred))

print("confusion matrix:")
print(confusion_matrix(y_test, pred))

LOGISTIC REGRESSION: 

Train accuracy:   0.96982
Test accuracy:   0.70096
classification report:
              precision    recall  f1-score   support

           0       0.60      0.56      0.58       263
           1       0.68      0.68      0.68       321
           2       0.63      0.68      0.65       325
           3       0.64      0.65      0.65       324
           4       0.71      0.66      0.68       318
           5       0.82      0.79      0.81       326
           6       0.77      0.71      0.74       322
           7       0.47      0.80      0.60       327
           8       0.73      0.71      0.72       329
           9       0.77      0.78      0.78       328
          10       0.87      0.84      0.85       330
          11       0.83      0.77      0.80       327
          12       0.65      0.71      0.68       325
          13       0.82      0.79      0.80       327
          14       0.77      0.73      0.75       326
          15       0.69      0.73     

In [36]:
# Decision tree using the entropy split criterion, fitted on the training split.
clf_dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)
clf_dt.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on, then predictions and
# accuracy on the held-out split.
train_score = accuracy_score(y_train, clf_dt.predict(X_train))
pred = clf_dt.predict(X_test)
score = accuracy_score(y_test, pred)

print('DECISION TREE: ')
print()
print("Train accuracy:   %0.5f" % train_score)
print("Test accuracy:   %0.5f" % score)

# Per-class precision / recall / F1 on the held-out split.
print("classification report:")
print(classification_report(y_test, pred))

print("confusion matrix:")
print(confusion_matrix(y_test, pred))

DECISION TREE: 

Train accuracy:   0.97378
Test accuracy:   0.38312
classification report:
              precision    recall  f1-score   support

           0       0.24      0.27      0.26       263
           1       0.31      0.31      0.31       321
           2       0.43      0.42      0.42       325
           3       0.27      0.29      0.28       324
           4       0.43      0.40      0.41       318
           5       0.44      0.48      0.46       326
           6       0.52      0.45      0.48       322
           7       0.25      0.51      0.34       327
           8       0.47      0.39      0.42       329
           9       0.42      0.39      0.40       328
          10       0.49      0.55      0.52       330
          11       0.53      0.43      0.47       327
          12       0.21      0.20      0.21       325
          13       0.36      0.35      0.35       327
          14       0.41      0.41      0.41       326
          15       0.47      0.43      0.45 

In [37]:
# 5x2cv paired t-test comparing logistic regression against the decision
# tree on the full vectorised dataset (X, y).
#
# Unfitted clones are passed instead of `clf_lr` / `clf_dt` themselves so the
# models evaluated in the cells above are left exactly as they were fitted on
# X_train. NOTE(review): mlxtend's 5x2cv routine appears to call `.fit` on the
# estimator objects it receives without cloning them - confirm against the
# installed mlxtend version. Cloning keeps the same hyper-parameters and
# random_state, so the comparison itself is unaffected.
t, p = paired_ttest_5x2cv(
    estimator1=clone(clf_lr),
    estimator2=clone(clf_dt),
    X=X,
    y=y,
    random_seed=42,
)

print('t statistic: %.9f' % t)
print('p value: %.9f' % p)

t statistic: 35.790192995
p value: 0.000000321
