## Import packages


In [1]:
!pip install eli5



In [2]:
import eli5
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



# Compare Logistic Regression and Decision Tree

## Prepare dataset and Pick two classes


In [3]:
# Two-class problem: posts from two closely related hardware newsgroups.
# Other pairs that can be swapped in:
#   ['alt.atheism', 'soc.religion.christian']
#   ['rec.sport.baseball', 'rec.sport.hockey']
# Full list of available newsgroups:
#   'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
#   'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
#   'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
#   'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
#   'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
#   'talk.politics.misc', 'talk.religion.misc'
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']

# Load the predefined train/test splits with headers, footers and quoted text
# removed from every post.
train = datasets.fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
test = datasets.fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1168
test data size: 777


## Compare Logistic Regression and Decision Tree models


In [4]:
# Baseline 1: logistic regression on raw token counts.
lr_model = LogisticRegression(C=1, solver='newton-cg')
lr_features = CountVectorizer()
lr_classifier = make_pipeline(lr_features, lr_model)
lr_classifier.fit(train.data, train.target)

# Baseline 2: decision tree on raw token counts, with its own vectorizer.
# A float min_samples_split is read by scikit-learn as a fraction of the
# training samples. random_state is fixed because the tree permutes features
# at every split, so ties between equally good splits would otherwise be
# broken differently from run to run (changing the scores and the eli5
# explanations shown further down).
dt_model = DecisionTreeClassifier(min_samples_split=0.4, random_state=123)
dt_features = CountVectorizer()
dt_classifier = make_pipeline(dt_features, dt_model)
dt_classifier.fit(train.data, train.target)

# Compare the two models on the train and test splits.
# Note: with average='micro' on a single-label task, F1 equals plain accuracy.
lr_train_preds = lr_classifier.predict(train.data)
lr_train_f1 = f1_score(train.target, lr_train_preds, average='micro')
lr_test_preds = lr_classifier.predict(test.data)
lr_test_f1 = f1_score(test.target, lr_test_preds, average='micro')
print("Train/test F1 for Logistic Regression: ", lr_train_f1, lr_test_f1)

dt_train_preds = dt_classifier.predict(train.data)
dt_train_f1 = f1_score(train.target, dt_train_preds, average='micro')
dt_test_preds = dt_classifier.predict(test.data)
dt_test_f1 = f1_score(test.target, dt_test_preds, average='micro')
print("Train/test F1 for Decision Tree: ", dt_train_f1, dt_test_f1)

Train/test F1 for Logistic Regression:  0.9897260273972602 0.8095238095238095
Train/test F1 for Decision Tree:  0.7722602739726028 0.7400257400257402


In [5]:
# Render the logistic-regression pipeline's feature weights (top=20),
# labelled with the newsgroup names.
eli5.show_weights(
    lr_classifier,
    top=20,
    target_names=test.target_names,
)

Weight?,Feature
+1.980,mac
+1.555,apple
+1.055,centris
+1.011,quadra
+0.882,se
+0.825,lc
+0.780,nubus
+0.772,want
+0.760,adb
… 6975 more positive …,… 6975 more positive …


In [6]:
# Render the decision-tree pipeline's feature weights (top=10),
# labelled with the newsgroup names.
eli5.show_weights(
    dt_classifier,
    top=10,
    target_names=test.target_names,
)


Weight,Feature
0.2434,mac
0.1993,apple
0.0832,controller
0.0795,dos
0.0619,pc
0.0531,quadra
0.0435,windows
0.0361,set
0.0356,card
0.0313,vlb


In [7]:
# Pick one test post, print its true newsgroup, then show which tokens drive
# the logistic-regression prediction for that post.
idx = 2
x = test.data[idx]
print(test.target_names[test.target[idx]])
eli5.show_prediction(
    lr_model,
    x,
    vec=lr_features,
    target_names=test.target_names,
)

comp.sys.ibm.pc.hardware


Contribution?,Feature
1.021,com
0.825,ibm
0.622,irq
0.617,card
0.514,manual
0.401,of
0.329,offer
0.317,yes
0.294,have
0.265,uses


In [8]:
# Explain the decision tree's prediction for the same test post (index `idx`),
# using the vectorizer the tree was trained with.
eli5.show_prediction(
    dt_model,
    test.data[idx],
    vec=dt_features,
    target_names=test.target_names,
)

Contribution?,Feature
0.505,<BIAS>
0.31,card
0.068,mac
0.052,apple
0.019,quadra
0.014,se
0.013,nubus
0.013,centris
0.012,powerbook
-0.014,vlb


# Ensemble Methods

In [9]:
from sklearn.ensemble import VotingClassifier

import random
# NOTE: this seeds only Python's `random` module. scikit-learn estimators do
# not read it, so reproducibility comes from the explicit random_state
# arguments below; the call is kept for any code that does use `random`.
random.seed(123)

features = CountVectorizer()

lr_model = LogisticRegression(C=1, solver='lbfgs')
lr_classifier = make_pipeline(features, lr_model)
lr_classifier.fit(train.data, train.target)

# min_samples_split=0.1 was reported by the author's sweep as giving the best
# F1 score. random_state pins the tree's tie-breaking between equal splits.
dt_model = DecisionTreeClassifier(min_samples_split=0.1, random_state=123)
dt_classifier = make_pipeline(features, dt_model)
dt_classifier.fit(train.data, train.target)

# Compare the two models (micro-averaged F1 on a single-label task == accuracy).
lr_train_preds = lr_classifier.predict(train.data)
lr_train_f1 = f1_score(train.target, lr_train_preds, average='micro')
lr_test_preds = lr_classifier.predict(test.data)
lr_test_f1 = f1_score(test.target, lr_test_preds, average='micro')
print("Train/test F1 for Logistic Regression: ", lr_train_f1, lr_test_f1)

dt_train_preds = dt_classifier.predict(train.data)
dt_train_f1 = f1_score(train.target, dt_train_preds, average='micro')
dt_test_preds = dt_classifier.predict(test.data)
dt_test_f1 = f1_score(test.target, dt_test_preds, average='micro')
print("Train/test F1 for Decision Tree: ", dt_train_f1, dt_test_f1)

# Classifier agreement on the test split. Both values are fractions in [0, 1]:
# the mean of a boolean array is the share of True entries.
print("\n% Cases where the two classifiers agree on test data: ", np.mean(lr_test_preds == dt_test_preds))
# Upper bound for any combination of the two models: share of test posts that
# at least one of them classifies correctly.
print("% Cases where one of the two classifiers has correct answer: ", np.mean(np.logical_or(lr_test_preds == test.target, dt_test_preds == test.target)))

# Soft-voting ensemble; weights [2, 1] (LR counted twice) were reported by the
# author's sweep as giving the best F1 score. VotingClassifier fits clones of
# lr_model / dt_model, so the individually fitted models above are untouched.
# Uses this cell's own `features` vectorizer instead of one from an earlier cell.
ensemble_classifier = make_pipeline(features, VotingClassifier(estimators=[('lr', lr_model), ('dt', dt_model)], voting='soft', weights=[2,1]))
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)


Train/test F1 for Logistic Regression:  0.9897260273972602 0.8082368082368082
Train/test F1 for Decision Tree:  0.9058219178082192 0.7799227799227799

% Cases where the two classifiers agree on test data:  0.7915057915057915
% Cases where one of the two classifiers has correct answer:  0.8983268983268983

Train/test F1 for Ensemble:  0.9888698630136986 0.8314028314028314


## Bagging

In [10]:
from sklearn.ensemble import RandomForestClassifier

import random
# NOTE: seeds only Python's `random` module, which scikit-learn does not use;
# the forest is made reproducible through random_state below.
random.seed(123)

# n_estimators=530 and min_samples_split=0.2 were reported by the author's
# sweep as the best settings (found before the seed was fixed - re-check).
# random_state pins the bootstrap samples and per-split feature subsets.
ensemble_classifier = make_pipeline(
    lr_features,
    RandomForestClassifier(n_estimators=530, min_samples_split=0.2, random_state=123),
)
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)



Train/test F1 for Ensemble:  0.9434931506849316 0.8365508365508365


## Boosting

In [11]:
import random

from sklearn.ensemble import AdaBoostClassifier

# Sweep n_estimators for AdaBoost at learning_rate=1.0.
# NOTE(review): the original comment here was an unfinished sentence about
# n_estimators=200 and learning_rate=1.0 - confirm the intended conclusion
# against the scores printed below.

# Seeds only Python's `random` module (not used by scikit-learn); the
# estimator itself is made reproducible through random_state.
random.seed(123)
for i in range(100,600,100):
    ensemble_classifier = make_pipeline(
        lr_features,
        AdaBoostClassifier(n_estimators=i, learning_rate=1.0, random_state=123),
    )
    ensemble_classifier.fit(train.data, train.target)

    ensemble_train_preds = ensemble_classifier.predict(train.data)
    ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
    ensemble_test_preds = ensemble_classifier.predict(test.data)
    ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
    # Tag each row with the setting that produced it so the sweep is readable.
    print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1, f"(n_estimators={i})")



Train/test F1 for Ensemble:  0.9392123287671232 0.833976833976834

Train/test F1 for Ensemble:  0.988013698630137 0.8223938223938224

Train/test F1 for Ensemble:  0.9897260273972602 0.8262548262548263

Train/test F1 for Ensemble:  0.9897260273972602 0.8223938223938224

Train/test F1 for Ensemble:  0.9897260273972602 0.821106821106821


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Sweep n_estimators from 300 to 490 (step 10) for gradient boosting with
# min_samples_split=0.3.
# NOTE(review): the original comment reported the best setting as
# n_estimators=400 with min_samples_split=0.1 (test F1 of 0.846), but this
# loop uses min_samples_split=0.3 - confirm which configuration was meant.
for i in range(300,500,10):
    # random_state pins the per-split feature permutation so that repeated
    # runs of this sweep print the same scores.
    ensemble_classifier = make_pipeline(
        lr_features,
        GradientBoostingClassifier(n_estimators=i, min_samples_split=0.3, random_state=123),
    )
    ensemble_classifier.fit(train.data, train.target)

    ensemble_train_preds = ensemble_classifier.predict(train.data)
    ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
    ensemble_test_preds = ensemble_classifier.predict(test.data)
    ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
    # Tag each row with the setting that produced it so the sweep is readable.
    print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1, f"(n_estimators={i})")



Train/test F1 for Ensemble:  0.9837328767123288 0.8365508365508365

Train/test F1 for Ensemble:  0.9837328767123288 0.8352638352638352

Train/test F1 for Ensemble:  0.9828767123287672 0.833976833976834

Train/test F1 for Ensemble:  0.9845890410958904 0.8404118404118404

Train/test F1 for Ensemble:  0.9837328767123288 0.8314028314028314

Train/test F1 for Ensemble:  0.9845890410958904 0.833976833976834

Train/test F1 for Ensemble:  0.985445205479452 0.8301158301158301

Train/test F1 for Ensemble:  0.9837328767123288 0.8352638352638352

Train/test F1 for Ensemble:  0.9837328767123288 0.8378378378378378

Train/test F1 for Ensemble:  0.9837328767123288 0.8326898326898327

Train/test F1 for Ensemble:  0.9828767123287672 0.8352638352638352

Train/test F1 for Ensemble:  0.9845890410958904 0.833976833976834

Train/test F1 for Ensemble:  0.9863013698630136 0.833976833976834

Train/test F1 for Ensemble:  0.9845890410958904 0.8288288288288288

Train/test F1 for Ensemble:  0.985445205479452 0.836

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Single gradient-boosting run with n_estimators=210 and min_samples_split=0.1,
# the setting the author reported as best (test F1 of about 0.83).
# random_state is fixed so the printed score is the same on every run; without
# it the score varied slightly between executions.
ensemble_classifier = make_pipeline(
    lr_features,
    GradientBoostingClassifier(n_estimators=210, min_samples_split=0.1, random_state=123),
)
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)



Train/test F1 for Ensemble:  0.9683219178082192 0.8352638352638352


# Comparing Bagging and Boosting

In [14]:
# Bagging, small min_samples_split (0.05 of the training samples):
# sweep the number of trees from 50 to 450 in steps of 50.
for n_est in range(50,500,50):
    # random_state pins bootstrap sampling and feature subsets, so differences
    # between rows come from n_estimators rather than from random noise.
    ensemble_classifier = make_pipeline(
        lr_features,
        RandomForestClassifier(n_estimators=n_est, min_samples_split=0.05, random_state=123),
    )
    ensemble_classifier.fit(train.data, train.target)

    ensemble_train_preds = ensemble_classifier.predict(train.data)
    ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
    ensemble_test_preds = ensemble_classifier.predict(test.data)
    ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
    print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

50 Train/test F1 for Ensemble:  0.976027397260274 0.8288288288288288
100 Train/test F1 for Ensemble:  0.978595890410959 0.8198198198198198
150 Train/test F1 for Ensemble:  0.9768835616438356 0.8352638352638352
200 Train/test F1 for Ensemble:  0.9794520547945206 0.8314028314028314
250 Train/test F1 for Ensemble:  0.9794520547945206 0.8391248391248392
300 Train/test F1 for Ensemble:  0.9794520547945206 0.8378378378378378
350 Train/test F1 for Ensemble:  0.9794520547945206 0.8352638352638352
400 Train/test F1 for Ensemble:  0.978595890410959 0.8352638352638352
450 Train/test F1 for Ensemble:  0.9794520547945206 0.8416988416988417


In [15]:
# Bagging, large min_samples_split (0.5 of the training samples):
# sweep the number of trees from 50 to 450 in steps of 50.
for n_est in range(50,500,50):
    # random_state pins bootstrap sampling and feature subsets, so differences
    # between rows come from n_estimators rather than from random noise.
    ensemble_classifier = make_pipeline(
        lr_features,
        RandomForestClassifier(n_estimators=n_est, min_samples_split=0.5, random_state=123),
    )
    ensemble_classifier.fit(train.data, train.target)

    ensemble_train_preds = ensemble_classifier.predict(train.data)
    ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
    ensemble_test_preds = ensemble_classifier.predict(test.data)
    ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
    print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

50 Train/test F1 for Ensemble:  0.839041095890411 0.7644787644787645
100 Train/test F1 for Ensemble:  0.8929794520547946 0.8018018018018018
150 Train/test F1 for Ensemble:  0.8955479452054794 0.821106821106821
200 Train/test F1 for Ensemble:  0.8929794520547946 0.821106821106821
250 Train/test F1 for Ensemble:  0.8955479452054794 0.8236808236808236
300 Train/test F1 for Ensemble:  0.8921232876712328 0.8365508365508365
350 Train/test F1 for Ensemble:  0.8981164383561644 0.8236808236808236
400 Train/test F1 for Ensemble:  0.9032534246575342 0.8301158301158301
450 Train/test F1 for Ensemble:  0.9049657534246576 0.8185328185328186


In [16]:
# Boosting, small min_samples_split (0.05 of the training samples):
# sweep the number of boosting stages from 50 to 450 in steps of 50.
for n_est in range(50,500,50):
    # random_state pins the per-split feature permutation, so differences
    # between rows come from n_estimators rather than from random noise.
    ensemble_classifier = make_pipeline(
        lr_features,
        GradientBoostingClassifier(n_estimators=n_est, min_samples_split=0.05, random_state=123),
    )
    ensemble_classifier.fit(train.data, train.target)

    ensemble_train_preds = ensemble_classifier.predict(train.data)
    ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
    ensemble_test_preds = ensemble_classifier.predict(test.data)
    ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
    print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

50 Train/test F1 for Ensemble:  0.8407534246575342 0.7953667953667954
100 Train/test F1 for Ensemble:  0.9238013698630136 0.8198198198198198
150 Train/test F1 for Ensemble:  0.9511986301369864 0.8365508365508365
200 Train/test F1 for Ensemble:  0.9708904109589042 0.8404118404118404
250 Train/test F1 for Ensemble:  0.9845890410958904 0.8326898326898327
300 Train/test F1 for Ensemble:  0.9863013698630136 0.8378378378378378
350 Train/test F1 for Ensemble:  0.9863013698630136 0.8365508365508365
400 Train/test F1 for Ensemble:  0.9863013698630136 0.8416988416988417
450 Train/test F1 for Ensemble:  0.9863013698630136 0.8391248391248392


In [17]:
# Boosting, large min_samples_split (0.5 of the training samples):
# sweep the number of boosting stages from 50 to 450 in steps of 50.
for n_est in range(50,500,50):
    # random_state pins the per-split feature permutation, so differences
    # between rows come from n_estimators rather than from random noise.
    ensemble_classifier = make_pipeline(
        lr_features,
        GradientBoostingClassifier(n_estimators=n_est, min_samples_split=0.5, random_state=123),
    )
    ensemble_classifier.fit(train.data, train.target)

    ensemble_train_preds = ensemble_classifier.predict(train.data)
    ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
    ensemble_test_preds = ensemble_classifier.predict(test.data)
    ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
    print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

50 Train/test F1 for Ensemble:  0.8210616438356164 0.7850707850707851
100 Train/test F1 for Ensemble:  0.877568493150685 0.8095238095238095
150 Train/test F1 for Ensemble:  0.9375 0.8301158301158301
200 Train/test F1 for Ensemble:  0.9503424657534246 0.8378378378378378
250 Train/test F1 for Ensemble:  0.9803082191780822 0.8365508365508365
300 Train/test F1 for Ensemble:  0.9828767123287672 0.8314028314028314
350 Train/test F1 for Ensemble:  0.9837328767123288 0.8352638352638352
400 Train/test F1 for Ensemble:  0.9837328767123288 0.8391248391248392
450 Train/test F1 for Ensemble:  0.9845890410958904 0.8275418275418276
