## Import packages
Make sure you installed ***eli5***, ***sklearn***, ***matplotlib*** and ***numpy*** if you use your local machine

In [7]:
import eli5
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Compare Logistic Regression and Decision Tree

## Prepare dataset and Pick two classes
Your two classes should be similar, but opposite in some sense

In [8]:
# categories = ['alt.atheism', 'soc.religion.christian']
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
# categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'),)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'),)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1168
test data size: 777


## Compare Logistic Regression and Decision Tree models


In [9]:
lr_model = LogisticRegression(C=1, solver='newton-cg')
lr_features = CountVectorizer() 
lr_classifier = make_pipeline(lr_features, lr_model)
lr_classifier.fit(train.data, train.target)

dt_model = DecisionTreeClassifier(min_samples_split=0.4)
dt_features = CountVectorizer()
dt_classifier = make_pipeline(dt_features, dt_model)
dt_classifier.fit(train.data, train.target)

#Compare accuracy of the two models
lr_train_preds = lr_classifier.predict(train.data)
lr_train_f1 = f1_score(train.target, lr_train_preds, average='micro')
lr_test_preds = lr_classifier.predict(test.data)
lr_test_f1 = f1_score(test.target, lr_test_preds, average='micro')
print("Train/test F1 for Logistic Regression: ", lr_train_f1, lr_test_f1)

dt_train_preds = dt_classifier.predict(train.data)
dt_train_f1 = f1_score(train.target, dt_train_preds, average='micro')
dt_test_preds = dt_classifier.predict(test.data)
dt_test_f1 = f1_score(test.target, dt_test_preds, average='micro')
print("Train/test F1 for Decision Tree: ", dt_train_f1, dt_test_f1)

Train/test F1 for Logistic Regression:  0.9897260273972602 0.8095238095238095
Train/test F1 for Decision Tree:  0.7722602739726028 0.7400257400257402


In [10]:
eli5.show_weights(lr_classifier, top=20, target_names=test.target_names)

NotImplementedError: transform_feature_names not available for CountVectorizer()

In [None]:
eli5.show_weights(dt_classifier, top=5, target_names=test.target_names)
#Play with the min_samples_split parameter when creating dt_classifier and see how the tree changes
#TODO FOR STUDENT: What happens when to the tree when you modify the min_samples_split_parameter

Weight,Feature
0.2449,mac
0.2005,apple
0.0837,controller
0.0800,dos
0.0622,pc
… 11926 more …,… 11926 more …


In [11]:
idx = 2
x = test.data[idx]
print(x)


[REMAINDER DELETED]

I don't have my copy of the manual with me right now, but I can offer the
following in the interim:

   1)  The card uses port addresses 0x2E0 and 0x2E8 (which are NOT
       configurable).  These addresses, incidentally, were inadvertantly
       omitted from my version of the manual.

   2)  I believe there is a dip that controls whether or not to enable
       IRQ 2 (for CGA or EGA support??!?).

Lance Hartmann (lance%hartmann.austin.ibm.com@ibmpa.awdpa.ibm.com)
               Yes, that IS a '%' (percent sign) in my network address.


In [None]:
idx = 2
x = test.data[idx]
print(test.target_names[test.target[idx]])
eli5.show_prediction(lr_model, test.data[idx], vec=lr_features, target_names=test.target_names)

comp.sys.ibm.pc.hardware




Contribution?,Feature
4.876,Highlighted in text (sum)
-0.526,<BIAS>


In [None]:
eli5.show_prediction(dt_model, test.data[idx], vec=dt_features, target_names=test.target_names)



Contribution?,Feature
0.505,<BIAS>
0.23,Highlighted in text (sum)
0.068,mac
0.052,apple
0.019,quadra
0.014,se
0.013,powerbook
0.013,centris
0.012,lc
-0.014,vlb


# Ensemble Methods

In [17]:


features = CountVectorizer()

lr_model = LogisticRegression(C=1, solver='lbfgs')
lr_classifier = make_pipeline(features, lr_model)
lr_classifier.fit(train.data, train.target)

#TODO FOR STUDENT: Try playing with the min_samples_split to see how it affect the ensemble score
dt_model = DecisionTreeClassifier(min_samples_split=0.35)
dt_classifier = make_pipeline(features, dt_model)
dt_classifier.fit(train.data, train.target)

#Compare accuracy of the two models
lr_train_preds = lr_classifier.predict(train.data)
lr_train_f1 = f1_score(train.target, lr_train_preds, average='micro')
lr_test_preds = lr_classifier.predict(test.data)
lr_test_f1 = f1_score(test.target, lr_test_preds, average='micro')
print("Train/test F1 for Logistic Regression: ", lr_train_f1, lr_test_f1)

dt_train_preds = dt_classifier.predict(train.data)
dt_train_f1 = f1_score(train.target, dt_train_preds, average='micro')
dt_test_preds = dt_classifier.predict(test.data)
dt_test_f1 = f1_score(test.target, dt_test_preds, average='micro')
print("Train/test F1 for Decision Tree: ", dt_train_f1, dt_test_f1)

#Look at classifier agreement
print("\n% Cases where the two classifiers agree on test data: ", np.sum(lr_test_preds == dt_test_preds)/len(lr_test_preds))
print("% Cases where one of the two classifiers has correct answer: ", np.sum(np.logical_or(lr_test_preds == test.target, dt_test_preds == test.target)/len(lr_test_preds)))

#Try to build an ensemble combing both models
#TODO FOR STUDENT: Modify the weights parameter which give different weight to each of the classifiers
ensemble_classifier = make_pipeline(lr_features, VotingClassifier(estimators=[('lr', lr_model), ('dt', dt_model)], voting = 'soft',  weights=[1,1]))
ensemble_classifier.fit(train.data, train.target) 

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)


Train/test F1 for Logistic Regression:  0.9897260273972602 0.8082368082368082
Train/test F1 for Decision Tree:  0.803082191780822 0.7747747747747747

% Cases where the two classifiers agree on test data:  0.7786357786357786
% Cases where one of the two classifiers has correct answer:  0.9021879021879022

Train/test F1 for Ensemble:  0.976027397260274 0.8404118404118404


## Bagging

In [23]:
from sklearn.ensemble import RandomForestClassifier

#TODO FOR STUDENT: Try playing with n_estimators and min_samples_split
ensemble_classifier = make_pipeline(lr_features, RandomForestClassifier(n_estimators=500, min_samples_split=0.3))
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)



Train/test F1 for Ensemble:  0.922945205479452 0.824967824967825


## Boosting

In [24]:
from sklearn.ensemble import AdaBoostClassifier
ensemble_classifier = make_pipeline(lr_features, AdaBoostClassifier(n_estimators=50, learning_rate=0.2))
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)



Train/test F1 for Ensemble:  0.788527397260274 0.7760617760617761


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#TODO FOR STUDENT: Try playing with n_estimators and min_samples_split
ensemble_classifier = make_pipeline(lr_features, GradientBoostingClassifier(n_estimators=50, min_samples_split=0.25))
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)



Train/test F1 for Ensemble:  0.8210616438356164 0.7850707850707851


In [None]:
from xgboost import XGBClassifier

#TODO FOR STUDENT: Try playing with n_estimators and min_samples_split
ensemble_classifier = make_pipeline(lr_features, XGBClassifier(n_estimators=50, max_depth=8))
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)





Train/test F1 for Ensemble:  0.928082191780822 0.8108108108108109


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#TODO FOR STUDENT: Try playing with n_estimators and min_samples_split
ensemble_classifier = make_pipeline(lr_features, GradientBoostingClassifier(n_estimators=100, min_samples_split=0.2))
ensemble_classifier.fit(train.data, train.target)

ensemble_train_preds = ensemble_classifier.predict(train.data)
ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
ensemble_test_preds = ensemble_classifier.predict(test.data)
ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
print("\nTrain/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)



Train/test F1 for Ensemble:  0.877568493150685 0.8095238095238095


# Comparing Bagging and Boosting

In [34]:
for n_est in range(10,200,15):
  ensemble_classifier = make_pipeline(lr_features, RandomForestClassifier(n_estimators=n_est, min_samples_split=0.01))
  ensemble_classifier.fit(train.data, train.target)

  ensemble_train_preds = ensemble_classifier.predict(train.data)
  ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
  ensemble_test_preds = ensemble_classifier.predict(test.data)
  ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
  print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

10 Train/test F1 for Ensemble:  0.9726027397260274 0.7722007722007722
25 Train/test F1 for Ensemble:  0.988013698630137 0.8082368082368082
40 Train/test F1 for Ensemble:  0.9888698630136986 0.824967824967825
55 Train/test F1 for Ensemble:  0.9897260273972602 0.824967824967825
70 Train/test F1 for Ensemble:  0.9888698630136986 0.8301158301158301
85 Train/test F1 for Ensemble:  0.9888698630136986 0.8365508365508365
100 Train/test F1 for Ensemble:  0.9888698630136986 0.8326898326898327
115 Train/test F1 for Ensemble:  0.9888698630136986 0.8532818532818534
130 Train/test F1 for Ensemble:  0.9888698630136986 0.833976833976834
145 Train/test F1 for Ensemble:  0.9888698630136986 0.842985842985843
160 Train/test F1 for Ensemble:  0.9897260273972602 0.8416988416988417
175 Train/test F1 for Ensemble:  0.9897260273972602 0.833976833976834
190 Train/test F1 for Ensemble:  0.9888698630136986 0.8262548262548263


In [38]:
for n_est in range(50,201,10):
  ensemble_classifier = make_pipeline(lr_features, RandomForestClassifier(n_estimators=n_est, min_samples_split=0.25))
  ensemble_classifier.fit(train.data, train.target)

  ensemble_train_preds = ensemble_classifier.predict(train.data)
  ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
  ensemble_test_preds = ensemble_classifier.predict(test.data)
  ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
  print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

50 Train/test F1 for Ensemble:  0.9212328767123288 0.8275418275418276
60 Train/test F1 for Ensemble:  0.925513698630137 0.8301158301158301
70 Train/test F1 for Ensemble:  0.9315068493150684 0.8223938223938224
80 Train/test F1 for Ensemble:  0.9306506849315068 0.833976833976834
90 Train/test F1 for Ensemble:  0.9238013698630136 0.8275418275418276
100 Train/test F1 for Ensemble:  0.9220890410958904 0.8378378378378378
110 Train/test F1 for Ensemble:  0.9297945205479452 0.8288288288288288
120 Train/test F1 for Ensemble:  0.9272260273972602 0.8352638352638352
130 Train/test F1 for Ensemble:  0.9332191780821918 0.8378378378378378
140 Train/test F1 for Ensemble:  0.922945205479452 0.8326898326898327
150 Train/test F1 for Ensemble:  0.9272260273972602 0.8198198198198198
160 Train/test F1 for Ensemble:  0.9340753424657534 0.8404118404118404
170 Train/test F1 for Ensemble:  0.9323630136986302 0.8262548262548263
180 Train/test F1 for Ensemble:  0.928082191780822 0.8416988416988417
190 Train/test 

In [53]:
for n_est in range(40,401,20):
  ensemble_classifier = make_pipeline(lr_features, GradientBoostingClassifier(n_estimators=n_est, min_samples_split=0.009))
  ensemble_classifier.fit(train.data, train.target)

  ensemble_train_preds = ensemble_classifier.predict(train.data)
  ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
  ensemble_test_preds = ensemble_classifier.predict(test.data)
  ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
  print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

40 Train/test F1 for Ensemble:  0.8176369863013698 0.7824967824967825
60 Train/test F1 for Ensemble:  0.8544520547945207 0.7966537966537967
80 Train/test F1 for Ensemble:  0.9101027397260274 0.8056628056628057
100 Train/test F1 for Ensemble:  0.9323630136986302 0.8159588159588159
120 Train/test F1 for Ensemble:  0.9383561643835615 0.821106821106821
140 Train/test F1 for Ensemble:  0.9563356164383562 0.8262548262548263
160 Train/test F1 for Ensemble:  0.9606164383561644 0.8223938223938224
180 Train/test F1 for Ensemble:  0.9666095890410958 0.8301158301158301
200 Train/test F1 for Ensemble:  0.976027397260274 0.8236808236808236
220 Train/test F1 for Ensemble:  0.985445205479452 0.824967824967825
240 Train/test F1 for Ensemble:  0.9845890410958904 0.8275418275418276
260 Train/test F1 for Ensemble:  0.985445205479452 0.8262548262548263
280 Train/test F1 for Ensemble:  0.9863013698630136 0.8314028314028314
300 Train/test F1 for Ensemble:  0.9863013698630136 0.8314028314028314
320 Train/test

In [None]:
for n_est in range(50,500,50):
  ensemble_classifier = make_pipeline(lr_features, GradientBoostingClassifier(n_estimators=n_est, min_samples_split=0.5))
  ensemble_classifier.fit(train.data, train.target)

  ensemble_train_preds = ensemble_classifier.predict(train.data)
  ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
  ensemble_test_preds = ensemble_classifier.predict(test.data)
  ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
  print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)

In [None]:
for n_est in range(50,500,50):
  ensemble_classifier = make_pipeline(lr_features, GradientBoostingClassifier(n_estimators=n_est, min_samples_split=0.05))
  ensemble_classifier.fit(train.data, train.target)

  ensemble_train_preds = ensemble_classifier.predict(train.data)
  ensemble_train_f1 = f1_score(train.target, ensemble_train_preds, average='micro')
  ensemble_test_preds = ensemble_classifier.predict(test.data)
  ensemble_test_f1 = f1_score(test.target, ensemble_test_preds, average='micro')
  print(n_est, "Train/test F1 for Ensemble: ", ensemble_train_f1, ensemble_test_f1)