In [52]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import metrics as m
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score 
import pandas as pd

adults_raw = pd.read_csv("./Adult/adult.data", header=None)
enc = OrdinalEncoder(categories='auto', dtype=int)
enc.fit(adults_raw)
adults=enc.transform(adults_raw)
adults_input = adults[:,0:14]
adults_input
adults_target = adults[:,14]
adults_target

array([0, 0, 0, ..., 0, 0, 1])

In [53]:
tclf = tree.DecisionTreeClassifier()
tclf.fit(adults_input,adults_target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [54]:
adults_raw_test = pd.read_csv("./Adult/adult.test", header=None)
enc = OrdinalEncoder(categories='auto', dtype=int)
enc.fit(adults_raw_test)
adults_test=enc.transform(adults_raw_test)
adults_input_test = adults_test[:,0:14]
adults_input_test
adults_target_test = adults_test[:,14]
adults_target_test

array([0, 0, 1, ..., 0, 0, 1])

In [55]:
tclf.score(adults_input_test,adults_target_test)
adults_test_predicted = tclf.predict(adults_input_test)
cnf_matrix = m.confusion_matrix(adults_test_predicted, adults_target_test)
print(cnf_matrix)
print(m.accuracy_score(adults_target_test, adults_test_predicted))
print(m.precision_score(adults_target_test, adults_test_predicted))
print(m.recall_score(adults_target_test, adults_test_predicted))
print(m.f1_score(adults_target_test, adults_test_predicted))

[[10798  1600]
 [ 1637  2246]]
0.8011792887414778
0.5784187483904197
0.5839833593343734
0.5811877345064044


In [56]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(adults_input,adults_target)
gnb.score(adults_input_test,adults_target_test)
gnb_adults_test_predicted = gnb.predict(adults_input_test)
cnf_matrix = m.confusion_matrix(gnb_adults_test_predicted, adults_target_test)
print(cnf_matrix)
print(m.accuracy_score(adults_target_test, gnb_adults_test_predicted))
print(m.precision_score(adults_target_test, gnb_adults_test_predicted))
print(m.recall_score(adults_target_test, gnb_adults_test_predicted))
print(m.f1_score(adults_target_test, gnb_adults_test_predicted))

[[11627  2106]
 [  808  1740]]
0.821018364965297
0.6828885400313972
0.45241809672386896
0.5442602439787301


In [57]:
from sklearn.svm import SVC
svcl = SVC(gamma='scale')
svcl.fit(adults_input,adults_target)
svcl.score(adults_input_test,adults_target_test)
svcl_adults_test_predicted = svcl.predict(adults_input_test)
cnf_matrix = m.confusion_matrix(svcl_adults_test_predicted, adults_target_test)
print(cnf_matrix)
print(m.accuracy_score(adults_target_test, svcl_adults_test_predicted))
print(m.precision_score(adults_target_test, svcl_adults_test_predicted))
print(m.recall_score(adults_target_test, svcl_adults_test_predicted))
print(m.f1_score(adults_target_test, svcl_adults_test_predicted))

[[12205  3044]
 [  230   802]]
0.7989067010625883
0.7771317829457365
0.20852834113364535
0.3288232882328823


-----------

In [58]:
internet_raw = pd.read_csv('./internet/internet.txt', sep=' ',header=None)
enc = OrdinalEncoder(categories='auto', dtype=int)
enc.fit(internet_raw)
internet=enc.transform(internet_raw)
internet_input = internet[:,0:4]
internet_target = internet[:,4]

In [59]:
tclf = tree.DecisionTreeClassifier()
tclf.fit(internet_input,internet_target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [81]:
enc = LabelBinarizer()
enc.fit(internet_target)
internet_target_bin=enc.transform(internet_target)
cvs_tree=cross_val_score(tclf,internet_input, internet_target_bin, scoring='accuracy', cv=5)
score=sum(cvs_tree)/len(cvs_tree)
print('Avg. 5 fold cross validation accuracy: '+str(score))

cvs_tree=cross_val_score(tclf,internet_input, internet_target_bin, scoring='precision', cv=5)
score=sum(cvs_tree)/len(cvs_tree)
print('Avg. 5 fold cross validation precision: '+str(score))

cvs_tree=cross_val_score(tclf,internet_input, internet_target_bin, scoring='recall', cv=5)
score=sum(cvs_tree)/len(cvs_tree)
print('Avg. 5 fold cross validation recall: '+str(score))

cvs_tree=cross_val_score(tclf,internet_input, internet_target_bin, scoring='f1', cv=5)
score=sum(cvs_tree)/len(cvs_tree)
print('Avg. 5 fold cross validation F1-score: '+str(score))

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
y_pred = cross_val_predict(tclf, internet_input, internet_target, cv=5)
cnf = confusion_matrix(internet_target, y_pred)
print('Confusion matrix')
print(cnf)

Avg. 5 fold cross validation accuracy: 0.9938461538461538
Avg. 5 fold cross validation precision: 0.9818181818181818
Avg. 5 fold cross validation recall: 1.0
Avg. 5 fold cross validation F1-score: 0.9904761904761905
Confusion matrix
[[225   0]
 [  0  97]]
