In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import scipy as sp
import pandas as pd
import csv
import random as rn
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import scipy.io as sio
import math

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.mlab as mlab
from sklearn import preprocessing

In [2]:
data = pd.read_csv('data_banknote_authentication.csv')
df_bna = pd.DataFrame(data)

In [3]:
df_bna

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [4]:
number_authentic = len(data[data.Class == 1])
number_not_authentic = len(data[data.Class == 0])
print('Total cases: {}'.format(number_authentic + number_not_authentic))
print('Number of authenticity: {}'.format(number_authentic))
print('Number of non-authenticity: {}'.format(number_not_authentic))

Total cases: 1372
Number of authenticity: 610
Number of non-authenticity: 762


In [5]:
# Drop the Class (label)
X = df_bna.drop('Class',axis=1)
# Create label
Y = df_bna['Class']

## Partition 2 (80/20 Split)

### SVM

In [6]:
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size = 0.2 ,random_state = 42)

In [7]:
# Calculate error given feature vectors X and labels Y.
def calc_error(X, Y, classifier):
    Y_pred = classifier.predict(X)   
    e = 1 - accuracy_score(Y, Y_pred) 
    return e

In [8]:
C_list = [0.1, 1, 10, 100, 1000]
opt_e_training = 1.0   # Optimal training error.
opt_classifier = None  # Optimal classifier.
opt_C          = None  # Optimal C.

for C in C_list:
    # Create a linear SVM classifier.
    classifier = svm.LinearSVC(penalty='l2', loss='hinge', C=C, max_iter=10000)
    
    # Use the classifier to fit the training set (use X_train, Y_train).
    classifier.fit(X_train_val, Y_train_val)

    # Obtain the weights and bias from the linear SVM classifier.
    W = classifier.coef_[0]
    b = classifier.intercept_[0]
    
    # Show decision boundary, training error and test error.
    print('C = {}'.format(C))
    print('Decision boundary: {:.3f}x0+{:.3f}x1+{:.3f}=0'.format(W[0],W[1],b))
    #vis(X_train_val, Y_train_val, W, b)
    e_training = calc_error(X_train_val, Y_train_val, classifier)
    print('Training accuracy: {}'.format(1 - e_training))
    print('\n\n\n')
    
    # Judge if it is the optimal one.
    if e_training < opt_e_training:
        opt_e_training = e_training
        opt_classifier = classifier
        opt_C = C

C = 0.1
Decision boundary: -0.944x0+-0.611x1+1.190=0
Training accuracy: 0.99179580674567




C = 1
Decision boundary: -2.250x0+-1.302x1+2.068=0
Training accuracy: 0.9899726526891522




C = 10
Decision boundary: -3.070x0+-1.730x1+2.854=0
Training accuracy: 0.9890610756608933




C = 100
Decision boundary: -6.678x0+-3.397x1+5.914=0
Training accuracy: 0.9908842297174111




C = 1000
Decision boundary: -3.506x0+-1.762x1+3.424=0
Training accuracy: 0.9890610756608933






In [9]:
# Obtain the weights and bias from the best linear SVM classifier.
opt_W = opt_classifier.coef_[0]
opt_b = opt_classifier.intercept_[0]
print('Best parameter C* = {}'.format(opt_C))
print('Decision boundary: {:.3f}x0+{:.3f}x1+{:.3f}=0'.format(opt_W[0],opt_W[1],opt_b))
print('Testing Accuracy: {}'.format(1 - calc_error(X_test, Y_test, opt_classifier)))

Best parameter C* = 0.1
Decision boundary: -0.944x0+-0.611x1+1.190=0
Testing Accuracy: 0.9854545454545455


### Logistic Regression

In [10]:
classifier = LogisticRegressionCV()
param = [
  {'penalty': ['l1'], 'solver': ['liblinear']},
  {'penalty': ['l2'], 'solver': ['newton-cg']},
 ]
logit = GridSearchCV(classifier, param, cv=5, return_train_score=True)
logit.fit(X_train_val, Y_train_val)
logit_train = logit.cv_results_['mean_train_score']
logit_test = logit.cv_results_['mean_test_score']

test_accX = logit.best_estimator_.predict(X_test) == Y_test
print('Best parameter: {}'.format(logit.best_params_))
#print(sum(test_accX))
test_acc = sum(test_accX)/len(test_accX)
print('Training accuracy: {}'.format(logit_train))
print('Validation score: {}'.format(logit_test))
print('Testining accuracy: {}'.format(test_acc))

Best parameter: {'penalty': 'l1', 'solver': 'liblinear'}
Training accuracy: [0.99179705 0.99361854]
Validation score: [0.99088423 0.98906108]
Testining accuracy: 0.9854545454545455


### Random Forest

In [11]:
param = {"max_features":[0.05,0.1,0.2,0.3,0.4,0.6,0.8,1],"n_estimators":[1024]}
classifier = RandomForestClassifier()

rf = GridSearchCV(classifier,param,cv=5, return_train_score=True)
rf.fit(X_train_val, Y_train_val)
rf_train = rf.cv_results_['mean_train_score']
rf_test = rf.cv_results_['mean_test_score']

test_accX = rf.best_estimator_.predict(X_test) == Y_test

#print(sum(test_accX))
test_acc = sum(test_accX)/len(test_accX)
print('Best parameter: {}'.format(rf.best_params_))
print('Training accuracy: {}'.format(rf_train))
print('Validation score: {}'.format(rf_test))
print('Testining accuracy: {}'.format(test_acc))

Best parameter: {'max_features': 0.6, 'n_estimators': 1024}
Training accuracy: [1. 1. 1. 1. 1. 1. 1. 1.]
Validation score: [0.99088423 0.99088423 0.98997265 0.98997265 0.98997265 0.99179581
 0.99088423 0.99088423]
Testining accuracy: 0.9927272727272727


### Decision Tree

In [12]:
param  = {"criterion":["entropy"],'max_depth':[1,2,3,4,5]}
classifier = DecisionTreeClassifier()

dt = GridSearchCV(classifier,param,cv=5, return_train_score=True)
dt.fit(X_train_val, Y_train_val)
dt_train = dt.cv_results_['mean_train_score']
dt_test = dt.cv_results_['mean_test_score']

test_accX = dt.best_estimator_.predict(X_test) == Y_test
#print(sum(test_accX))
test_acc = sum(test_accX)/len(test_accX)
print('Best parameter: {}'.format(dt.best_params_))
print('Training accuracy: {}'.format(dt_train))
print('Validation score: {}'.format(dt_test))
print('Testining accuracy: {}'.format(test_acc))

Best parameter: {'criterion': 'entropy', 'max_depth': 5}
Training accuracy: [0.86417476 0.91066458 0.96262512 0.97948951 0.9958979 ]
Validation score: [0.86326345 0.89881495 0.94804011 0.9562443  0.97812215]
Testining accuracy: 0.9781818181818182
