In [1]:
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
import pandas as pd
from skimage.transform import rescale, resize, downscale_local_mean
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pickle

In [2]:
# Load the digits dataset from sklearn.datasets; the experiment cells further
# down pass its `.images` and `.target` attributes to run_with_hyperparameters.
digits = datasets.load_digits()

def run_with_hyperparameters(imgs, target, clf_name, iteraction, random_state=None):
    """Tune one classifier on a fresh train/dev/test split and report the winner.

    The samples are flattened to one row each, shuffled and split into roughly
    80% train, with the remaining ~20% divided between test and dev. Every
    candidate hyperparameter setting is fitted on the train split and scored on
    all three splits; the candidate with the highest *dev* accuracy is kept
    (the first one wins on ties), pickled to ``best_model_<clf_name>.sav`` in
    the current working directory, and summarised on stdout.

    Parameters
    ----------
    imgs : array-like
        Samples; must support ``len()`` and ``.reshape((n_samples, -1))``.
    target : array-like
        Labels, one per sample.
    clf_name : str
        ``'SVM'`` selects ``svm.SVC`` with paired (gamma, C) candidates. Any
        other value selects ``DecisionTreeClassifier`` with ``max_depth``
        candidates; the value is also used in the printed headings and in the
        pickle file name.
    iteraction : int
        Run index, used only in the printed headings.
    random_state : int, RandomState instance or None, default=None
        Forwarded to both ``train_test_split`` calls and to
        ``DecisionTreeClassifier``. ``None`` keeps the original unseeded
        behaviour; pass an int to make a run reproducible.

    Returns
    -------
    tuple
        ``(best_test_accuracy, no_of_correct_pred)``: the test accuracy of the
        selected model and the number of test samples it predicts correctly.
    """
    gamma_lst = [0.02, 0.007, 0.003, 0.0009, 0.0001, 0.0006]
    c_lst = [0.1, 0.3, 0.8, 0.7, 2, 0.4]

    use_svm = clf_name == 'SVM'
    if use_svm:
        # gamma and C are paired positionally; this is not a full grid.
        h_params = [{'gamma': g, 'C': c} for g, c in zip(gamma_lst, c_lst)]
    else:
        h_params = [{'max_depth': m} for m in [None, 3, 4, 5, 6, 8]]

    train_f = 0.8
    dev_f = 0.1

    n_samples = len(imgs)
    data = imgs.reshape((n_samples, -1))

    # First carve off everything that is not training data, then divide that
    # remainder between test and dev.
    dev_test_f = 1-train_f
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(
        data, target, test_size=dev_test_f, shuffle=True, random_state=random_state
    )
    X_test, X_dev, y_test, y_dev = train_test_split(
        X_dev_test, y_dev_test, test_size=(dev_f)/dev_test_f, shuffle=True,
        random_state=random_state
    )

    best_model = None
    best_h_params = None
    best_train_accuracy = -1.0
    best_dev_accuracy = -1.0
    best_test_accuracy = -1.0

    h_param_results = []
    for cur_h_params in h_params:
        if use_svm:
            clf = svm.SVC()
        else:
            clf = DecisionTreeClassifier(random_state=random_state)
        clf.set_params(**cur_h_params)
        clf.fit(X_train, y_train)

        cur_acc = metrics.accuracy_score(y_pred=clf.predict(X_dev), y_true=y_dev)
        cur_train_acc = metrics.accuracy_score(y_pred=clf.predict(X_train), y_true=y_train)
        cur_test_acc = metrics.accuracy_score(y_pred=clf.predict(X_test), y_true=y_test)

        if use_svm:
            h_param_results.append(
                [cur_h_params['gamma'], cur_h_params['C'], cur_acc, cur_train_acc, cur_test_acc]
            )
        else:
            h_param_results.append(
                [cur_h_params['max_depth'], cur_acc, cur_train_acc, cur_test_acc]
            )

        # Model selection looks at dev accuracy only; the train/test figures of
        # the selected candidate are carried along for reporting.
        if cur_acc > best_dev_accuracy:
            best_dev_accuracy = cur_acc
            best_train_accuracy = cur_train_acc
            best_test_accuracy = cur_test_acc
            best_model = clf
            best_h_params = cur_h_params

    # Persist the winner once, and close the file handle deterministically.
    with open('best_model_'+clf_name+'.sav', 'wb') as model_file:
        pickle.dump(best_model, model_file)

    if use_svm:
        result_columns = ['Gamma', 'C', 'Dev_Accuracy', 'Train_Accuracy', 'Test_Accuracy']
    else:
        result_columns = ['Max_Depth', 'Dev_Accuracy', 'Train_Accuracy', 'Test_Accuracy']
    df_h_param_results = pd.DataFrame(h_param_results, columns=result_columns)

    print(f"\n===================Hyperparameter and Results for {clf_name} for iteration {str(iteraction)}=======================")
    print(df_h_param_results.head(10))

    predicted = best_model.predict(X_test)

    print(f"\n===============Best hyperparameters and Accuracy for {clf_name} for iteration {str(iteraction)}====================")
    if use_svm:
        print("Best C:\t\t\t\t\t" + str(best_h_params['C']) + " and Gamma:" + str(best_h_params['gamma']) )
    else:
        print("Best Max_Depth:\t\t\t\t" + str(best_h_params['max_depth']))

    print("Best Dev Accuracy:\t\t\t" + str(best_dev_accuracy))
    print("Best Train Accuracy:\t\t\t" + str(best_train_accuracy))
    print("Best Test Accuracy:\t\t\t" + str(best_test_accuracy))

    no_of_correct_pred = sum(1 for actual, guess in zip(y_test, predicted) if actual == guess)
    print("Best No of Correct Prediction:\t\t" + str(no_of_correct_pred))
    print("\n")

    return best_test_accuracy, no_of_correct_pred


In [3]:
# Number of repeated runs performed for each classifier.
t = 5

# Per-run results collected by the experiment loops in the following cells:
# the selected model's test accuracy and its count of correct test predictions.
SVM_accu, DT_accu = [], []
SVM_correct_count, DT_correct_count = [], []

In [4]:
# Start from empty lists so that re-running this cell replaces the previous
# SVM results instead of appending a second batch to them (stale entries would
# otherwise leak into the mean/SD summary cells).
SVM_accu = []
SVM_correct_count = []
for i in range(t):
    acc, cc = run_with_hyperparameters(digits.images, digits.target, 'SVM', i)
    SVM_accu.append(acc)
    SVM_correct_count.append(cc)


    Gamma    C  Dev_Accuracy  Train_Accuracy  Test_Accuracy
0  0.0200  0.1      0.077348        0.105776       0.089385
1  0.0070  0.3      0.342541        0.639527       0.307263
2  0.0030  0.8      0.977901        1.000000       0.983240
3  0.0009  0.7      0.983425        0.999304       0.988827
4  0.0001  2.0      0.972376        0.990953       0.977654
5  0.0006  0.4      0.983425        0.994433       0.977654

Best C:					0.7 and Gamma:0.0009
Best Dev Accuracy:			0.9834254143646409
Best Train Accuracy:			0.9993041057759221
Best Test Accuracy:			0.9888268156424581
Best No of Correct Prediction:		177



    Gamma    C  Dev_Accuracy  Train_Accuracy  Test_Accuracy
0  0.0200  0.1      0.082873        0.106472       0.078212
1  0.0070  0.3      0.320442        0.642310       0.273743
2  0.0030  0.8      1.000000        1.000000       0.983240
3  0.0009  0.7      0.994475        0.997912       1.000000
4  0.0001  2.0      0.972376        0.986778       0.988827
5  0.0006  0.4      0.9

In [56]:
# Start from empty lists so that re-running this cell replaces the previous
# Decision Tree results instead of appending a second batch to them (stale
# entries would otherwise leak into the mean/SD summary cells).
DT_accu = []
DT_correct_count = []
for i in range(t):
    acc, cc = run_with_hyperparameters(digits.images, digits.target, 'Decision Tree', i)
    DT_accu.append(acc)
    DT_correct_count.append(cc)


   Max_Depth  Dev_Accuracy  Train_Accuracy  Test_Accuracy
0        NaN      0.773481        1.000000       0.832402
1        3.0      0.425414        0.500348       0.446927
2        4.0      0.580110        0.601253       0.508380
3        5.0      0.657459        0.705637       0.648045
4        6.0      0.740331        0.818372       0.821229
5        8.0      0.795580        0.937370       0.849162

Best Max_Depth:				8
Best Dev Accuracy:			0.7955801104972375
Best Train Accuracy:			0.9373695198329853
Best Test Accuracy:			0.8491620111731844
Best No of Correct Prediction:		152



   Max_Depth  Dev_Accuracy  Train_Accuracy  Test_Accuracy
0        NaN      0.845304        1.000000       0.860335
1        3.0      0.436464        0.495477       0.491620
2        4.0      0.519337        0.594990       0.620112
3        5.0      0.646409        0.702853       0.698324
4        6.0      0.707182        0.819763       0.776536
5        8.0      0.801105        0.929715       0.832402

Be

In [57]:
def _print_accuracy_row(label, svm_value, dt_value):
    """Print one table row: label, then both values rounded to 4 decimals."""
    print(f"\n{label}:\t\t\t{str(round(svm_value,4))}\t\t\t{str(round(dt_value,4))}")


# Test accuracy of the selected model per run, followed by mean and SD.
print("\n================================SVM vs Decision Tree Accuracy comparision===================================")
print("\nRun\t\t\tSVM\t\t\tDecision Tree")
for run in range(t):
    _print_accuracy_row(run, SVM_accu[run], DT_accu[run])

_print_accuracy_row("Mean", np.mean(SVM_accu), np.mean(DT_accu))

_print_accuracy_row("SD", np.std(SVM_accu), np.std(DT_accu))




Run			SVM			Decision Tree

0:			0.9888			0.8492

1:			0.9944			0.8603

2:			0.9944			0.8603

3:			0.9944			0.8324

4:			0.9832			0.8212

Mean:			0.9911			0.8447

SD:			0.0045			0.0156


In [58]:
def _print_count_row(label, svm_value, dt_value):
    """Print one table row: label, then both values rounded to 4 decimals."""
    print(f"\n{label}:\t\t\t{str(round(svm_value,4))}\t\t\t{str(round(dt_value,4))}")


# Correct test predictions of the selected model per run, then mean and SD.
print("\n")
print("\n======================SVM vs Decision Tree Number of currect prediction comparision========================")
print("\nRun\t\t\tSVM\t\t\tDecision Tree")
for run in range(t):
    _print_count_row(run, SVM_correct_count[run], DT_correct_count[run])

_print_count_row("Mean", np.mean(SVM_correct_count), np.mean(DT_correct_count))

_print_count_row("SD", np.std(SVM_correct_count), np.std(DT_correct_count))





Run			SVM			Decision Tree

0:			177			152

1:			178			154

2:			178			154

3:			178			149

4:			176			147

Mean:			177.4			151.2

SD:			0.8			2.7857
