#  Intro to Hardware Security and Trust Final Project

## Import Libraries

In [1]:
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.cluster import KMeans,DBSCAN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import random
import os
from statistics import mean


## Dataset Splitting Function

In [2]:
xlsx_path = 'C:\\Users\patri\\git\\HSTFinalProject\\Classifier-for-Hardware-Trojan-Detection\\ROFreq\\ROFreq\\Chip'
csv_path ='C:\\Users\patri\\git\HSTFinalProject\Classifier-for-Hardware-Trojan-Detection\ROFreq_C\Chip'

for i in np.arange(1,34):
    data_xls = pd.read_excel('{}{}.xlsx'.format(xlsx_path,i), dtype=str, index_col=None)
    data_xls.to_csv('{}{}.csv'.format(csv_path,i), encoding='utf-8', index=False)

In [3]:
def csv_read(chip_num,type,csv_path):
    data= []
    data_list= []
    label_list = np.zeros(25)
    label_list[0] = 1
    label_list[-1] = 1
    with open('{}{}.csv'.format(csv_path,chip_num)) as csvfile:
        csv_reader = csv.reader(csvfile)
   
        for row in csv_reader:
            # rounding
            row = [round(float(num[0:12]),3) for num in row]     
            data_list.append(row)   

        if type == 'TI':
            data.append(data_list[0])
            data.append(data_list[-1])
            label = [1,1]
        elif type == 'TF':
            data = data_list[1:-1]
            label = label_list[1:-1]
        else:
            data = data_list
            label = label_list 
                  
        
    return data,label

In [4]:
def train_test_split(num_sel,case_sel,csv_path):
    chips_num = np.arange(1,34)
    indices = range(33)
    # randomly select #num chips
    sel = random.sample(indices,num_sel)
    # split tain and test data
    train_chips = chips_num[sel]
    test_chips = np.delete(chips_num,sel)
    print('Training chips num: ',train_chips)
    temp = []

    for i in range(int(num_sel/2)):
        temp.extend(['TF','TI'])
 
    type_sel_1 = random.sample(temp,num_sel)
    type_sel_3 = random.choices(['TI','TF'],k=num_sel)
    
    if case_sel == 1:
        print('Type selection:', type_sel_1)
        print('TI:',type_sel_1.count('TI'))
        print('TF:',type_sel_1.count('TF'))

    else:
        print('Type selection:', type_sel_3)
        print('TI:',type_sel_3.count('TI'))
        print('TF:',type_sel_3.count('TF'))

    cnt = 0
    train_data = []
    train_label = []

    for chip_num in train_chips:
        if case_sel == 1:
            data,label = csv_read(chip_num,type_sel_1[cnt],csv_path)
        elif case_sel ==3:
            data,label = csv_read(chip_num,type_sel_3[cnt],csv_path)
        train_data.extend(data)
        train_label.extend(label)
        cnt += 1

    test_data = []
    test_label = []
    for chip_num in test_chips:
        data,label = csv_read(chip_num,'ALL',csv_path)
        test_data.extend(data)
        test_label.extend(label)
    
    return train_data, train_label, test_data, test_label

##  CASE 1

### Case 1 Datasets

In [5]:
# arg1 => number of training samples
# arg2 => number of case (enter 1 or 3 only)
train24_data, train24_label, test24_data, test24_label = train_test_split(24,1,csv_path)
train12_data, train12_label, test12_data, test12_label = train_test_split(12,1,csv_path)
train6_data, train6_label, test6_data, test6_label = train_test_split(6,1,csv_path)

Training chips num:  [25 28 14 33 30 15  2 32 17 31 10 22 20  5  9  6 11  7  8 23 21 24 29 16]
Type selection: ['TF', 'TI', 'TI', 'TI', 'TF', 'TI', 'TF', 'TI', 'TF', 'TI', 'TF', 'TI', 'TI', 'TI', 'TF', 'TI', 'TF', 'TF', 'TF', 'TI', 'TF', 'TF', 'TF', 'TI']
TI: 12
TF: 12
Training chips num:  [23 27 12  8 17 28  1 33  9 30 22 31]
Type selection: ['TF', 'TI', 'TI', 'TF', 'TF', 'TI', 'TI', 'TI', 'TI', 'TF', 'TF', 'TF']
TI: 6
TF: 6
Training chips num:  [23 18 17 24 28 33]
Type selection: ['TI', 'TF', 'TI', 'TF', 'TI', 'TF']
TI: 3
TF: 3


In [6]:
# 24 Training - Create Dataframe objects and compute correlations
train24_data_df = pd.DataFrame(train24_data)
train24_label_s = pd.Series(train24_label)
train24_data_df.corrwith(train24_label_s).sort_values(ascending=False)

7    0.403913
1    0.402456
3    0.401028
4    0.399484
5    0.398598
6    0.398511
2    0.393455
0    0.384984
dtype: float64

In [7]:
# 12 Training - Create Dataframe objects and compute correlations
train12_data_df = pd.DataFrame(train12_data)
train12_label_s = pd.Series(train12_label)
train12_data_df.corrwith(train12_label_s).sort_values(ascending=False)

3    0.438862
4    0.433498
5    0.427949
2    0.427299
7    0.427131
1    0.427009
6    0.420868
0    0.418129
dtype: float64

In [8]:
# 6 Training - Create Dataframe objects and compute correlations
train6_data_df = pd.DataFrame(train6_data)
train6_label_s = pd.Series(train6_label)
train6_data_df.corrwith(train6_label_s).sort_values(ascending=False)

2    0.514594
4    0.511082
1    0.507874
7    0.502933
5    0.502684
3    0.501474
6    0.497456
0    0.494404
dtype: float64

Correlation between features seems to be about equal for predictions with training and test splits

### Tuning KNeighbors Classifier

In [9]:
# Creating KNeighborsClassifier Pipeline
KneighC = Pipeline([('scaler',StandardScaler()),('knc', KNeighborsClassifier())])

# Finding the best parameters for KNeighborsClassifier

## Create parameter grid for gridsearch algorithm
knc_param_Grid=  { 'knc__n_neighbors': np.arange(2,10,1),
                  'knc__leaf_size': np.arange(2,30,2),
                 }

## Create Grid search
knc_grid_search = GridSearchCV(KneighC,
                               param_grid=knc_param_Grid,
                               scoring='accuracy',
                               refit=True,
                               cv=5,
                               verbose=1)

ii=0
### Fit to training data
KNC_trainscores=[]
KNC_testscores=[]
while ii < 20:

    # Splits for each dataset
    train24_data, train24_label, test24_data, test24_label = train_test_split(24,1,csv_path)
    train12_data, train12_label, test12_data, test12_label = train_test_split(12,1,csv_path)
    train6_data, train6_label, test6_data, test6_label = train_test_split(6,1,csv_path)

    # Create lists of training sets to parse
    training_sets = [train24_data, train12_data, train6_data]
    training_labels = [train24_label, train12_label, train6_label]
    
    # Create lists of test sets to parse
    test_sets = [test24_data, test12_data, test6_data]
    test_labels =[test24_label, test12_label, test6_label]
    
    Sets = ["-----24 Sample Training Set-----","-----12 Sample Training Set-----", "-----6 Sample Training Set-----"]
    t_sets = ["-----24 Sample Test Set-----","-----12 Sample Test Set-----", "-----6 Sample Test Set-----"]
    
    #Iterater
    ii = ii + 1
    
    # Loop responsible for parsing training sets and training labels
    for (a,i,j,m,n) in zip(Sets, training_sets,training_labels, test_sets, test_labels):
        print("\n",a,"\n")
        knc_grid_search.fit(i,j)
        ### Print Best parameters
        print("\n",knc_grid_search.best_params_)
        ## Saving the tuned model
        model = knc_grid_search.best_estimator_
        print("Best Accuracy: ",knc_grid_search.best_score_)
        KNC_trainscores+= [knc_grid_search.best_score_]
        y_pred = model.predict(m)
        KNC_testscores += [accuracy_score(n,y_pred)]

Training chips num:  [17 15 32 13 10 16  7 24  8 18  5 33 26 23  3 19  6 11 14 27 28 31 12  4]
Type selection: ['TI', 'TF', 'TI', 'TI', 'TI', 'TI', 'TF', 'TF', 'TF', 'TF', 'TF', 'TF', 'TF', 'TF', 'TF', 'TI', 'TI', 'TF', 'TF', 'TI', 'TI', 'TI', 'TI', 'TI']
TI: 12
TF: 12
Training chips num:  [ 7 17 32 23 21 25 26  6  2 10 19 12]
Type selection: ['TI', 'TI', 'TF', 'TI', 'TI', 'TF', 'TF', 'TI', 'TF', 'TF', 'TI', 'TF']
TI: 6
TF: 6
Training chips num:  [ 6 13 12  9 22 27]
Type selection: ['TF', 'TI', 'TI', 'TI', 'TF', 'TF']
TI: 3
TF: 3

 -----24 Sample Training Set----- 

Fitting 5 folds for each of 112 candidates, totalling 560 fits

 {'knc__leaf_size': 2, 'knc__n_neighbors': 2}
Best Accuracy:  0.9166666666666667

 -----12 Sample Training Set----- 

Fitting 5 folds for each of 112 candidates, totalling 560 fits

 {'knc__leaf_size': 2, 'knc__n_neighbors': 2}
Best Accuracy:  1.0

 -----6 Sample Training Set----- 

Fitting 5 folds for each of 112 candidates, totalling 560 fits

 {'knc__leaf_si

### Results for KneighborsClassifier

In [10]:
# Slicing training scores into individual lists
KNC_24_train_scores = KNC_trainscores[0::3]
KNC_12_train_scores= KNC_trainscores[1::3]
KNC_6_train_scores = KNC_trainscores[2::3]

# Slicing test scores into individual lists
KNC_24_test_scores = KNC_testscores[0::3]
KNC_12_test_scores = KNC_testscores[1::3]
KNC_6_test_scores = KNC_testscores[2::3]

print("---------- Kneighbors Accuracy Scores----------")

# Accuracy scores for test and training sets
train24_avg = mean(KNC_24_train_scores)
test24_avg = mean(KNC_24_test_scores)
print("TRAIN24 Average Accuracy: ", train24_avg)
print("Test24 Average Accruacy: " , test24_avg)

# Accuracy scores for test and training sets
train12_avg = mean(KNC_12_train_scores)
test12_avg = mean(KNC_12_test_scores)
print("Train12 Average Accuracy: ", train12_avg)
print("Test12 Average Accruacy: " , test12_avg)

# Accuracy Scores for test and training sets
train6_avg = mean(KNC_6_train_scores)
test6_avg = mean(KNC_6_test_scores)
print("TRAIN6 Average Accuracy: ", train6_avg)
print("Test6 Average Accruacy: " , test6_avg)

---------- Kneighbors Accuracy Scores----------
TRAIN24 Average Accuracy:  0.9420000000000001
Test24 Average Accruacy:  0.9279999999999999
Train12 Average Accuracy:  0.9413333333333334
Test12 Average Accruacy:  0.9139047619047619
TRAIN6 Average Accuracy:  0.9686666666666667
Test6 Average Accruacy:  0.9137777777777778


### Tuning Logistic Regression

In [15]:
# Create Logisitic Regression Model
Logreg = Pipeline([('scaler',StandardScaler()),('logreg',LogisticRegression(random_state=42))])
Logreg

# Finding the best parameters for Logistic Regression

## Create Parameter Grid
logreg_param_grid = {'logreg__C': [0.001,0.01,0.1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
                     'logreg__tol': [0.0001, 0.001, 0.01]
                    }

## Create Log Reg Gridsearch
logreg_gridsearch = GridSearchCV(Logreg,
                                 param_grid=logreg_param_grid,
                                 scoring='accuracy',
                                 refit=True,
                                 cv=5,
                                 verbose=1)


ii=0
### Fit to training data
logreg_trainscores=[]
logreg_testscores=[]
while ii < 20:

    # Splits for each dataset
    train24_data, train24_label, test24_data, test24_label = train_test_split(24,1,csv_path)
    train12_data, train12_label, test12_data, test12_label = train_test_split(12,1,csv_path)
    train6_data, train6_label, test6_data, test6_label = train_test_split(6,1,csv_path)

    # Create lists of training sets to parse
    training_sets = [train24_data, train12_data, train6_data]
    training_labels = [train24_label, train12_label, train6_label]
    
    # Create lists of test sets to parse
    test_sets = [test24_data, test12_data, test6_data]
    test_labels =[test24_label, test12_label, test6_label]
    
    Sets = ["-----24 Sample Training Set-----","-----12 Sample Training Set-----", "-----6 Sample Training Set-----"]
    t_sets = ["-----24 Sample Test Set-----","-----12 Sample Test Set-----", "-----6 Sample Test Set-----"]
    
    #Iterater
    ii = ii + 1
    
    # Loop responsible for parsing training sets and training labels
    for (a,i,j,m,n) in zip(Sets, training_sets,training_labels, test_sets, test_labels):
        print("\n",a,"\n")
        logreg_gridsearch.fit(i,j)
        ### Print Best parameters
        print("\n",logreg_gridsearch.best_params_)
        ## Saving the tuned model
        model = logreg_gridsearch.best_estimator_
        print("Best Accuracy: ",logreg_gridsearch.best_score_)
        logreg_trainscores+= [logreg_gridsearch.best_score_]
        y_pred = model.predict(m)
        logreg_testscores += [accuracy_score(n,y_pred)]

Training chips num:  [22 28 30 31 11 25 12  6  7  5  2 10 27 33  8  3 29 24 26 18 19 17 15  1]
Type selection: ['TF', 'TI', 'TI', 'TI', 'TF', 'TI', 'TF', 'TF', 'TF', 'TI', 'TF', 'TI', 'TF', 'TI', 'TI', 'TF', 'TI', 'TF', 'TF', 'TI', 'TI', 'TF', 'TF', 'TI']
TI: 12
TF: 12
Training chips num:  [10  3  1 21 15  7 23 31 14 24  8 33]
Type selection: ['TF', 'TF', 'TF', 'TI', 'TI', 'TI', 'TI', 'TI', 'TF', 'TI', 'TF', 'TF']
TI: 6
TF: 6
Training chips num:  [ 4 20 10 23 27 11]
Type selection: ['TF', 'TF', 'TI', 'TI', 'TI', 'TF']
TI: 3
TF: 3

 -----24 Sample Training Set----- 

Fitting 5 folds for each of 39 candidates, totalling 195 fits

 {'logreg__C': 0.1, 'logreg__tol': 0.0001}
Best Accuracy:  0.9233333333333335

 -----12 Sample Training Set----- 

Fitting 5 folds for each of 39 candidates, totalling 195 fits

 {'logreg__C': 0.001, 'logreg__tol': 0.0001}
Best Accuracy:  0.9199999999999999

 -----6 Sample Training Set----- 

Fitting 5 folds for each of 39 candidates, totalling 195 fits

 {'logr

In [16]:
logreg_gridsearch.best_score_

0.9733333333333334

### Log Reg Results

In [17]:
# Slicing training scores into individual lists
logreg_24_train_scores = logreg_trainscores[0::3]
logreg_12_train_scores= logreg_trainscores[1::3]
logreg_6_train_scores = logreg_trainscores[2::3]

# Slicing test scores into individual lists
logreg_24_test_scores = logreg_testscores[0::3]
logreg_12_test_scores = logreg_testscores[1::3]
logreg_6_test_scores = logreg_testscores[2::3]

print("---------- Logistic Regression Accuracy Scores----------")

# Accuracy scores for test and training sets
train24_avg = mean(logreg_24_train_scores)
test24_avg = mean(logreg_24_test_scores)
print("TRAIN24 Average Accuracy: ", train24_avg)
print("Test24 Average Accruacy: " , test24_avg)

# Accuracy scores for test and training sets
train12_avg = mean(logreg_12_train_scores)
test12_avg = mean(logreg_12_test_scores)
print("Train12 Average Accuracy: ", train12_avg)
print("Test12 Average Accruacy: " , test12_avg)

# Accuracy Scores for test and training sets
train6_avg = mean(logreg_6_train_scores)
test6_avg = mean(logreg_6_test_scores)
print("TRAIN6 Average Accuracy: ", train6_avg)
print("Test6 Average Accruacy: " , test6_avg)

---------- Logistic Regression Accuracy Scores----------
TRAIN24 Average Accuracy:  0.9301666666666666
Test24 Average Accruacy:  0.9133333333333333
Train12 Average Accuracy:  0.937
Test12 Average Accruacy:  0.9124761904761906
TRAIN6 Average Accuracy:  0.9560000000000001
Test6 Average Accruacy:  0.914


## CASE 3

### Case 3 Datasets

### KMeans

In [13]:
# Create Kmeans Model
Kmeans = Pipeline([('scaler', StandardScaler()),('kmeans',KMeans())])
Kmeans

### DBSCAN

In [14]:
# Create DBSCAN Model
dbscan = Pipeline([('scaler',StandardScaler()),('dbscan', DBSCAN())])
dbscan