In [1]:
#Import libraries 
import numpy as np
import math 
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt 
import time
from sklearn.model_selection import train_test_split
%matplotlib inline



#Split data into training set proper, calibration set and test set
data_train_p, data_cal, target_train_p, target_cal = train_test_split(data_train, target_train, random_state=0)

print("Training Proper Size:", len(data_train_p))
print("Calibration Set Size:",len(data_cal))
print("Test Set Size:",len(data_test))

Training Proper Size: 5468
Calibration Set Size: 1823
Test Set Size: 2007


In [None]:
#Load USPS Dataset
import h5py
path="usps.h5"
with h5py.File(path, 'r') as usps_data:
    train = usps_data.get('train')
    x_train = train.get('data')[:]
    target_train = train.get('target')[:]
    test = usps_data.get('test')
    data_test = test.get('data')[:]
    target_test = test.get('target')[:]

In [2]:
def set_variables(data_train,target_train,data_test,target_test,k):
    neighbor_data = NearestNeighbors(n_neighbors=k+1)
    score=np.zeros(len(data_train)+1)
    p_values=np.zeros((len(data_test),len(set(target_train))))
    prediction=np.zeros(len(data_test))
    confidence=np.zeros(len(data_test))
    credibility=np.zeros(len(data_test))
    sum_p=0
    return neighbor_data, score, p_values, prediction, confidence, credibility, sum_p

In [3]:
def calculate_score(extend_train_x,extend_train_y,new_data_train,new_target_train,neighbor_data,n):
    same_sample_1=extend_train_x[extend_train_y==n]
    same_sample_2=new_data_train[new_target_train==n]
    neighbor_data.fit(same_sample_1)
    same_scores=neighbor_data.kneighbors(same_sample_2)[0][:,1]
    diff_sample_1=extend_train_x[extend_train_y!=n]
    neighbor_data.fit(diff_sample_1)
    diff_scores=neighbor_data.kneighbors(same_sample_2)[0][:,0]
    result_score=same_scores/diff_scores
    return result_score,neighbor_data

# 1NN Transductive Point Prediction Algorithm

In [4]:
 def knn_conformal_predictor(data_train,target_train,data_test,target_test,k):
    #Set variables
    neighbor_data, score, p_values, prediction, confidence, credibility, sum_p = \
    set_variables(data_train,target_train,data_test,target_test,k)
    #Loop through test samples
    for i in range(len(data_test)):
        #Create new dataset which is the training set + the test sample
        extend_train_x=np.row_stack((data_train,data_test[i]))
        for j in set(target_train): #Test all possible labels
            #Extend labels with test sample
            extend_train_y=np.append(target_train,j)
            conf_scores=[]
            new_data_train=extend_train_x[:len(extend_train_x)-1]
            new_target_train=extend_train_y[:len(extend_train_y)-1]
            for n in range(10):
                result_score,neighbor_data=\
                calculate_score(extend_train_x,extend_train_y,new_data_train,new_target_train,neighbor_data,n)
                conf_scores.extend(result_score)
            data_sample=extend_train_x[-1]
            target_sample=extend_train_y[-1]
            neighbor_data.fit(extend_train_x[extend_train_y==target_sample])
            NN_dist_s=neighbor_data.kneighbors([data_sample])[0][0][1]
            neighbor_data.fit(extend_train_x[extend_train_y!=target_sample])
            NN_dist_d=np.sum(neighbor_data.kneighbors([data_sample])[0][0][0])
            conf_scores.append(NN_dist_s/NN_dist_d)
            #Calculate p-values of test sample
            p_values[i][j]=np.mean(conf_scores>=conf_scores[-1])
        #Use p-values of test sample to calculate various measures
        prediction[i]=int(np.argmax(p_values[i]))
        confidence[i]=1- p_values[i][np.argsort(p_values[i])[-2]]
        credibility[i]=np.max(p_values[i])
        sum_p = sum_p + np.sum(p_values[i]) - p_values[i][target_test[i]]
    false_p_value=sum_p/(2*len(data_test))
    return prediction, confidence, credibility, false_p_value

In [5]:
start_time = time.time()
prediction, confidence, credibility, false_p_value = \
knn_conformal_predictor(data_train,target_train,data_test[0:10],target_test[0:10],1 )
print("--- %s seconds ---" % (time.time() - start_time))

--- 135.52344703674316 seconds ---


In [6]:
print("Linear SVM Accuracy: ", np.mean(prediction==target_test[0:10]))
print("Linear SVM Avg False p-value: ", false_p_value)
print("Linear Avg Confidence: ",np.mean(confidence))
print("Linear Avg Credibility: ",np.mean(credibility))

Linear SVM Accuracy:  1.0
Linear SVM Avg False p-value:  0.003065002742731776
Linear Avg Confidence:  0.9981486560614371
Linear Avg Credibility:  0.5711327482172244
