In [1]:
#Import libraries 
import numpy as np
import math 
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt 
import time
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
#Load USPS Dataset
import h5py
path="usps.h5"
with h5py.File(path, 'r') as usps_data:
    train = usps_data.get('train')
    x_train = train.get('data')[:]
    y_train = train.get('target')[:]
    test = usps_data.get('test')
    x_test = test.get('data')[:]
    y_test = test.get('target')[:]

In [3]:
x_train

array([[0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.1645, 0.086 , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ]],
      dtype=float32)

In [4]:
y_train

array([6, 5, 4, ..., 3, 0, 1])

In [5]:
x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [6]:
y_test

array([9, 6, 3, ..., 4, 0, 1])

In [7]:
#Concatenate training and test set
usps_x=np.row_stack((x_train,x_test))
usps_y=np.append(y_train,y_test)
print('usps_x: ' + str(usps_x.shape)) #Total size of the dataset when we concatenate both training and test set
print('usps_y: ' + str(usps_y.shape))

usps_x: (9298, 256)
usps_y: (9298,)


In [8]:
#Initially complete dataset split into 2,the training set size=80% and the rest are test set
x_train, x_test, y_train, y_test = train_test_split(usps_x, usps_y, random_state=0,train_size=0.80)
print('X_train: ' + str(x_train.shape))
print('Y_train: ' + str(y_train.shape))
print('X_test:  '  + str(x_test.shape))
print('Y_test:  '  + str(y_test.shape))
print("Training  Size:", len(x_train))
print("Test dataset Size:",len(x_test))

X_train: (7438, 256)
Y_train: (7438,)
X_test:  (1860, 256)
Y_test:  (1860,)
Training  Size: 7438
Test dataset Size: 1860


In [9]:
def set_variables(x_train,y_train,x_test,y_test,k): #Function to initialze the variables
    nn_model = NearestNeighbors(n_neighbors=k+1) # Initialize nearest neighbour model
    score=np.zeros(len(x_train)+1)
    p_values=np.zeros((len(x_test),len(set(y_train))))
    pred=np.zeros(len(x_test))
    conf=np.zeros(len(x_test))
    cred=np.zeros(len(x_test))
    sum_p=0
    return nn_model, score, p_values, pred, conf, cred, sum_p

In [10]:
def calculate_score(extend_train_x,extend_train_y,new_data_train,new_target_train,nn_model,n): #function to calcuate non conformity score
    same_dist_train=extend_train_x[extend_train_y==n]
    same_dist_cal=new_data_train[new_target_train==n]
    nn_model.fit(same_dist_train)
    ncs_same=nn_model.kneighbors(same_dist_cal)[0][:,1]
    diff_dist_train=extend_train_x[extend_train_y!=n]
    nn_model.fit(diff_dist_train)
    ncs_diff=nn_model.kneighbors(same_dist_cal)[0][:,0]
    result_score=ncs_same/ncs_diff
    return result_score,nn_model

 Implement 1NN Transductive conformal Prediction Algorithm

In [12]:
 def tcp(x_train,y_train,x_test,y_test,k):
    #Set variables
    nn_model, score, p_values, pred, conf, cred, sum_p = \
    set_variables(x_train,y_train,x_test,y_test,k)
    #Loop through test samples
    for i in range(len(x_test)):
        extend_train_x=np.row_stack((x_train,x_test[i])) #Create new dataset which is the training set + the test sample
        for j in set(y_train): #Test all possible labels
            extend_train_y=np.append(y_train,j)#Extend labels with test sample
            conf_scores=[]
            new_x_train=extend_train_x[:len(extend_train_x)-1]
            new_y_train=extend_train_y[:len(extend_train_y)-1]
            for n in range(10):
                # calling Function calculate scores
                result_score,nn_model=\
                calculate_score(extend_train_x,extend_train_y,new_x_train,new_y_train,nn_model,n)
                conf_scores.extend(result_score)
            data_sample=extend_train_x[-1]
            target_sample=extend_train_y[-1]
            nn_model.fit(extend_train_x[extend_train_y==target_sample])# Fit the model
            same_dist_test=nn_model.kneighbors([data_sample])[0][0][1]
            nn_model.fit(extend_train_x[extend_train_y!=target_sample])
            diff_dist_test=np.sum(nn_model.kneighbors([data_sample])[0][0][0])
            conf_scores.append(same_dist_test/diff_dist_test)# cacluate non conformity score of test set
            p_values[i][j]=np.mean(conf_scores>=conf_scores[-1])#Calculate p-values of test sample
        pred[i]=int(np.argmax(p_values[i]))#Use p-values of test sample to calculate various measures calculate prediction
        conf[i]=1- p_values[i][np.argsort(p_values[i])[-2]] #calculate Confidence
        cred[i]=np.max(p_values[i])#calculate credibility
        sum_p = sum_p + np.sum(p_values[i]) - p_values[i][y_test[i]]#calculate sum of p_value
    false_p_value=sum_p/(2*len(x_test))#calculate false p_value
    return pred, conf, cred, false_p_value

In [15]:
start_time = time.time()
pred, conf, cred, false_p_value = \
tcp(x_train,y_train,x_test[0:10],y_test[0:10],1 )
print("Total run time %s seconds " % (time.time() - start_time))


Total run time 3632.5689861774445 seconds 


In [16]:
print(" Accuracy: ", np.mean(pred==y_test[0:10]))
print("Avg False p-value: ", false_p_value)
print("Confidence: ",np.mean(conf))
print("Credibility: ",np.mean(cred))

 Accuracy:  0.9
Avg False p-value:  0.011090200295738683
Confidence:  0.99853474929426
Credibility:  0.4774297620647937
