### Conformalized quantile regression (CQR)

The link to the original paper
http://papers.neurips.cc/paper/8613-conformalized-quantile-regression.pdf

We use the package at the following link for the experiments:

https://github.com/yromano/cqr

Note that we use CQR here only to evaluate the median estimate, the calibration, and the 90% interval.

We use the random forest specification of the model with symmetric intervals and the recommended hyperparameters from the package example.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import numpy as np
# NOTE: the former `np.warnings.filterwarnings('ignore')` call was dropped here.
# `np.warnings` was only an accidental alias of the stdlib `warnings` module and
# no longer exists in NumPy >= 1.24 (AttributeError); the stdlib filter installed
# at the top of this cell already silences the same warnings.

import torch

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from cqr import helper
from nonconformist.nc import RegressorNc
from nonconformist.cp import IcpRegressor
from nonconformist.nc import QuantileRegErrFunc

%matplotlib inline

### Simulate data

In [2]:
# Fix the global NumPy seed so the simulated data set is reproducible
np.random.seed(2020)

# Per-observation location (Gaussian) and scale (uniform on [0.5, 2.5])
mu = np.random.normal(scale=2, size=1000)
sig = np.random.uniform(0.5, 2.5, size=1000)

# Design matrix: column 0 holds mu, column 1 holds sig
x = np.column_stack((mu, sig))

# Response: one Gaussian draw per (mu, sig) pair
y = np.random.normal(loc=mu, scale=sig, size=(1000))

# The first 700 rows form the training set, the remaining rows the test set
train_x, test_x = x[:700, :], x[700:, :]
train_y, test_y = y[:700], y[700:]

# Sample sizes and input dimension
n_train, n_test = len(train_x), len(test_x)
in_shape = x.shape[1]

# Report the (n, p) shape of both sets
print("Dimensions: train set (n=%d, p=%d) ; test set (n=%d, p=%d)" %
      (train_x.shape + test_x.shape))

# Randomly split the training rows into two equally sized, disjoint index
# sets: the proper training set and the calibration set
idx = np.random.permutation(n_train)
n_half = n_train // 2
idx_train = idx[:n_half]
idx_cal = idx[n_half:2 * n_half]

Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)


### CQR model

In [3]:
#########################################################
# Quantile random forest (QRF) settings
# (see the QuantileForestRegressorAdapter class in helper.py)
#########################################################

# Forest size: number of trees
n_estimators = 1000

# Minimum number of samples required at a leaf node
# (default skgarden's parameter)
min_samples_leaf = 1

# Number of features considered when looking for the best split
# (default skgarden's parameter); here, all columns of the design matrix
max_features = x.shape[1]

# Tune the two QRF quantile levels by cross-validation?
cv_qforest = True

# While tuning, one may target a band with smaller average coverage,
# coverage_factor*(quantiles[1] - quantiles[0]), to avoid an overly
# conservative prediction band
coverage_factor = 0.85

# Fraction of data held out during cross-validation
cv_test_ratio = 0.05

# Seed for the cross-validation split; also used as the seed of the
# quantile random forest
cv_random_state = 1

# Half-width of the search range for the quantile levels during tuning:
# the smallest candidate is quantiles[0] - range_vals and the largest is
# quantiles[1] + range_vals
cv_range_vals = 30

# Number of grid points swept when tuning the QRF quantile levels
cv_num_vals = 10

# Bundle everything into the parameter dictionary handed to the QRF adapter
params_qforest = {
    "n_estimators": n_estimators,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
    "CV": cv_qforest,
    "coverage_factor": coverage_factor,
    "test_ratio": cv_test_ratio,
    "random_state": cv_random_state,
    "range_vals": cv_range_vals,
    "num_vals": cv_num_vals,
}


### Single Evaluation

In [4]:
def cqr_lu(level):
    """Return the CQR lower and upper bounds on the test set for one coverage level.

    The data (`train_x`, `train_y`, `test_x`), the index split (`idx_train`,
    `idx_cal`) and the forest settings (`params_qforest`) are read from module
    scope when the function is called.

    Parameters
    ----------
    level : float
        Nominal coverage of the central interval; the targeted quantile
        levels are 0.5 - level/2 and 0.5 + level/2.

    Returns
    -------
    tuple
        `(y_lower, y_upper)` as produced by `helper.run_icp`.
    """
    # Target quantile levels, expressed in percent for the forest adapter
    lower_pct = (0.5 - level / 2) * 100
    upper_pct = (0.5 + level / 2) * 100

    # Quantile random forest wrapped in the package's adapter
    forest = helper.QuantileForestRegressorAdapter(
        model=None,
        fit_params=None,
        quantiles=[lower_pct, upper_pct],
        params=params_qforest,
    )

    # Nonconformity object that defines the CQR procedure
    nonconformity = RegressorNc(forest, QuantileRegErrFunc())

    # Desired miscoverage error
    miscoverage = 1 - level

    # Run the CQR procedure
    y_lower, y_upper = helper.run_icp(
        nonconformity, train_x, train_y, test_x, idx_train, idx_cal, miscoverage
    )
    return y_lower, y_upper



#### Interval Width

In [None]:
# 90% CQR interval on the test set
lquantile, rquantile = cqr_lu(0.9)

# Empirical coverage of that interval. Jupyter only echoes the *last*
# expression of a cell, so the value has to be printed explicitly here.
coverage90 = np.mean((test_y.ravel() < rquantile.ravel()) * (test_y.ravel() > lquantile.ravel()))
print("Empirical coverage of the 90% interval:", coverage90)

# y was simulated as N(mu, sig^2) with sig stored in test_x[:, 1], so the exact
# central 90% interval has width 2 * z_0.95 * sig, with z_0.95 ~= 1.645
Z_95 = 1.645
ideal_width = 2 * Z_95 * test_x[:, 1]

# Estimated width against exact width; points on the diagonal are perfect
plt.figure(figsize=(5, 5))
plt.plot(rquantile - lquantile, ideal_width, '.')
plt.plot([2, 9], [2, 9])
plt.xlabel('CQR 90% interval width')
plt.ylabel('Exact 90% interval width')
#np.save('cqr_width',rquantile-lquantile)

### Ten Replications to evaluate the hard metrics

In [5]:
##function to create replication
def rep_iter(x,y,frac=0.3):
    """Randomly split `x` and `y` into a training part and a test part.

    A single permutation of the row indices is drawn from NumPy's global
    random state; the first floor(frac * n) shuffled rows become the test
    part and the remaining rows the training part.

    Parameters
    ----------
    x, y : arrays indexable by an integer index array, with matching rows
    frac : float, fraction of rows assigned to the test part

    Returns
    -------
    tuple
        `(x_train, y_train, x_test, y_test)`
    """
    n_obs = len(x)
    n_holdout = int(np.floor(frac * n_obs))
    shuffled = np.random.permutation(n_obs)
    holdout_rows, fit_rows = shuffled[:n_holdout], shuffled[n_holdout:]
    return x[fit_rows], y[fit_rows], x[holdout_rows], y[holdout_rows]
    

In [6]:
# Per-replication accumulators: calibration error, 90% coverage, and MAE
cqrcal, cqr90, cqrmae = [], [], []

In [7]:
np.random.seed(2021)

# Start from empty accumulators so that re-running this cell does not append a
# second batch of replications to results left over from an earlier run.
cqrcal, cqr90, cqrmae = [], [], []

# Nominal coverage levels used to study calibration (same for every replication)
per = np.linspace(0.02, 0.98, 8)

for a in range(10):
    # Fresh random 70/30 train/test split of the simulated data
    train_x, train_y, test_x, test_y = rep_iter(x, y)

    # compute input dimensions
    n_train = len(train_x)
    n_test = len(test_x)
    in_shape = train_x.shape[1]

    # display basic information
    print("Dimensions: train set (n=%d, p=%d) ; test set (n=%d, p=%d)" %
          (train_x.shape[0], train_x.shape[1], test_x.shape[0], test_x.shape[1]))

    # divide the data into proper training set and calibration set
    idx = np.random.permutation(n_train)
    n_half = int(np.floor(n_train / 2))
    idx_train, idx_cal = idx[:n_half], idx[n_half:2 * n_half]

    # `cqr_lu` (defined once in the "Single Evaluation" cell) looks up train_x,
    # train_y, test_x, idx_train and idx_cal in module scope at call time, so
    # it automatically uses this replication's split; redefining an identical
    # copy on every iteration is unnecessary.

    ##### calculate metrics #####

    # Calibration: mean absolute gap between empirical and nominal coverage
    cqrc = []
    for i in per:
        lquantile, rquantile = cqr_lu(i)
        cqrc.append(np.mean((test_y < rquantile) * (test_y > lquantile)))

    cqrcal.append(np.abs(np.asarray(cqrc) - per).mean())

    # Ninety: empirical coverage of the 90% interval
    lquantile90, rquantile90 = cqr_lu(0.9)
    cqr90.append(np.mean((test_y < rquantile90) * (test_y > lquantile90)))

    # MAE: the midpoint of the narrow 2% interval (49th to 51st percentile)
    # serves as the median estimate; the error is measured against
    # test_x[:, 0], the mu column that was used as the mean when simulating y.
    lquantile50, rquantile50 = cqr_lu(0.02)
    cqrmae.append(np.abs((lquantile50 + rquantile50) / 2. - test_x[:, 0]).mean())




Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)
Dimensions: train set (n=700, p=2) ; test set (n=300, p=2)


In [11]:
def musd(x):
    """Print the mean and the standard deviation (NumPy default) of `x` on one line."""
    center = np.mean(x)
    spread = np.std(x)
    print(center, spread)

# Mean and standard deviation across replications, in this order:
# calibration error, 90% coverage, MAE
for metric in (cqrcal, cqr90, cqrmae):
    musd(metric)



0.01936309523809523 0.01021831734454072
0.8960000000000001 0.018726095873584165
0.7282353703063421 0.07346460594281316
