# Load & Process the data

### Load in dataset

In [None]:
def load_data(file_path):
    """Load a comma-separated ranking file and group its rows by query.

    Every non-blank line must hold at least four comma-separated fields in
    the order ``query_id, doc_id, label, score``; further fields are ignored.
    Blank lines are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A pair ``(data, all_scores)``. ``data`` maps each query id (kept as a
        string) to a list of ``(doc_id, label, score)`` tuples sorted by
        descending score. ``all_scores`` holds every score in file order.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If a doc id, label or score is not a valid number.
    """
    data = {}
    all_scores = []
    with open(file_path) as f:
        for line in f:
            line = line.strip()
            # A blank line (e.g. an empty line at the end of the file) carries
            # no record; skip it instead of failing on the missing fields.
            if not line:
                continue
            line_data = line.split(',')
            query_id = line_data[0]
            doc_id = int(line_data[1])
            label = int(line_data[2])
            score = float(line_data[3])
            all_scores.append(score)
            data.setdefault(query_id, []).append((doc_id, label, score))

    # Arrange each query's documents in descending order of score. The sort
    # is stable, so documents with equal scores keep their file order.
    for docs in data.values():
        docs.sort(key=lambda x: x[2], reverse=True)

    return data, all_scores

# l1_data, all_scores = load_data(l1_file_name)

In [None]:
# Hard-coded, machine-specific path of the L1 score file (MS MARCO dev set).
l1_file_name = "/Volumes/Seagate/Data/MSMARCO/QuestionAnswering/dev_l1.txt"
# all_scores keeps every raw L1 score; it is used below to rescale the L1 scores.
l1_data, all_scores = load_data(l1_file_name)

# Hard-coded, machine-specific path of the L2 score file.
l2_file_name = "/Volumes/Seagate/Data/MSMARCO/QuestionAnswering/dev_l2.txt"
# The flat list of L2 scores is not needed, so it is discarded.
l2_data, _ = load_data(l2_file_name)

### Convert score to [0, 1]

In [None]:
# Mean of all raw L1 scores, used as the scale for the tanh squashing below.
# Raises ZeroDivisionError if all_scores is empty.
total_score_sum = 0 
for i in all_scores:
    total_score_sum += i
average_score = total_score_sum / len(all_scores)

import math
# Replace each raw L1 score by tanh(score / mean score); doc id and label are
# kept unchanged and the per-query list order is preserved.
# NOTE(review): the result lies in [0, 1) only if the raw scores are
# non-negative and the mean is positive -- confirm against the data.
for query_id in l1_data.keys():
    l1_data[query_id] = [(d[0], d[1], math.tanh(d[2]/average_score)) for d in l1_data[query_id]]
    
# Rescale each L2 score by a fixed factor of 1/100.
# NOTE(review): presumably the raw L2 scores are on a 0-100 scale -- confirm.
for query_id in l2_data.keys():
    l2_data[query_id] = [(d[0], d[1], (d[2]/100)) for d in l2_data[query_id]]

### Split to validation and test

In [None]:
import random

def split_query_ids(data, split_ratio=0.5):
    """Randomly partition the keys of ``data`` into validation and test ids.

    ``int(len(data) * split_ratio)`` keys are drawn without replacement with
    the module-level ``random`` generator and become the validation ids;
    all remaining keys become the test ids.

    Args:
        data: Mapping whose keys are the query ids to split.
        split_ratio: Fraction of the keys assigned to the validation group.

    Returns:
        A pair ``(val_ids, test_ids)`` of lists. ``test_ids`` keeps the key
        order of ``data``; ``val_ids`` follows the iteration order of the
        sampled position set.
    """
    all_ids = list(data.keys())
    total = len(all_ids)
    val_count = int(total * split_ratio)
    # Sample positions rather than ids so membership tests below are on ints.
    val_positions = set(random.sample(range(total), val_count))
    val_ids = [all_ids[pos] for pos in val_positions]
    test_ids = [qid for pos, qid in enumerate(all_ids) if pos not in val_positions]
    return val_ids, test_ids

# Initial random split of the L1 query ids, half to validation and half to test.
val_ids, test_ids = split_query_ids(l1_data, 0.5)

### Zip data

In [None]:
def zip_data(l1_data, l2_data, ids):
    """Pair the L1 and L2 entries of every requested query id.

    Args:
        l1_data: Mapping from query id to its L1 entry.
        l2_data: Mapping from query id to its L2 entry.
        ids: Iterable of query ids to include.

    Returns:
        A dict mapping each id in ``ids`` to ``(l1_data[id], l2_data[id])``.
        The entries are the same objects as in the inputs, not copies.

    Raises:
        KeyError: If an id is missing from ``l1_data`` or ``l2_data``.
    """
    return {query_id: (l1_data[query_id], l2_data[query_id]) for query_id in ids}
# Pair each query's L1 and L2 document lists for the validation and test splits.
val_zipped_data = zip_data(l1_data, l2_data, val_ids)
test_zipped_data = zip_data(l1_data, l2_data, test_ids)

# Sanity-check peek at one test query. The split is random, so query '0' is
# in the test set only some of the time; fall back to any test query instead
# of raising KeyError when it landed in the validation set.
sample_query_id = '0' if '0' in test_zipped_data else next(iter(test_zipped_data), None)
if sample_query_id is not None:
    print(test_zipped_data[sample_query_id])

# Conformal Prediction

### loss function

In [None]:
def calc_l1_risk_for_query(docs_for_query, threshold):
    """Compute the missed-relevant fraction for one query at a score threshold.

    A document is fetched when its score (index 2) is at least ``threshold``.
    The loss is one minus the share of relevant documents (label >= 1) that
    were fetched.

    Args:
        docs_for_query: Iterable of ``(doc_id, label, score)`` records.
        threshold: Minimum score for a document to be fetched.

    Returns:
        A pair ``(loss, fetched_docs)`` where ``fetched_docs`` is the set of
        fetched document ids. When the query has no relevant document the
        loss is 1 regardless of the threshold.
    """
    relevant = get_ground_truth_above_l(docs_for_query, 1)
    fetched_docs = {doc[0] for doc in docs_for_query if doc[2] >= threshold}

    hits = len(relevant & fetched_docs)
    # Fall back to 1.0 so a query without relevant documents does not divide by zero.
    denominator = len(relevant) if relevant else 1.0
    return (1 - hits / denominator, fetched_docs)

def get_ground_truth_above_l(docs_for_query, relevance_level=1):
    """Return the ids of the documents labelled at or above a relevance level.

    Args:
        docs_for_query: Iterable of records with the document id at index 0
            and the label at index 1.
        relevance_level: Minimum label for a document to be included.

    Returns:
        The set of matching document ids (empty if none match).
    """
    return {doc[0] for doc in docs_for_query if doc[1] >= relevance_level}

# Spot check: ids of the documents labelled >= 1 for query '1'. The result is
# not stored; as the last expression of a notebook cell it is only displayed.
get_ground_truth_above_l(l1_data['1'], 1)

### retrieval lambda

In [None]:
def calc_retrieval_lambda(val_data, alpha):
    """Bisect for the L1 score threshold that meets the ``alpha`` loss budget.

    The search starts at 0.5 with a window of 0.5. Each round sums the L1
    loss of every validation query at the current threshold and compares it
    with ``(M + 1) * alpha - 1``, where ``M`` is the number of queries. A sum
    above the budget lowers the threshold by half the window, a sum below
    raises it, and an exact match stops the search. The window is halved
    after each move until it drops below 1e-5.

    Args:
        val_data: Mapping from query id to ``(l1_docs, l2_docs)``; only the
            L1 document list is used.
        alpha: Target risk level that sets the loss budget.

    Returns:
        The threshold reached when the search stops.

    NOTE(review): the threshold is moved once more after the last loss
    evaluation, so the returned value itself is never checked against the
    budget -- confirm this is acceptable for the intended guarantee.
    """
    num_queries = len(val_data)
    loss_budget = (num_queries + 1) * alpha - 1
    tolerance = 0.00001

    lambda_val = 0.5
    window = 0.5
    while window >= tolerance:
        total_loss = 0
        for docs_for_query, _ in val_data.values():
            total_loss += calc_l1_risk_for_query(docs_for_query, lambda_val)[0]

        if total_loss > loss_budget:
            direction = -1
        elif total_loss < loss_budget:
            direction = 1
        else:
            break
        # Halve the window, then move by exactly that half-window.
        window /= 2
        lambda_val += direction * window
    return lambda_val

## Get maximal lambda_K

In [None]:
def calc_l2_risk_for_query(l1_retrieved_docs, l2_ground_truth):
    """Return a discounted-gain style loss for one query.

    With ``D(n) = sum(1 / log2(i + 2) for i in range(n))``, the loss is
    ``1 - D(len(common)) / D(len(l2_ground_truth))`` where ``common`` is the
    set of ids present in both arguments. The value depends only on how many
    ids overlap, not on any ranking position.

    Args:
        l1_retrieved_docs: Iterable of retrieved document ids.
        l2_ground_truth: Sized iterable of relevant document ids.

    Returns:
        The integer 1 when no id is shared (including when either argument
        is empty); otherwise a float loss, 0.0 when every relevant id was
        retrieved.
    """
    def discount_total(count):
        return sum([1.0 / math.log2(rank + 2) for rank in range(count)])

    ideal = discount_total(len(l2_ground_truth))
    overlap = set(l1_retrieved_docs) & set(l2_ground_truth)
    if not overlap:
        # Also covers an empty ground truth, so the division below is safe.
        return 1
    return 1 - discount_total(len(overlap)) / ideal

In [None]:
def get_max_retrieval_lambda(val_data, beta, level=1):
    """Bisect for the L1 threshold that meets the ``beta`` budget on L2-style loss.

    For every validation query the documents fetched at the current L1
    threshold are scored with ``calc_l2_risk_for_query`` against the L1
    documents whose label is at least ``level``. The summed loss is compared
    with ``(M + 1) * beta - 1`` (``M`` = number of queries): above the budget
    the threshold is lowered by half the window, below it is raised, and an
    exact match stops the search. The search starts at 0.5 with a window of
    0.5, which is halved after each move until it drops below 0.0005.

    Args:
        val_data: Mapping from query id to ``(l1_docs, l2_docs)``; only the
            L1 document list is used.
        beta: Target risk level that sets the loss budget.
        level: Minimum label for a document to count as ground truth.

    Returns:
        A pair ``(lambda_val, alpha)``: the threshold reached and
        ``(total L1 loss at that threshold + 1) / (M + 1)``.
    """
    num_queries = len(val_data)
    loss_budget = (num_queries + 1) * beta - 1
    tolerance = 0.0005

    lambda_val = 0.5
    window = 0.5
    while window >= tolerance:
        total_l2_loss = 0
        for docs_for_query, _ in val_data.values():
            fetched = {doc[0] for doc in docs_for_query if doc[2] >= lambda_val}
            relevant = {doc[0] for doc in docs_for_query if doc[1] >= level}
            total_l2_loss += calc_l2_risk_for_query(fetched, relevant)

        if total_l2_loss > loss_budget:
            direction = -1
        elif total_l2_loss < loss_budget:
            direction = 1
        else:
            break
        # Halve the window, then move by exactly that half-window.
        window /= 2
        lambda_val += direction * window

    # L1 risk of the threshold that was found, with the same +1 correction.
    total_l1_loss = 0
    for docs_for_query, _ in val_data.values():
        total_l1_loss += calc_l1_risk_for_query(docs_for_query, lambda_val)[0]
    alpha = (total_l1_loss + 1) / (num_queries + 1)

    return lambda_val, alpha

# Example run on the current validation split with beta = 0.3: prints the
# L1 threshold found and the corresponding L1 risk level alpha.
max_l1_lambda, alpha = get_max_retrieval_lambda(val_zipped_data, 0.3)
print(max_l1_lambda)
print(alpha)

## Change beta but fix alpha

In [None]:
import numpy as np

# Experiment: sweep the L2 risk level beta while the L1 risk level alpha is
# fixed. For every beta the experiment is repeated on fresh random splits and
# the smallest average prediction size found on the test split is recorded.
iteration_times = 10
# Minimum label counted as relevant. This cell used `level` without defining
# it, so it only worked when another cell had already set it.
level = 1
precision = 0.0005
num_lambda_steps = 21

alpha = 0.3

summary_by_beta = {}

for beta in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]:
    for iteration in range(iteration_times):
        # Fresh random validation/test split for every repetition.
        val_ids, test_ids = split_query_ids(l1_data, 0.5)
        val_zipped_data = zip_data(l1_data, l2_data, val_ids)
        test_zipped_data = zip_data(l1_data, l2_data, test_ids)

        # The L1 grid ends at the smaller of the two thresholds found for the
        # alpha budget and for the beta budget.
        max_l1_lambda_1 = calc_retrieval_lambda(val_zipped_data, alpha)
        max_l1_lambda_2, _ = get_max_retrieval_lambda(val_zipped_data, beta)
        max_l1_lambda = min(max_l1_lambda_1, max_l1_lambda_2)

        # Per-split constants, hoisted out of the grid loop.
        M = len(val_zipped_data)
        threshold = (M + 1) * beta - 1
        # Test metrics are averaged over the test split, so they must be
        # normalised by its size (this previously used the validation size).
        M_test = len(test_zipped_data)

        lambda_grid = np.linspace(0, max_l1_lambda, num=num_lambda_steps)
        best_prediction_size = 10000000
        best_l1_size, best_l2_size = 0, 0
        best_alpha, best_beta = 0, 0
        for l1_lambda_val in lambda_grid:
            # Bisect the L2 threshold for this L1 threshold on the validation
            # split: start at 0.5 with a window of 0.5, halved each round.
            l2_lambda_val = 0.5
            delta = 0.5
            while delta >= precision:
                total_loss = 0
                for l1_docs_for_query, l2_docs_for_query in val_zipped_data.values():
                    l1_fetched_docs = {doc[0] for doc in l1_docs_for_query if doc[2] >= l1_lambda_val}
                    # A document survives only if it passed L1 and clears the L2 threshold.
                    l2_retained_docs = {doc[0] for doc in l2_docs_for_query
                                        if doc[2] >= l2_lambda_val and doc[0] in l1_fetched_docs}
                    l2_ground_truth_docs = {doc[0] for doc in l2_docs_for_query if doc[1] >= level}
                    total_loss += calc_l2_risk_for_query(l2_retained_docs, l2_ground_truth_docs)

                if total_loss > threshold:
                    l2_lambda_val -= delta / 2
                elif total_loss < threshold:
                    l2_lambda_val += delta / 2
                else:
                    break
                delta /= 2

            # Verify control on test data.
            total_l1_size, total_l2_size = 0, 0
            total_l1_loss, total_l2_loss = 0, 0
            for l1_docs_for_query, l2_docs_for_query in test_zipped_data.values():
                l1_fetched_docs = {doc[0] for doc in l1_docs_for_query if doc[2] >= l1_lambda_val}
                l1_ground_truth = {doc[0] for doc in l1_docs_for_query if doc[1] >= level}
                total_l1_size += len(l1_fetched_docs)
                # Divide by 1.0 when nothing is relevant; the loss is then 1.
                l1_denominator = len(l1_ground_truth) if l1_ground_truth else 1.0
                total_l1_loss += 1 - len(l1_ground_truth & l1_fetched_docs) / l1_denominator
                l2_retained_docs = {doc[0] for doc in l2_docs_for_query
                                    if doc[2] >= l2_lambda_val and doc[0] in l1_fetched_docs}
                total_l2_size += len(l2_retained_docs)
                l2_ground_truth_docs = {doc[0] for doc in l2_docs_for_query if doc[1] >= level}
                total_l2_loss += calc_l2_risk_for_query(l2_retained_docs, l2_ground_truth_docs)

            avg_l1_loss = (total_l1_loss + 1) / (M_test + 1)
            avg_l2_loss = (total_l2_loss + 1) / (M_test + 1)
            avg_l1_size = total_l1_size / M_test
            avg_l2_size = total_l2_size / M_test
            prediction_size = avg_l1_size + avg_l2_size
            # Keep the grid point with the smallest combined prediction size.
            if prediction_size < best_prediction_size:
                best_prediction_size = prediction_size
                best_l1_size = avg_l1_size
                best_l2_size = avg_l2_size
                best_alpha = avg_l1_loss
                best_beta = avg_l2_loss

        print('{}:{}:{}:{}:{}:{}'.format(beta, best_prediction_size, best_l1_size, best_l2_size, best_alpha, best_beta))
        summary_by_beta.setdefault(beta, []).append(
            (best_prediction_size, best_l1_size, best_l2_size, best_alpha, best_beta))

# print(summary_by_beta[0.1])

In [None]:
# Average every recorded metric over the repetitions and print one line per
# beta as beta:prediction size:L1 size:L2 size:L1 loss:L2 loss.
for beta, results in summary_by_beta.items():
    # Use a dedicated name so the experiment setting `iteration_times` is not
    # overwritten, and unpack the records instead of rebinding the builtin
    # `tuple` at notebook scope.
    num_runs = len(results)
    total_pred_size, total_l1_size, total_l2_size, total_l1_loss, total_l2_loss = 0, 0, 0, 0, 0
    for pred_size, l1_size, l2_size, l1_loss, l2_loss in results:
        total_pred_size += pred_size
        total_l1_size += l1_size
        total_l2_size += l2_size
        total_l1_loss += l1_loss
        total_l2_loss += l2_loss
    print('{}:{}:{}:{}:{}:{}'.format(beta, total_pred_size / num_runs, total_l1_size / num_runs,
                                  total_l2_size / num_runs, total_l1_loss / num_runs, total_l2_loss / num_runs))

## Fix beta but change alpha

In [None]:
import numpy as np

# Experiment: sweep the L1 risk level alpha while the L2 risk level beta is
# fixed. For every alpha the experiment is repeated on fresh random splits
# and the smallest average prediction size found on the test split is recorded.
iteration_times = 100
level = 1  # minimum label counted as relevant
precision = 0.0005
num_lambda_steps = 21

beta = 0.2

summary_by_alpha = {}

for alpha in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:
    for iteration in range(iteration_times):
        # Fresh random validation/test split for every repetition.
        val_ids, test_ids = split_query_ids(l1_data, 0.5)
        val_zipped_data = zip_data(l1_data, l2_data, val_ids)
        test_zipped_data = zip_data(l1_data, l2_data, test_ids)

        # The L1 grid ends at the smaller of the two thresholds found for the
        # alpha budget and for the beta budget.
        max_l1_lambda_1 = calc_retrieval_lambda(val_zipped_data, alpha)
        max_l1_lambda_2, _ = get_max_retrieval_lambda(val_zipped_data, beta)
        max_l1_lambda = min(max_l1_lambda_1, max_l1_lambda_2)

        # Per-split constants, hoisted out of the grid loop.
        M = len(val_zipped_data)
        threshold = (M + 1) * beta - 1
        # Test metrics are averaged over the test split, so they must be
        # normalised by its size (this previously used the validation size).
        M_test = len(test_zipped_data)

        lambda_grid = np.linspace(0, max_l1_lambda, num=num_lambda_steps)
        best_prediction_size = 10000000
        best_l1_size, best_l2_size = 0, 0
        best_alpha, best_beta = 0, 0
        for l1_lambda_val in lambda_grid:
            # Bisect the L2 threshold for this L1 threshold on the validation
            # split: start at 0.5 with a window of 0.5, halved each round.
            l2_lambda_val = 0.5
            delta = 0.5
            while delta >= precision:
                total_loss = 0
                for l1_docs_for_query, l2_docs_for_query in val_zipped_data.values():
                    l1_fetched_docs = {doc[0] for doc in l1_docs_for_query if doc[2] >= l1_lambda_val}
                    # A document survives only if it passed L1 and clears the L2 threshold.
                    l2_retained_docs = {doc[0] for doc in l2_docs_for_query
                                        if doc[2] >= l2_lambda_val and doc[0] in l1_fetched_docs}
                    l2_ground_truth_docs = {doc[0] for doc in l2_docs_for_query if doc[1] >= level}
                    total_loss += calc_l2_risk_for_query(l2_retained_docs, l2_ground_truth_docs)

                if total_loss > threshold:
                    l2_lambda_val -= delta / 2
                elif total_loss < threshold:
                    l2_lambda_val += delta / 2
                else:
                    break
                delta /= 2

            # Verify control on test data.
            total_l1_size, total_l2_size = 0, 0
            total_l1_loss, total_l2_loss = 0, 0
            for l1_docs_for_query, l2_docs_for_query in test_zipped_data.values():
                l1_fetched_docs = {doc[0] for doc in l1_docs_for_query if doc[2] >= l1_lambda_val}
                l1_ground_truth = {doc[0] for doc in l1_docs_for_query if doc[1] >= level}
                total_l1_size += len(l1_fetched_docs)
                # Divide by 1.0 when nothing is relevant; the loss is then 1.
                l1_denominator = len(l1_ground_truth) if l1_ground_truth else 1.0
                total_l1_loss += 1 - len(l1_ground_truth & l1_fetched_docs) / l1_denominator
                l2_retained_docs = {doc[0] for doc in l2_docs_for_query
                                    if doc[2] >= l2_lambda_val and doc[0] in l1_fetched_docs}
                total_l2_size += len(l2_retained_docs)
                l2_ground_truth_docs = {doc[0] for doc in l2_docs_for_query if doc[1] >= level}
                total_l2_loss += calc_l2_risk_for_query(l2_retained_docs, l2_ground_truth_docs)

            avg_l1_loss = (total_l1_loss + 1) / (M_test + 1)
            avg_l2_loss = (total_l2_loss + 1) / (M_test + 1)
            avg_l1_size = total_l1_size / M_test
            avg_l2_size = total_l2_size / M_test
            prediction_size = avg_l1_size + avg_l2_size
            # Keep the grid point with the smallest combined prediction size.
            if prediction_size < best_prediction_size:
                best_prediction_size = prediction_size
                best_l1_size = avg_l1_size
                best_l2_size = avg_l2_size
                best_alpha = avg_l1_loss
                best_beta = avg_l2_loss

        print('{}:{}:{}:{}:{}:{}'.format(alpha, best_prediction_size, best_l1_size, best_l2_size, best_alpha, best_beta))
        summary_by_alpha.setdefault(alpha, []).append(
            (best_prediction_size, best_l1_size, best_l2_size, best_alpha, best_beta))

# print(summary_by_alpha[0.1])

In [None]:
# Average every recorded metric over the repetitions and print one line per
# alpha as alpha:prediction size:L1 size:L2 size:L1 loss:L2 loss.
for alpha, results in summary_by_alpha.items():
    # Use a dedicated name so the experiment setting `iteration_times` is not
    # overwritten, and unpack the records instead of rebinding the builtin
    # `tuple` at notebook scope.
    num_runs = len(results)
    total_pred_size, total_l1_size, total_l2_size, total_l1_loss, total_l2_loss = 0, 0, 0, 0, 0
    for pred_size, l1_size, l2_size, l1_loss, l2_loss in results:
        total_pred_size += pred_size
        total_l1_size += l1_size
        total_l2_size += l2_size
        total_l1_loss += l1_loss
        total_l2_loss += l2_loss
    print('{}:{}:{}:{}:{}:{}'.format(alpha, total_pred_size / num_runs, total_l1_size / num_runs,
                                  total_l2_size / num_runs, total_l1_loss / num_runs, total_l2_loss / num_runs))