In [1]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import sys
import numpy as np
import glob
import torch

sys.path.insert(1,"/home/showalte/research/prob_seq_queries/")
from seq_queries.utils import read_pkl, write_pkl

In [2]:
def get_tau_ab_data(experiment,dataset,h,s,num_mc_samples, max_num_queries,
                   methods = ['beam_search_lower_bound','importance_sampling']):
    """Load one pickled result file per estimation method.

    For every entry of ``methods`` a glob pattern is built under
    ``../data/query_4/{dataset}/{experiment}/``, printed, and resolved with
    ``glob.glob``. Exactly one file must match; it is loaded with ``read_pkl``.

    Args:
        experiment: Experiment name; underscores become dashes in the file name.
        dataset: Dataset name; underscores become dashes in the file name.
        h: Value of the ``{h}h`` file-name segment (replaced by ``*`` for the
            ``'entropy_ablation'`` method).
        s: Value of the ``{s}s`` file-name segment.
        num_mc_samples: Value of the ``_{n}mc`` segment. When falsy, a bare
            ``_`` is emitted instead.
        max_num_queries: Value of the ``_{n}q`` segment. When falsy, the
            segment is omitted.
        methods: Method names to load. ``'beam_search_lower_bound'`` selects
            the ``_lb`` files; every other name selects the ``_imp`` files.
            The default list is only iterated, never mutated.

    Returns:
        dict mapping each method name to whatever ``read_pkl`` returns for the
        matched file.

    Raises:
        AssertionError: If a pattern matches zero files or more than one file.
    """
    data_dict = {}
    root = f"../data/query_4/{dataset}/{experiment}/"
    file_prefix = f"{experiment.replace('_','-')}_{dataset.replace('_','-')}_query-4_"

    for method in methods:
        h_part = '*' if method == 'entropy_ablation' else f'{h}h'
        # Gate the sample-count segment on num_mc_samples itself. It used to be
        # gated on max_num_queries, which produced "_Nonemc" when only a query
        # budget was given and dropped the count when only it was given.
        mc_part = f'_{num_mc_samples}mc' if num_mc_samples else '_'
        method_part = '_lb' if method == 'beam_search_lower_bound' else '_imp'
        query_part = f'_{max_num_queries}q' if max_num_queries else ''

        template_file = (f"{file_prefix}{h_part}_{s}s*"
                         + f"{mc_part}{method_part}{query_part}*.pkl")
        template_path = os.path.join(root,template_file)
        print(template_path)

        paths = glob.glob(template_path)
        # Raised explicitly (rather than via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if len(paths) != 1:
            raise AssertionError(f"Too many or too few paths: {paths}")
        data_dict[method] = read_pkl(paths[0])

    return data_dict

        

In [72]:
def flatten_tau_ab_data(data_dict, method,dataset, h,s,max_num_queries,num_mc_samples):
    """Turn per-method probability tensors into one long-form DataFrame.

    For every method in ``data_dict`` the method's probability tensor is
    cumulatively summed along dim 1, the entries listed in the metadata's
    ``tau_a_excl_terms`` / ``tau_b_excl_terms`` are selected along the last
    dim and summed, and the resulting 2-D table is melted so that each row is
    one (``sequence_id``, ``k``) pair. ``sequence_id`` is the row position of
    the table and ``k`` is its 1-based column position.

    Args:
        data_dict: Mapping of method name to a dict that holds the probability
            tensor (``'intermediate_lbs'`` for ``'beam_search_lower_bound'``,
            ``'intermediate_query_probs'`` for ``'importance_sampling'``) and a
            ``'metadata'`` dict with ``'tau_a_excl_terms'`` and
            ``'tau_b_excl_terms'`` index lists.
        method, dataset, h, s, max_num_queries, num_mc_samples: Accepted for
            call-site compatibility; not used by this function.

    Returns:
        DataFrame with, per method, the columns ``sequence_id``,
        ``{method}_tau_a`` and ``{method}_tau_b``; ``k`` appears once, taken
        from the first method.

    Raises:
        ValueError: If ``data_dict`` contains a method name other than the two
            supported ones, or (from ``pd.concat``) if ``data_dict`` is empty.
    """
    # Which payload entry holds the probabilities for each supported method.
    prob_keys = {
        "beam_search_lower_bound": "intermediate_lbs",
        "importance_sampling": "intermediate_query_probs",
    }

    def _selected_sum(values, terms):
        # Sum the entries at `terms` along the last dim. index_select replaces
        # a gather over an index tensor built from an N x K x T Python list.
        index = torch.as_tensor(terms, dtype=torch.long, device=values.device)
        return values.index_select(-1, index).sum(dim=-1)

    frames = []
    # `method_name` avoids shadowing the (unused) `method` parameter.
    for method_name, payload in data_dict.items():
        if method_name not in prob_keys:
            # Previously an unknown method hit an unbound local on the first
            # iteration, or silently reused the previous method's tensor.
            raise ValueError(
                f"Unsupported method {method_name!r}; expected one of {sorted(prob_keys)}")

        cumulative = torch.cumsum(payload[prob_keys[method_name]], dim=1)
        metadata = payload['metadata']
        tau_a_raw = _selected_sum(cumulative, metadata['tau_a_excl_terms'])
        tau_b_raw = _selected_sum(cumulative, metadata['tau_b_excl_terms'])

        # Convert explicitly so the frame holds plain numbers regardless of how
        # the installed pandas treats tensor inputs.
        tau_a = pd.DataFrame(tau_a_raw.detach().cpu().numpy())
        tau_a['sequence_id'] = range(tau_a.shape[0])
        tau_b = pd.DataFrame(tau_b_raw.detach().cpu().numpy())

        long_a = pd.melt(tau_a,id_vars=['sequence_id'],
                         value_vars=[c for c in tau_a.columns if c != 'sequence_id'])
        long_a.columns = ['sequence_id','k',f'{method_name}_tau_a']
        long_b = pd.melt(tau_b,
                         value_vars=tau_b.columns)
        long_b.columns = ['k','tau_b']
        # Both melts walk the table column by column, so rows line up by index.
        long_a[f'{method_name}_tau_b'] = long_b['tau_b']
        long_a['k'] += 1  # report k as 1-based
        frames.append(long_a)

    # Keep `k` from the first method only.
    # NOTE(review): `sequence_id` is not de-duplicated, so the result has one
    # `sequence_id` column per method. Left as-is because consumers of the
    # saved CSVs may rely on this column layout -- confirm before changing.
    frames = frames[:1] + [frame.drop('k', axis=1) for frame in frames[1:]]

    return pd.concat(frames,axis=1)
    

In [73]:
def flatten_experiments(experiments, datasets, lengths,model_budget=False,
                       max_num_queries=None,
                       num_mc_samples=None):
    """Load and flatten every experiment/dataset/length combination.

    Combinations are visited with ``experiments`` as the outermost loop,
    then ``datasets``, then the ``(h, s)`` pairs in ``lengths``. Each one is
    loaded with ``get_tau_ab_data`` and flattened with ``flatten_tau_ab_data``;
    the resulting frames are stacked row-wise in visiting order.

    Args:
        experiments: Iterable of experiment names.
        datasets: Iterable of dataset names.
        lengths: Iterable of ``(h, s)`` pairs.
        model_budget: Not used by this function.
        max_num_queries: Forwarded to both helpers.
        num_mc_samples: Forwarded to both helpers.

    Returns:
        DataFrame formed by concatenating all per-combination frames along
        axis 0 (the original row indices are kept).
    """
    def _load_and_flatten(experiment, dataset, h, s):
        # One combination: read the raw results, then reshape them.
        raw = get_tau_ab_data(experiment,dataset,
                              h,s,max_num_queries=max_num_queries,
                              num_mc_samples=num_mc_samples)
        return flatten_tau_ab_data(raw,experiment, dataset,
                                   h,s,max_num_queries=max_num_queries,
                                   num_mc_samples=num_mc_samples)

    frames = [
        _load_and_flatten(experiment, dataset, h, s)
        for experiment in experiments
        for dataset in datasets
        for h, s in lengths
    ]

    return pd.concat(frames,axis = 0)

In [75]:
# Export one flattened tau_a / tau_b table per dataset as CSV.
max_num_queries = 1000
num_mc_samples = 10000
lens = [(10,20),(10,15),(10,15),(10,15)]
datasets = ['shakespeare','apps','moocs','amazon']

# zip() would silently skip trailing datasets if the two lists got out of sync.
assert len(lens) == len(datasets), "lens and datasets must pair up one-to-one"

for length,dataset in zip(lens,datasets):
    print(dataset)
    df = flatten_experiments(['val_dl'],[dataset],[length],
                        max_num_queries=max_num_queries,
                        num_mc_samples=num_mc_samples)
    print(df.shape)
    # `index` is documented as a bool; False states the intent explicitly
    # instead of relying on None being falsy.
    df.to_csv(f'{dataset}_query4-ablation_30k.csv',index=False)
    print(f"No missing values: {(df.isnull().sum()==0).all()}")
    print("====="*5)
    


shakespeare
../data/query_4/shakespeare/val_dl/val-dl_shakespeare_query-4_10h_20s*_10000mc_lb_1000q*.pkl
../data/query_4/shakespeare/val_dl/val-dl_shakespeare_query-4_10h_20s*_10000mc_imp_1000q*.pkl
(31000, 7)
No missing values: True
apps
../data/query_4/apps/val_dl/val-dl_apps_query-4_10h_15s*_10000mc_lb_1000q*.pkl
../data/query_4/apps/val_dl/val-dl_apps_query-4_10h_15s*_10000mc_imp_1000q*.pkl
(31000, 7)
No missing values: True
moocs
../data/query_4/moocs/val_dl/val-dl_moocs_query-4_10h_15s*_10000mc_lb_1000q*.pkl
../data/query_4/moocs/val_dl/val-dl_moocs_query-4_10h_15s*_10000mc_imp_1000q*.pkl
(31000, 7)
No missing values: True
amazon
../data/query_4/amazon/val_dl/val-dl_amazon_query-4_10h_15s*_10000mc_lb_1000q*.pkl
../data/query_4/amazon/val_dl/val-dl_amazon_query-4_10h_15s*_10000mc_imp_1000q*.pkl
(31000, 7)
No missing values: True
