# Carbon Footprint Eval

In [43]:
import pandas as pd
import gzip
import json
from tqdm import tqdm
from statistics import mean

RETRIEVAL_PARADIGMS = {
    'multi-qa-distilbert-cos-v1': 'Bi-Encoder',
    'DPH': 'Lexical',
    'multi-qa-minilm-l6-cos-v1': 'Bi-Encoder',
    'IFB2': 'Lexical',
    'InB2': 'Lexical',
    'msmarco-minilm-l12-cos-v5': 'Bi-Encoder',
    'PL2': 'Lexical',
    'BM25': 'Lexical',
    'hltcoe-plaidx-large-eng-tdist-mt5xxl-engeng': 'Late Interaction',
    'DirichletLM': 'Lexical',
    'colbert-ir-colbertv2.0': 'Late Interaction',
    'multi-qa-mpnet-base-cos-v1': 'Bi-Encoder',
    'sentence-transformers-msmarco-roberta-base-ance-firstp': 'Bi-Encoder',
    'TF_IDF': 'Lexical',
    'DFIZ': 'Lexical',
    'msmarco-minilm-l6-cos-v5': 'Bi-Encoder',
    'DLH': 'Lexical',
    'msmarco-distilbert-base-tas-b': 'Bi-Encoder',
    'colbert-ir-colbertv1.9': 'Late Interaction',
    'Hiemstra_LM': 'Lexical',
    'msmarco-distilbert-base-v3': 'Bi-Encoder',
}

def dataset_id_to_corpus_sizes():
    ret = {}
    for dataset in tqdm(["msmarco-passage/trec-dl-2019/judged", "msmarco-passage/trec-dl-2020/judged", "disks45/nocr/trec-robust-2004", "clueweb09/en/trec-web-2012", "clueweb12/trec-web-2014"]):
        ret[dataset] = {}
        with gzip.open(f'../data/processed/sampled-corpora/{dataset.replace("/", "-")}.json.gz') as f:
            f = json.load(f)
            for group in f.keys():
                for sampling in f[group].keys():
                    if sampling not in ret[dataset]:
                        ret[dataset][sampling] = set()

                    ret[dataset][sampling].update(f[group][sampling])
            
            ret[dataset] = {k: len(v) for k, v in ret[dataset].items()}

    return ret

def to_rows(df):
    ret = []
    for sampling in df.keys():
        row = {'Sampling': sampling}
        for paradigm in df[sampling].keys():
            for dataset in df[sampling][paradigm].keys():
                row[f'{paradigm} ({dataset})'] = mean(df[sampling][paradigm][dataset])
        ret += [row]
    return pd.DataFrame(ret)

def parse_data(file_name, target_maeasure):
    raw_data = pd.read_json(file_name, lines=True)
    df = {}
    for _, i in raw_data.iterrows():
        if i['sampling'] not in df:
            df[i['sampling']] = {}
        paradigm = RETRIEVAL_PARADIGMS[i['approach']]
        if paradigm not in df[i['sampling']]:
            df[i['sampling']][paradigm] = {}
        dataset = i['dataset'].split('/')[0]

        if dataset not in df[i['sampling']][paradigm]:
            df[i['sampling']][paradigm][dataset] = [target_maeasure(i)]

    return df


In [45]:
df_recall = parse_data('../data/processed/carbon-footprints/aggregated.jsonl', lambda i: i['Recall@10'])
df_recall = to_rows(df_recall)
df_recall

Unnamed: 0,Sampling,Bi-Encoder (disks45),Bi-Encoder (msmarco-passage),Lexical (disks45),Lexical (msmarco-passage),Late Interaction (disks45),Late Interaction (msmarco-passage)
0,loft-10000,0.3872,0.730233,0.490133,0.588372,0.4184,0.653488
1,top-50-run-pool,0.8748,0.972093,0.9628,0.897674,0.7292,0.746512
2,top-100-run-pool,0.9304,0.97907,0.9724,0.909302,0.7384,0.781395
3,re-rank-top-1000-bm25,0.9124,0.834884,0.9632,0.853488,0.7424,0.7
4,top-1000-run-pool,1.0,1.0,1.0,1.0,1.0,1.0
5,top-25-run-pool,0.7972,0.94186,0.9524,0.865116,0.7052,0.737209
6,loft-1000,0.0988,0.688372,0.129733,0.539535,0.104,0.616279
7,top-10-run-pool,0.6696,0.846512,0.9364,0.816279,0.6504,0.7


In [40]:
dataset_sizes = dataset_id_to_corpus_sizes()

100%|██████████| 5/5 [00:28<00:00,  5.65s/it]


In [50]:
df_emissions = parse_data('../data/processed/carbon-footprints/aggregated.jsonl', lambda i: {'Emissions': i['aggregated']['emissions'], 'Size': dataset_sizes[i['dataset']][i['sampling']]})

In [52]:
df_emissions

{'loft-10000': {'Bi-Encoder': {'disks45': [{'Emissions': 0.0041158786,
     'Size': 10331}],
   'msmarco-passage': [{'Emissions': 0.0011058507, 'Size': 10088}]},
  'Lexical': {'disks45': [{'Emissions': 0.0037829133000000003, 'Size': 10331}],
   'msmarco-passage': [{'Emissions': 0.0008556722, 'Size': 10088}]},
  'Late Interaction': {'disks45': [{'Emissions': 0.008812472200000001,
     'Size': 10331}],
   'msmarco-passage': [{'Emissions': 0.006925708300000001, 'Size': 10088}]}},
 'top-50-run-pool': {'Bi-Encoder': {'disks45': [{'Emissions': 0.034830328,
     'Size': 98653}],
   'msmarco-passage': [{'Emissions': 0.0053463043, 'Size': 57850}]},
  'Lexical': {'disks45': [{'Emissions': 0.02252416, 'Size': 98653}],
   'msmarco-passage': [{'Emissions': 0.0041211564, 'Size': 57850}]},
  'Late Interaction': {'disks45': [{'Emissions': 0.0573395336, 'Size': 98653}],
   'msmarco-passage': [{'Emissions': 0.029439490000000002, 'Size': 57850}]}},
 'top-100-run-pool': {'Bi-Encoder': {'disks45': [{'Emiss

In [32]:
dataset_sizes = dataset_id_to_corpus_sizes()

100%|██████████| 5/5 [00:29<00:00,  5.89s/it]


In [33]:
dataset_sizes

{'msmarco-passage/trec-dl-2019/judged': {'re-rank-top-1000-bm25': 41881,
  'loft-1000': 1074,
  'loft-10000': 10088,
  'top-10-run-pool': 12162,
  'top-25-run-pool': 29546,
  'top-50-run-pool': 57850,
  'top-100-run-pool': 111412,
  'top-1000-run-pool': 768286,
  'complete-corpus': 768286},
 'msmarco-passage/trec-dl-2020/judged': {'re-rank-top-1000-bm25': 50904,
  'loft-1000': 1092,
  'loft-10000': 10098,
  'top-10-run-pool': 19566,
  'top-25-run-pool': 46942,
  'top-50-run-pool': 90927,
  'top-100-run-pool': 174754,
  'top-1000-run-pool': 1199244,
  'complete-corpus': 1199244},
 'disks45/nocr/trec-robust-2004': {'re-rank-top-1000-bm25': 165165,
  'loft-1000': 1214,
  'loft-10000': 10331,
  'top-10-run-pool': 27239,
  'top-25-run-pool': 57579,
  'top-50-run-pool': 98653,
  'top-100-run-pool': 160825,
  'top-1000-run-pool': 449371,
  'complete-corpus': 449371},
 'clueweb09/en/trec-web-2012': {'top-10-run-pool': 6220,
  'top-25-run-pool': 14657,
  'top-50-run-pool': 27712,
  'top-100-run