In [1]:
from dotenv import load_dotenv
from utils import (
    ClientManager,
    VoyageAPIModel,
    OpenAIAPIModel,
    get_logger,
    run_law_benchmarks,
    run_cross_lingual_benchmarks,
    get_vanilla_results_df,
    get_cross_lingual_results_df,
    get_weighted_average_df,
)

# Load variables from a .env file into the process environment before any
# client is created (presumably this is where the API keys come from --
# confirm against ClientManager in utils).
load_dotenv()

logger = get_logger()
client_manager = ClientManager()

# Embedding models under evaluation: one Voyage model followed by three OpenAI
# models. Every OpenAI entry asks the manager for a client of its own, in the
# order the model names are listed.
MODELS = [
    VoyageAPIModel(
        model_name="voyage-law-2", client=client_manager.get_voyage_client()
    ),
    *(
        OpenAIAPIModel(model_name=openai_name, client=client_manager.get_oai_client())
        for openai_name in (
            "text-embedding-ada-002",
            "text-embedding-3-large",
            "text-embedding-3-small",
        )
    ),
]

In [None]:
# For each model, run the law benchmarks first and then the cross-lingual
# benchmarks before moving on to the next model.
for model in MODELS:
    for run_suite in (run_law_benchmarks, run_cross_lingual_benchmarks):
        run_suite(model, logger)

In [3]:
# Show the "ndcg@10" metric as a model x dataset table; the rendered output
# also carries an "Average" row and column.
# NOTE(review): presumably built from results saved by the benchmark run
# above -- confirm in utils.
get_vanilla_results_df(["ndcg@10"])

Unnamed: 0_level_0,ndcg@10,ndcg@10,ndcg@10,ndcg@10
mteb_dataset_name,LegalBenchConsumerContractsQA,LegalBenchCorporateLobbying,LegalSummarization,Average
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
text-embedding-3-large,0.79373,0.95082,0.71592,0.820157
text-embedding-3-small,0.78917,0.9424,0.6954,0.80899
text-embedding-ada-002,0.79062,0.93108,0.68852,0.803407
voyage-law-2,0.83216,0.9533,0.67238,0.81928
Average,0.80142,0.9444,0.693055,0.812958


In [4]:
# Show the "evaluation_time" metric for the same model x dataset table.
# NOTE(review): the time unit is not visible here (presumably seconds) --
# confirm in utils.
get_vanilla_results_df(["evaluation_time"])

Unnamed: 0_level_0,evaluation_time,evaluation_time,evaluation_time,evaluation_time
mteb_dataset_name,LegalBenchConsumerContractsQA,LegalBenchCorporateLobbying,LegalSummarization,Average
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
text-embedding-3-large,286.66,363.56,380.13,343.45
text-embedding-3-small,214.5,257.3,283.68,251.826667
text-embedding-ada-002,180.44,311.46,294.61,262.17
voyage-law-2,16.33,44.93,67.46,42.906667
Average,174.4825,244.3125,256.47,225.088333


In [5]:
# Show the "f1" metric as a model x dataset table for the cross-lingual
# (Tatoeba language-pair) results; the rendered output also carries an
# "Average" row and column.
get_cross_lingual_results_df(["f1"])

Unnamed: 0_level_0,f1,f1,f1,f1,f1,f1,f1,f1,f1,f1,f1
mteb_dataset_name,Tatoeba_dan-eng,Tatoeba_deu-eng,Tatoeba_fin-eng,Tatoeba_fra-eng,Tatoeba_nld-eng,Tatoeba_nno-eng,Tatoeba_pol-eng,Tatoeba_por-eng,Tatoeba_spa-eng,Tatoeba_swe-eng,Average
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
text-embedding-3-large,0.964233,0.993733,0.977667,0.96,0.972667,0.938248,0.977333,0.9475,0.980333,0.96,0.967172
text-embedding-3-small,0.927067,0.986,0.890833,0.936167,0.963,0.863096,0.949333,0.9369,0.9704,0.919833,0.934263
text-embedding-ada-002,0.944833,0.985556,0.896115,0.937222,0.9565,0.862659,0.949333,0.934733,0.969667,0.919456,0.935607
voyage-law-2,0.891533,0.974833,0.640928,0.922567,0.926567,0.736042,0.9255,0.915067,0.9594,0.879733,0.877217
Average,0.931917,0.985031,0.851386,0.938989,0.954683,0.850011,0.950375,0.93355,0.96995,0.919756,0.928565


In [6]:
# Show the "evaluation_time" metric for the cross-lingual results.
# NOTE(review): the time unit is not visible here (presumably seconds) --
# confirm in utils.
get_cross_lingual_results_df(["evaluation_time"])

Unnamed: 0_level_0,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time,evaluation_time
mteb_dataset_name,Tatoeba_dan-eng,Tatoeba_deu-eng,Tatoeba_fin-eng,Tatoeba_fra-eng,Tatoeba_nld-eng,Tatoeba_nno-eng,Tatoeba_pol-eng,Tatoeba_por-eng,Tatoeba_spa-eng,Tatoeba_swe-eng,Average
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
text-embedding-3-large,874.71,854.36,882.07,891.3,912.28,915.23,939.74,880.03,855.18,850.61,885.551
text-embedding-3-small,662.25,798.42,752.1,814.64,759.55,811.84,803.95,822.05,633.31,633.71,749.182
text-embedding-ada-002,629.47,602.09,604.68,623.0,571.83,637.13,622.72,588.23,598.42,589.17,606.674
voyage-law-2,206.83,157.72,239.04,150.45,151.46,145.97,154.68,141.94,193.06,175.19,171.634
Average,593.315,603.1475,619.4725,619.8475,598.78,627.5425,630.2725,608.0625,569.9925,562.17,603.26025


In [7]:
# Show per-dataset cross-lingual scores with both a plain "Average" and a
# weighted "W_Average" column.
# NOTE(review): the meaning of `a` (presumably a parameter of the weighting
# scheme) is defined in utils and not visible here -- confirm before changing.
get_weighted_average_df(a=1)

mteb_dataset_name,Tatoeba_swe-eng,Tatoeba_spa-eng,Tatoeba_dan-eng,Tatoeba_fin-eng,Tatoeba_deu-eng,Tatoeba_fra-eng,Tatoeba_por-eng,Tatoeba_pol-eng,Tatoeba_nld-eng,Tatoeba_nno-eng,Average,W_Average
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
text-embedding-3-large,0.96,0.980333,0.964233,0.977667,0.993733,0.96,0.9475,0.977333,0.972667,0.938248,0.967172,0.969973
text-embedding-3-small,0.919833,0.9704,0.927067,0.890833,0.986,0.936167,0.9369,0.949333,0.963,0.863096,0.934263,0.937561
text-embedding-ada-002,0.919456,0.969667,0.944833,0.896115,0.985556,0.937222,0.934733,0.949333,0.9565,0.862659,0.935607,0.940274
voyage-law-2,0.879733,0.9594,0.891533,0.640928,0.974833,0.922567,0.915067,0.9255,0.926567,0.736042,0.877217,0.882518
Average,0.919756,0.96995,0.931917,0.851386,0.985031,0.938989,0.93355,0.950375,0.954683,0.850011,0.928565,0.932581
