This is a modified copy of legalbench/static-eval/static-eval/eval_notebook.ipynb

In [13]:
import math
import openai
import os
import json
import sklearn.metrics
import random
from langchain.llms import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
from task_utils import TASKS, load_data, load_prompt, generate_prompts

from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain import PromptTemplate, HuggingFaceHub, HuggingFacePipeline, LLMChain

import warnings
warnings.simplefilter('ignore')

from tqdm import tqdm

In [2]:
# Read the OpenAI secret from a local file (so it is never hard-coded in the
# notebook) and publish it through the OPENAI_API_KEY environment variable.
with open('openai-api-secret-key.txt', 'r') as key_file:
    openai_api_key = key_file.read().strip()  # drop the trailing newline
os.environ["OPENAI_API_KEY"] = openai_api_key

In [3]:
# Read the Hugging Face Hub token from a local file (so it is never hard-coded
# in the notebook) and publish it through HUGGINGFACEHUB_API_TOKEN.
with open('huggingface-hub-api-token-write.txt', 'r') as token_file:
    hf_api_key = token_file.read().strip()  # drop the trailing newline
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_key

In [4]:
def call_llm(prompt, interface):
    """Send one prompt to the selected LLM backend and return its text reply.

    Args:
        prompt: Fully rendered prompt text. It is sent to the model verbatim,
            including any literal ``{`` / ``}`` characters.
        interface: ``'openai'`` to query ``gpt-3.5-turbo`` through
            ``ChatOpenAI``, or ``'huggingface'`` to query
            ``google/flan-t5-base`` through ``HuggingFaceHub``.

    Returns:
        The text produced by the model, exactly as the backend returned it
        (no stripping is done here).

    Raises:
        ValueError: If ``interface`` is not one of the supported backends.
            Previously this case silently returned ``''``, which a caller
            would score as a wrong answer instead of a configuration error.
    """
    if interface == 'openai':
        # NOTE(review): temperature=0.5 makes repeated runs non-deterministic.
        chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5, max_tokens=600)
        return chat_model([HumanMessage(content=prompt)]).content

    if interface == 'huggingface':
        # Use a fixed pass-through template and supply the prompt text as the
        # value of its single variable. Compiling the prompt text itself as a
        # template would treat any braces in the data as template variables.
        passthrough_template = PromptTemplate.from_template(template="{prompt}")
        llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature":1e-10}, task="text2text-generation")
        llm_chain = LLMChain(
            llm=llm,
            prompt=passthrough_template
        )
        return llm_chain.run(prompt=prompt)

    raise ValueError(
        f"Unsupported interface {interface!r}; expected 'openai' or 'huggingface'"
    )

In [10]:
def evaluate(tasks: list, tasks_dir: str, interface='openai'):
    """Run each task's base prompt through an LLM and score the answers.

    For every task, the prompts built from ``base_prompt.txt`` are sent to
    ``call_llm`` one by one; the stripped model output is compared for exact
    equality with the datapoint's ``'answer'`` field, and a per-task balanced
    accuracy is computed with scikit-learn. The mean balanced accuracy over
    all tasks (NaN task scores counted as 0) is printed at the end.

    Args:
        tasks: Names of the tasks to evaluate.
        tasks_dir: Directory handed to ``load_data`` / ``load_prompt``.
        interface: Backend name forwarded to ``call_llm``.

    Returns:
        A dict keyed by task name. Each value maps every datapoint id to a
        dict with keys ``'prompt'``, ``'generated_output'``,
        ``'correct_output'`` and ``'success'``, and additionally holds the
        task score under the key ``'balanced_accuracy'``. An empty ``tasks``
        list yields an empty dict.
    """
    report = dict()
    for task in tqdm(tasks):
        # NOTE(review): prompts are generated from, and scored against, the
        # *train* split; the test split is loaded but never used. Confirm this
        # is intended before comparing numbers with other evaluations.
        train_df, _test_df = load_data(task=task, tasks_dir=tasks_dir)
        prompt_template = load_prompt(prompt_name="base_prompt.txt", task=task, tasks_dir=tasks_dir)
        prompts = generate_prompts(prompt_template=prompt_template, data_df=train_df)
        report[task] = dict()
        targets = list()
        outputs = list()
        for prompt, (datapoint_id, data) in zip(prompts, train_df.iterrows()):
            output = call_llm(prompt, interface).strip()
            expected = data['answer']
            targets.append(expected)
            outputs.append(output)
            report[task][datapoint_id] = {
                'prompt': prompt,
                'generated_output': output,
                'correct_output': expected,
                # Exact string match only; no normalisation beyond strip().
                'success': output == expected
            }
        report[task]['balanced_accuracy'] = sklearn.metrics.balanced_accuracy_score(targets, outputs)

    # A NaN task score counts as 0 so one degenerate task cannot turn the
    # overall mean into NaN.
    task_scores = [report[task]['balanced_accuracy'] for task in tasks]
    task_scores = [0 if math.isnan(score) else score for score in task_scores]
    # Guard the empty case: with no tasks the mean is undefined, and dividing
    # by len(tasks) would raise ZeroDivisionError.
    mean_score = sum(task_scores)/len(task_scores) if task_scores else float('nan')
    print('Balanced Accuracy:', mean_score)

    return report


In [11]:
# Evaluate every task whose name starts with 'cuad' through the
# 'huggingface' interface of call_llm.
tasks_dir = 'legalbench/static-eval/legalbench/'
cuad_tasks = list(filter(lambda name: name.startswith('cuad'), TASKS))
report = evaluate(tasks=cuad_tasks, tasks_dir=tasks_dir, interface='huggingface')

100%|███████████████████████████████████████████| 38/38 [03:22<00:00,  5.32s/it]

Balanced Accuracy: 0.5131578947368421





In [14]:
# Persist the current `report` dict as pretty-printed JSON.
with open('cuad_flant5base_report.json', 'w') as report_file:
    json.dump(report, fp=report_file, indent=4)

In [None]:
tasks_dir = 'legalbench/static-eval/legalbench/'

# Draw the task subset from a privately seeded generator so the same tasks are
# evaluated on every run of the notebook; the global `random` state is left
# untouched.
TASK_SAMPLE_SEED = 0
N_SAMPLED_TASKS = 10
sampled_tasks = random.Random(TASK_SAMPLE_SEED).sample(TASKS, N_SAMPLED_TASKS)

# warnings are to be expected

# NOTE: this rebinds `report`, replacing the CUAD results held in memory.
report = evaluate(tasks=sampled_tasks, tasks_dir=tasks_dir)