## Precision & Recall on synthetic contracts

In [1]:
from llm_programs.utils import load
from pathlib import Path
# Build the path to the evaluation results file step by step, starting from
# a root given relative to the current working directory.
root_dir = Path("../../../")
results_dir = root_dir.joinpath("results/redaction/synth/")
results_file = results_dir.joinpath("eval_n10_seed42_gemini.pkl")

# `load` is the helper imported from llm_programs.utils; presumably it
# deserialises the .pkl file -- confirm against that module.
# The cells below index the result as results[method][document][metric].
results = load(results_file)

In [5]:
# Print one line per (method, document) pair with the "naive" and "red"
# precision/recall values, each rounded to two decimals.
for method_key in results:
    per_document = results[method_key]
    for doc_key in per_document:
        scores = per_document[doc_key]
        naive_p, naive_r = scores["naive_precision"], scores["naive_recall"]
        red_p, red_r = scores["red_precision"], scores["red_recall"]
        print(
            f"Method: {method_key}, Document: {doc_key}, "
            f"P&R naive: {naive_p:.2f}, {naive_r:.2f}, "
            f"P&R red: {red_p:.2f}, {red_r:.2f}"
        )

Method: 1 call, 1 prompt, Document: synth_contract_000, P&R naive: 0.53, 0.77, P&R red: 0.63, 0.77
Method: 1 call, 1 prompt, Document: synth_contract_001, P&R naive: 0.64, 0.82, P&R red: 0.64, 0.82
Method: 1 call, 1 prompt, Document: synth_contract_002, P&R naive: 0.69, 0.75, P&R red: 0.69, 0.75
Method: 1 call, 1 prompt, Document: synth_contract_003, P&R naive: 0.68, 0.81, P&R red: 0.68, 0.90
Method: 1 call, 1 prompt, Document: synth_contract_004, P&R naive: 0.67, 0.73, P&R red: 0.75, 0.82
Method: 1 call, 1 prompt, Document: synth_contract_005, P&R naive: 0.42, 0.80, P&R red: 0.53, 0.90
Method: 1 call, 1 prompt, Document: synth_contract_006, P&R naive: 0.48, 0.83, P&R red: 0.62, 0.83
Method: 1 call, 1 prompt, Document: synth_contract_007, P&R naive: 0.64, 0.60, P&R red: 0.64, 0.73
Method: 1 call, 1 prompt, Document: synth_contract_008, P&R naive: 0.79, 0.79, P&R red: 0.79, 0.79
Method: 1 call, 1 prompt, Document: synth_contract_009, P&R naive: 0.61, 0.92, P&R red: 0.67, 0.92
Method: 1 

In [4]:
# Per-method averages of the per-document metrics.
#
# Resulting shape:
#   average_metrics[method_key] = {
#       "avg_naive_precision": float, "avg_naive_recall": float,
#       "avg_red_precision": float,   "avg_red_recall": float,
#   }

# Keys read from every per-document record; the averaged value for each is
# stored under the same name prefixed with "avg_".
metric_names = ("naive_precision", "naive_recall", "red_precision", "red_recall")

average_metrics = {}

for method_key in results:
    method_docs = results[method_key]
    doc_count = len(method_docs)

    if doc_count == 0:
        # A method with no documents has no defined mean. Report NaN instead
        # of letting the division below raise ZeroDivisionError and abort the
        # cell before any other method is summarised.
        average_metrics[method_key] = {
            f"avg_{name}": float("nan") for name in metric_names
        }
        continue

    # Accumulate in document order, starting from 0, one running total per
    # metric.
    totals = dict.fromkeys(metric_names, 0)
    for doc_key in method_docs:
        doc_metrics = method_docs[doc_key]
        for name in metric_names:
            totals[name] += doc_metrics[name]

    average_metrics[method_key] = {
        f"avg_{name}": totals[name] / doc_count for name in metric_names
    }

# Human-readable label for each averaged metric, in print order.
metric_labels = (
    ("Naive Precision", "avg_naive_precision"),
    ("Naive Recall", "avg_naive_recall"),
    ("Redaction Precision", "avg_red_precision"),
    ("Redaction Recall", "avg_red_recall"),
)

for method, metrics in average_metrics.items():
    print(f"Method: {method}")
    for label, key in metric_labels:
        print(f"  Avg {label}: {metrics[key]:.2f}")

Method: 1 call, 1 prompt
  Avg Naive Precision: 0.61
  Avg Naive Recall: 0.78
  Avg Redaction Precision: 0.66
  Avg Redaction Recall: 0.82
Method: 1 call, M prompts
  Avg Naive Precision: 0.83
  Avg Naive Recall: 0.95
  Avg Redaction Precision: 0.84
  Avg Redaction Recall: 0.96
Method: N calls, M prompts, OR
  Avg Naive Precision: 0.83
  Avg Naive Recall: 0.95
  Avg Redaction Precision: 0.84
  Avg Redaction Recall: 0.96
Method: N calls, M prompts, AND
  Avg Naive Precision: 0.86
  Avg Naive Recall: 0.95
  Avg Redaction Precision: 0.86
  Avg Redaction Recall: 0.96
