In [13]:
import json
import os
import re

# Read JSONL files and count how many records have "is_proven" set to True
def read_jsonl(file_path):
    """Lazily yield one decoded JSON value per non-blank line of a JSONL file.

    Args:
        file_path: Path to a JSON Lines file (one JSON document per line).

    Yields:
        The value produced by ``json.loads`` for each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines carry no record; json.loads would raise on them.
            if not line.strip():
                continue
            yield json.loads(line)

def load_jsonl(file_path):
    """Load every record of a JSONL file into memory.

    Args:
        file_path: Path to the JSONL file, passed through to ``read_jsonl``.

    Returns:
        A ``(filename, records)`` tuple: the base name of ``file_path`` and the
        list of all values yielded by ``read_jsonl`` for it.
    """
    records = list(read_jsonl(file_path))
    return os.path.basename(file_path), records

def count_proven(data):
    """Return how many records have a truthy ``"is_proven"`` entry.

    Records that lack the key are counted as well, because the lookup
    defaults to ``True``.

    Args:
        data: Iterable of dict-like records.

    Returns:
        The number of matching records as an int.
    """
    return sum(1 for record in data if record.get("is_proven", True))

def count_compiled(data):
    """Return how many records have a truthy ``"compiles"`` entry.

    Records that lack the key are counted as well, because the lookup
    defaults to ``True``.

    Args:
        data: Iterable of dict-like records.

    Returns:
        The number of matching records as an int.
    """
    compiled = [record for record in data if record.get("compiles", True)]
    return len(compiled)

def count_proof_lines(content):
    """Count the non-blank lines that follow the opening ``by`` of a proof.

    The start of the proof is the first place in ``content`` with this layout:

        :=
        -- <some comment> --
        by

    i.e. ``:=``, optional whitespace, a ``--`` comment, and then the first
    later line that begins (after optional indentation) with the word ``by``.

    Args:
        content: Full text to search.

    Returns:
        The number of non-blank lines in the text after that ``by`` (any text
        left on the ``by`` line itself counts as a line), or 0 when the layout
        is not found.
    """
    # ``by`` must be a whole word at the start of a line (re.MULTILINE lets ``^``
    # match after every newline). Without that anchor the lazy scan stops at the
    # first "by" it meets - including one inside the comment text ("proved by ...",
    # "nearby") - and the comment's remainder would be counted as proof lines.
    regex = r":=\s*--[\s\S]*?^[ \t]*by\b"
    match = re.search(regex, content, flags=re.MULTILINE)
    if match is None:
        return 0
    # Everything after the matched ``by`` is treated as the proof body.
    remainder = content[match.end():]
    return sum(1 for line in remainder.splitlines() if line.strip())

def count_induction_proofs(proofs):
    """Return how many proof texts mention "induction", ignoring case.

    Args:
        proofs: Iterable of strings.

    Returns:
        The number of strings whose lower-cased form contains ``"induction"``.
    """
    return sum(1 for text in proofs if "induction" in text.lower())

In [None]:
# Root of the clever-prover log tree. Set the CLEVER_PROVER_LOGS_DIR environment
# variable to point the notebook at another location; when it is unset the
# original hard-coded location is used, so the resulting paths are unchanged.
LOGS_DIR = os.environ.get(
    "CLEVER_PROVER_LOGS_DIR",
    "/home/<user_name>/Project/clever-prover/.logs",
)

# One entry per experiment: (experiment name, eval-run directory, test-report directory).
EXPERIMENT_RUNS = [
    ("few_shot_spec_few_shot_proof", "2025-05-09_17-05-24", "2025-05-09_17-05-25"),
    ("few_shot_impl_few_shot_proof", "2025-05-11_05-18-54", "2025-05-11_05-18-55"),
    ("few_shot_spec_few_shot_proof_o4_mini", "2025-05-14_03-16-19", "2025-05-14_03-16-21"),
    ("few_shot_impl_few_shot_proof_o4_mini", "2025-05-13_18-07-23", "2025-05-13_18-07-25"),
    ("few_shot_spec_few_shot_proof_claude_3_7", "2025-05-14_06-53-46", "2025-05-14_06-53-48"),
    ("few_shot_impl_few_shot_proof_claude_3_7", "2025-05-13_23-35-36", "2025-05-13_23-35-37"),
    ("few_shot_spec_copra_proof", "2025-05-13_01-31-57", "2025-05-13_01-31-59"),
    ("few_shot_impl_copra_proof", "2025-05-15_06-17-49", "2025-05-15_06-17-50"),
    ("few_shot_spec_copra_proof_claude_3_7", "2025-05-14_18-41-28", "2025-05-14_18-41-30"),
    ("few_shot_impl_copra_proof_claude_3_7", "2025-05-14_11-06-10", "2025-05-14_11-06-11"),
    ("few_shot_spec_few_shot_proof_deepseek_r1", "2025-05-16_01-15-40", "2025-05-16_01-15-42"),
    ("few_shot_impl_few_shot_proof_deepseek_r1", "2025-05-15_11-00-46", "2025-05-15_11-00-48"),
]

# List of (checkpoint JSONL file, directory of generated proof files) pairs.
# "/" is used literally (not os.path.join) so the strings match the original paths exactly.
data_paths = [
    (
        f"{LOGS_DIR}/checkpoints/{experiment}.jsonl",
        f"{LOGS_DIR}/eval_{experiment}/{eval_stamp}/test_report/{report_stamp}",
    )
    for experiment, eval_stamp, report_stamp in EXPERIMENT_RUNS
]
# Collects (problem_id, dataset_name, proof_lines, content) for every proven,
# sorry-free proof file that mentions "induction", across all datasets.
induction_proofs = []
for data_path, proofs_path in data_paths:
    print("--" * 20)
    dataset_name, data = load_jsonl(data_path)
    # Compute each count once and reuse it for both the percentage and the raw number.
    num_items = len(data)
    num_proven = count_proven(data)
    num_compiled = count_compiled(data)
    print(f"Dataset: {dataset_name}")
    print(f"Number of items: {num_items}")
    # Guard the ratios so an empty checkpoint file reports 0% instead of raising ZeroDivisionError.
    proven_pct = num_proven / num_items * 100 if num_items else 0
    compiled_pct = num_compiled / num_items * 100 if num_items else 0
    print(f"{proven_pct:.3f}% of items are proven")
    print(f"{compiled_pct:.3f}% of items are compiled")
    print(f"Number of proven items: {num_proven}")
    print(f"Number of compiled items: {num_compiled}")
    # A set gives constant-time membership tests in the per-file loop below.
    # Assumes problem ids are hashable and compare equal to the ints parsed from file names.
    proved_ids = {d["problem_id"] for d in data if d.get("is_proven", True)}
    proof_times = [d["proof_time"] for d in data if d.get("is_proven", True)]
    generation_times = [d["generation_time"] for d in data if d.get("compiles", True)]
    print(f"Average proof time: {sum(proof_times) / len(proof_times) if proof_times else 0}")
    print(f"Average generation time: {sum(generation_times) / len(generation_times) if generation_times else 0}")
    # Read all proofs generated
    proofs = {}
    num_induction_proofs = 0
    # os.listdir returns entries in arbitrary order; sort so the report is reproducible.
    for proof_file in sorted(os.listdir(proofs_path)):
        stem, extension = os.path.splitext(proof_file)
        if extension != ".lean":
            # Blindly slicing len(".lean") characters off another file name could
            # leave a valid-looking but wrong integer id, so skip such entries.
            continue
        # The problem id is the last "_"-separated token of the file name.
        try:
            problem_id = int(stem.split("_")[-1])
        except ValueError:
            print(f"Problem ID: {proof_file} is not an integer")
            raise
        if problem_id not in proved_ids:
            # Only proven problems contribute to the statistics below.
            continue
        # Read explicitly as UTF-8 rather than with the platform's locale default.
        with open(os.path.join(proofs_path, proof_file), 'r', encoding='utf-8') as f:
            content = f.read()
        if "sorry" in content:
            # Marked proven but the file still contains "sorry": report and exclude it.
            print(f"Problem ID: {problem_id}, Sorry found")
            continue
        # Count the number of lines in the proof
        proof_lines = count_proof_lines(content)
        proofs[problem_id] = proof_lines
        # Only proofs that actually mention induction are counted and kept for the
        # dump at the end (case-insensitive, same criterion as count_induction_proofs).
        # Note this scans the whole file text, not just the proof body.
        if "induction" in content.lower():
            num_induction_proofs += 1
            induction_proofs.append((problem_id, dataset_name, proof_lines, content))
    # Print the number of lines in each proof
    for problem_id, proof_lines in proofs.items():
        print(f"Problem ID: {problem_id}, Proof lines: {proof_lines}")
    print(f"Number of induction proofs: {num_induction_proofs}")
    # Print the average number of lines in the proofs
    average_proof_lines = sum(proofs.values()) / len(proofs) if proofs else 0
    print(f"Average proof lines: {average_proof_lines}")
    max_proof_lines = max(proofs.values()) if proofs else 0
    print(f"Max proof lines: {max_proof_lines}")
    min_proof_lines = min(proofs.values()) if proofs else 0
    print(f"Min proof lines: {min_proof_lines}")
# Print all induction proofs
for problem_id, dataset_name, proof_lines, content in induction_proofs:
    print()
    print()
    print(f"-- Problem ID: {problem_id}, Dataset: {dataset_name}, Proof lines: {proof_lines}")
    # Print the proof
    print(content)
    print("--" * 20)

----------------------------------------
Dataset: few_shot_spec_few_shot_proof.jsonl
Number of items: 160
0.625% of items are proven
85.000% of items are compiled
Number of proven items: 1
Number of compiled items: 136
Average proof time: 124.26517987251282
Average generation time: 68.38005905291614
Problem ID: 42, Proof lines: 16
Number of induction proofs: 1
Average proof lines: 16.0
Max proof lines: 16
Min proof lines: 16
----------------------------------------
Dataset: few_shot_impl_few_shot_proof.jsonl
Number of items: 160
0.625% of items are proven
68.750% of items are compiled
Number of proven items: 1
Number of compiled items: 110
Average proof time: 291.63309359550476
Average generation time: 51.351814111796294
Problem ID: 41, Proof lines: 6
Number of induction proofs: 1
Average proof lines: 6.0
Max proof lines: 6
Min proof lines: 6
----------------------------------------
Dataset: few_shot_spec_few_shot_proof_o4_mini.jsonl
Number of items: 160
1.250% of items are proven
83.1