NeurIPS 2023 Tutorial on Machine Learning for Theorem Proving
=============================================================

In [1]:
import torch
import random
import numpy as np
from tqdm import tqdm
from lean_dojo import *
from datasets import Dataset
from typing import List, Dict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Seed Python's `random`, NumPy and PyTorch with the same value so that runs are repeatable.
# The seed value 3407 is taken from https://arxiv.org/abs/2109.08203
random.seed(3407)
np.random.seed(3407)
torch.manual_seed(3407)

<torch._C.Generator at 0x7f0f3f768fd0>

## Data Extraction

We use [LeanDojo](https://leandojo.org/) to extract state-tactic pairs from mathlib.

In [2]:
# Handle to the mathlib4 repository, pinned to a specific commit hash.
repo = LeanGitRepo(
    "https://github.com/leanprover-community/mathlib4",
    "3ce43c18f614b76e161f911b75a3e1ef641620ff",
)

# Display the repo handle.
repo.show()

In [None]:
# Trace the pinned repo with LeanDojo. The log below shows the traced repo being
# loaded from a local cache directory under ~/.cache/lean_dojo.
traced_repo = trace(repo)  # A few minutes, depending on #CPUs.

[32m2023-11-27 06:58:20.091[0m | [1mINFO    [0m | [36mlean_dojo.data_extraction.trace[0m:[36mtrace[0m:[36m182[0m - [1mLoading the traced repo from /home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib4-3ce43c18f614b76e161f911b75a3e1ef641620ff/mathlib4[0m
2023-11-27 06:58:22,743	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
100%|██████████████████████████████████████████████████| 4462/4462 [09:35<00:00,  7.75it/s]
Following Github server redirection from /repos/mhuisi/lean4-cli to /repositories/341363356
Following Github server redirection from /repos/mhuisi/lean4-cli to /repositories/341363356
Following Github server redirection from /repos/mhuisi/lean4-cli to /repositories/341363356
Following Github server redirection from /repos/mhuisi/lean4-cli to /repositories/341363356
Following Github server redirection from /repos/mhuisi/lean4-cli to /repositories/341363356
Following Github server redirection from /r

In [None]:
# Flatten every traced tactic of every traced theorem into one list of
# {"state", "tactic"} records: the goal state before the tactic, and the tactic itself.
theorems = traced_repo.get_traced_theorems()
state_tactic_pairs = [
    {"state": traced_tactic.state_before, "tactic": traced_tactic.tactic}
    for traced_theorem in tqdm(theorems)
    for traced_tactic in traced_theorem.get_traced_tactics()
]

print(f"{len(state_tactic_pairs)} state-tactic pairs")

In [14]:
# Look at the first extracted pair as a raw dict (the state is one string with embedded newlines).
st = state_tactic_pairs[0]
st

{'state': 'α : Type u_1\nβ : Type u_2\nks : Array α\nvs : Array β\nh : Array.size ks = Array.size vs\ni : Fin (Array.size ks)\nj : Fin (Array.size vs)\nk : α\nv : β\n⊢ Array.size (Array.set ks i k) = Array.size (Array.set vs j v)',
 'tactic': 'simp [h]'}

In [15]:
# Print the state so its newlines are rendered: one hypothesis per line, then the goal after `⊢`.
print(st["state"])

α : Type u_1
β : Type u_2
ks : Array α
vs : Array β
h : Array.size ks = Array.size vs
i : Fin (Array.size ks)
j : Fin (Array.size vs)
k : α
v : β
⊢ Array.size (Array.set ks i k) = Array.size (Array.set vs j v)


In [17]:
# The tactic recorded for the state shown above.
print(st["tactic"])

simp [h]


## Finetuning Language Models for Tactic Generation

In [24]:
# Load the pretrained "google/byt5-small" checkpoint and its matching tokenizer
# as the starting point for finetuning.
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

In [25]:
def tokenize(examples):
  """Tokenize a batch of state-tactic examples for seq2seq training.

  `examples["state"]` is tokenized as the model input and `examples["tactic"]`
  as the target; both are truncated to at most 2048 tokens. The target token
  ids are stored under the "labels" key of the returned encoding.
  """
  limits = dict(max_length=2048, truncation=True)
  encoded = tokenizer(examples["state"], **limits)
  encoded["labels"] = tokenizer(text_target=examples["tactic"], **limits)["input_ids"]
  return encoded

# Shuffle the extracted pairs and keep 10000 of them, then tokenize in batches.
dataset = Dataset.from_list(state_tactic_pairs).shuffle().select(range(10000))
tokenized_dataset = dataset.map(tokenize, batched=True)

# Show the resulting dataset summary (columns and row count).
tokenized_dataset

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['state', 'tactic', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [26]:
# Illustrative run only, not a real training configuration:
# `max_steps=2` stops after two steps and `use_cpu=True` keeps it on the CPU.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    max_steps=2,
    use_cpu=True,
)

# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


TrainOutput(global_step=2, training_loss=4.023161888122559, metrics={'train_runtime': 74.8826, 'train_samples_per_second': 0.214, 'train_steps_per_second': 0.027, 'total_flos': 39750274676736.0, 'train_loss': 4.023161888122559, 'epoch': 0.0})

## Inspecting the Trained Tactic Generator

In [27]:
# Load the released tactic generator checkpoint.
# NOTE: this rebinds `tokenizer` and `model`, replacing the byt5-small objects that
# were briefly finetuned above; every later cell uses this checkpoint instead.
tokenizer = AutoTokenizer.from_pretrained("kaiyuy/leandojo-lean4-tacgen-byt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("kaiyuy/leandojo-lean4-tacgen-byt5-small")

In [32]:
# Check which concrete model class the Auto* loader returned.
type(model)

transformers.models.t5.modeling_t5.T5ForConditionalGeneration

In [33]:
# Check which concrete tokenizer class the Auto* loader returned.
type(tokenizer)

transformers.models.byt5.tokenization_byt5.ByT5Tokenizer

In [38]:
def generate_one_tactic(state: str) -> str:
    """Generate a single tactic for a proof state.

    Args:
        state: The pretty-printed proof state (hypotheses and goal) as one string.

    Returns:
        The decoded tactic string, with special tokens stripped.

    The tactic is also printed, followed by a blank line, as before. Previously
    the function returned None despite its `-> str` annotation, so callers
    could not use the generated tactic.
    """
    tokenized_state = tokenizer(state, return_tensors="pt")
    tactic_ids = model.generate(tokenized_state.input_ids, max_length=1024)
    # `generate` returns a batch of sequences; there is one input, so take the first.
    tactic = tokenizer.decode(tactic_ids[0], skip_special_tokens=True)
    print(tactic, end="\n\n")
    return tactic

# Try the generator on a small goal: one hypothesis `n : ℕ` and the goal `gcd n n = n`.
generate_one_tactic("n : ℕ\n⊢ gcd n n = n")

rw [gcd_comm]



In [39]:
def generate_tactics(state: str, k: int = 8) -> List[str]:
    """Generate multiple tactics via beam search.

    Args:
        state: The pretty-printed proof state (hypotheses and goal) as one string.
        k: Beam width, and also the number of candidates returned.

    Returns:
        A list of `k` decoded tactic strings, with special tokens stripped.
    """
    # Tokenize the `state` argument here. The body used to read a `tokenized_state`
    # name that this function never defined, so it either raised NameError or
    # silently reused whatever state a previous cell had left in the kernel.
    tokenized_state = tokenizer(state, return_tensors="pt")
    tactic_candidates_ids = model.generate(
        tokenized_state.input_ids,
        max_length=1024,
        num_beams=k,
        length_penalty=0.0,
        do_sample=False,
        num_return_sequences=k,
        early_stopping=False,
    )
    tactic_candidates = tokenizer.batch_decode(
        tactic_candidates_ids, skip_special_tokens=True
    )
    return tactic_candidates

# Print the beam-search candidates for the same gcd goal, one per line.
for tac in generate_tactics("n : ℕ\n⊢ gcd n n = n"):
    print(tac)

rw [gcd_comm]
induction' n with n IH
induction' n with n hn
cases n
rw [gcd]
induction' n with n ih
unfold gcd
rw [gcd_comm, gcd_gcd_self_right]


## Interacting with Lean

In [41]:
# Switch to a small example repo pinned to a commit, and select the theorem
# `Hidden.gcd_self` in Gcd.lean.
# NOTE: this rebinds `repo`, which referred to mathlib4 in the data-extraction section.
repo = LeanGitRepo("https://github.com/yangky11/lean4-example", "dc7e97b1c919555865b1b20ddda38267037d3814")
theorem = Theorem(repo, "Gcd.lean", "Hidden.gcd_self")

# For some theorems, it might take a few minutes.
# `__enter__` is called directly rather than through a `with` block, so `dojo` and the
# initial state `state_0` remain available to the following cells.
# NOTE(review): nothing visible in this section calls the matching `__exit__` —
# confirm how the Dojo is meant to be shut down.
dojo, state_0 = Dojo(theorem).__enter__()



DojoInitError: Cannot find the *.ast.json file for Theorem(repo=LeanGitRepo(url='https://github.com/yangky11/lean4-example', commit='dc7e97b1c919555865b1b20ddda38267037d3814'), file_path=PosixPath('Gcd.lean'), full_name='Hidden.gcd_self') in /home/kaiyu/.cache/lean_dojo/yangky11-lean4-example-dc7e97b1c919555865b1b20ddda38267037d3814/lean4-example.

In [16]:
# Pretty-print the initial proof state.
print(state_0.pp)

R✝ : Type u_1
R₁ : Type u_2
R₂ : Type u_3
R₃ : Type u_4
R₄ : Type u_5
S : Type u_6
K : Type u_7
K₂ : Type u_8
M : Type u_9
M' : Type u_10
M₁ : Type u_11
M₂ : Type u_12
M₃ : Type u_13
M₄ : Type u_14
N : Type u_15
N₂ : Type u_16
ι✝ : Type u_17
V : Type u_18
V₂ : Type u_19
ι : Type u_20
inst✝² : Fintype ι
inst✝¹ : DecidableEq ι
R : Type u_21
inst✝ : Semiring R
x : ι → R
⊢ x = ∑ i : ι, x i • fun j => if i = j then 1 else 0


In [17]:
# Run a tactic on the initial state; `run_tac` returns the resulting state.
state_1 = dojo.run_tac(state_0, "revert n")

print(state_1.pp)

R✝ : Type u_1
R₁ : Type u_2
R₂ : Type u_3
R₃ : Type u_4
R₄ : Type u_5
S : Type u_6
K : Type u_7
K₂ : Type u_8
M : Type u_9
M' : Type u_10
M₁ : Type u_11
M₂ : Type u_12
M₃ : Type u_13
M₄ : Type u_14
N : Type u_15
N₂ : Type u_16
ι✝ : Type u_17
V : Type u_18
V₂ : Type u_19
ι : Type u_20
inst✝² : Fintype ι
inst✝¹ : DecidableEq ι
R : Type u_21
inst✝ : Semiring R
⊢ ∀ (x : ι → R), x = ∑ i : ι, x i • fun j => if i = j then 1 else 0


In [18]:
# An invalid tactic does not raise: `run_tac` returns an error object
# (a LeanError in the output below) in place of a proof state.
state_2 = dojo.run_tac(state_0, "hello world!")

state_2

LeanError(error='<stdin>:1:1: unknown tactic')

In [19]:
# `state_2` is a LeanError rather than a valid proof state, so running a tactic on it
# raises RuntimeError. Catch and display the error so this demonstration does not
# stop a "Restart & Run All" before the cells below.
try:
    dojo.run_tac(state_2, "skip")
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Attempting to run a tactic on an invalid state LeanError(error='<stdin>:1:1: unknown tactic').

In [None]:
# Start again from the initial state and case-split on `n`.
state_3 = dojo.run_tac(state_0, "cases n")

In [None]:
# Simplify using the `gcd` and `mod_self` lemmas.
# NOTE(review): presumably this closes the first of the goals produced by `cases n` — confirm.
state_4 = dojo.run_tac(state_3, "simp [gcd, mod_self]")

In [None]:
# Apply the same simp call to the state returned by the previous step, then show the result.
# NOTE(review): presumably this closes the remaining goal and finishes the proof — confirm.
state_5 = dojo.run_tac(state_4, "simp [gcd, mod_self]")

state_5

## Using the Model in Lean