In [5]:
!export CONTAINER=native

/bin/bash: line 0: export: `=native': not a valid identifier


In [2]:
from generator.datamodule import *

[2023-10-11 21:38:01,742] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [20]:
from transformers import AutoTokenizer, ByT5Tokenizer

In [21]:
t = ByT5Tokenizer()

In [22]:
t.vocab_size

256

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("kaiyuy/leandojo-lean3-tacgen-byt5-small")       # Or "lean3" -> "lean4"
model = AutoModelForSeq2SeqLM.from_pretrained("kaiyuy/leandojo-lean3-tacgen-byt5-small")

In [5]:
type(model)

transformers.models.t5.modeling_t5.T5ForConditionalGeneration

In [6]:
type(model.encoder)

transformers.models.t5.modeling_t5.T5Stack

In [3]:
state = "n : ℕ\n⊢ gcd n n = n"
tokenized_state = tokenizer(state, return_tensors="pt")

In [5]:
model.encoder(tokenized_state.input_ids)['last_hidden_state'].shape

torch.Size([1, 24, 1472])

In [8]:
tactic_candidates_ids = model.generate(
    tokenized_state.input_ids,
    max_length=1024,
    num_beams=4,
    length_penalty=0.0,
    do_sample=False,
    num_return_sequences=4,
    early_stopping=False,
)
tactic_candidates = tokenizer.batch_decode(
    tactic_candidates_ids, skip_special_tokens=True
)
for tac in tactic_candidates:
    print(tac)

simp [<a>nat.gcd</a>]
unfold gcd
rw [<a>nat.gcd_comm</a>]
rw [<a>nat.gcd</a>, <a>nat.gcd_self_right</a>]


tactic_candidates_ids = model.generate(
    encoder_outputs=model.encoder(tokenized_state.input_ids),
    max_length=1024,
    num_beams=4,
    length_penalty=0.0,
    do_sample=False,
    num_return_sequences=4,
    early_stopping=False,
)
tactic_candidates = tokenizer.batch_decode(
    tactic_candidates_ids, skip_special_tokens=True
)
for tac in tactic_candidates:
    print(tac)

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("kaiyuy/leandojo-lean3-retriever-tacgen-byt5-small")
model = T5ForConditionalGeneration.from_pretrained("kaiyuy/leandojo-lean3-retriever-tacgen-byt5-small")

state = "n : ℕ\n⊢ gcd n n = n"
retrieved_premises = [
  "def <a>nat.gcd</a> : nat → nat → nat\n| 0        y := y\n| (succ x) y := have y % succ x < succ x, from mod_lt _ $ succ_pos _,\n                gcd (y % succ x) (succ x)",
  "@[simp] theorem <a>nat.mod_self</a> (n : nat) : n % n = 0",
]
input = "\n\n".join(retrieved_premises + [state])
print("------ INPUT ------\n", input)
tokenized_input = tokenizer(input, return_tensors="pt", max_length=2300, truncation=True)

# Loss
labels = tokenizer("cases n", return_tensors="pt", max_length=2300, truncation=True).input_ids
print(tokenized_input.attention_mask.shape)
print(tokenized_input.input_ids.shape)
print(labels.shape)
print(model(tokenized_input.input_ids, tokenized_input.attention_mask, labels=labels).loss)

# Generate a single tactic.
tactic_ids = model.generate(tokenized_input.input_ids, max_length=1024)
tactic = tokenizer.decode(tactic_ids[0], skip_special_tokens=True)
print("\n------ OUTPUT ------")
print(tactic, end="\n\n")

# Generate multiple tactics via beam search.
tactic_candidates_ids = model.generate(
    tokenized_input.input_ids,
    max_length=1024,
    num_beams=4,
    length_penalty=0.0,
    do_sample=False,
    num_return_sequences=4,
    early_stopping=False,
)
tactic_candidates = tokenizer.batch_decode(
    tactic_candidates_ids, skip_special_tokens=True
)
for tac in tactic_candidates:
    print(tac)

------ INPUT ------
 def <a>nat.gcd</a> : nat → nat → nat
| 0        y := y
| (succ x) y := have y % succ x < succ x, from mod_lt _ $ succ_pos _,
                gcd (y % succ x) (succ x)

@[simp] theorem <a>nat.mod_self</a> (n : nat) : n % n = 0

n : ℕ
⊢ gcd n n = n
torch.Size([1, 255])
torch.Size([1, 255])
torch.Size([1, 8])
tensor(0.1879, grad_fn=<NllLossBackward0>)

------ OUTPUT ------
cases n

cases n
simp [<a>nat.gcd</a>]
induction n with n ih
induction n with n IH


### RMT debug

In [1]:
from generator.datamodule import MultipleSegmentGeneratorDataModule
from generator.model import RMTRetrievalAugmentedGenerator

[2023-10-13 10:04:24,532] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
data = MultipleSegmentGeneratorDataModule(
    model_name="kaiyuy/leandojo-lean3-retriever-byt5-small",
    data_path="data/leandojo_benchmark/random/",
    corpus_path="data/leandojo_benchmark/corpus.jsonl",
    keep_marks=True,
    preds_path="pred_random.pickle",
    batch_size=8,  # effective_batch_size == batch_size * accumulate_grad_batches * devices
    eval_batch_size=64,
    max_seq_len=2280,
    num_segments=2,
    p_drop=0.5,
    normalize_tactics=True,
    num_workers=2,
)
data.setup()

train_dataloader = data.train_dataloader()

[32m2023-10-13 10:04:24.957[0m | [1mINFO    [0m | [36mcommon[0m:[36m__init__[0m:[36m200[0m - [1mBuilding the corpus from data/leandojo_benchmark/corpus.jsonl[0m
[32m2023-10-13 10:04:37.162[0m | [1mINFO    [0m | [36mgenerator.datamodule[0m:[36m__init__[0m:[36m345[0m - [1mWith retrieval data[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94641/94641 [00:03<00:00, 30544.23it/s]
[32m2023-10-13 10:05:27.605[0m | [1mINFO    [0m | [36mgenerator.datamodule[0m:[36m_load_data[0m:[36m68[0m - [1m207631 examples loaded[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 25139.15it/s]
[32m2023-10-13 10:05:27.834[0m | [1mINFO    [0m | [36mgenerator.datamodule[0m:[36m_load_data[0m:[36m68[0m - [1m4866 examples loaded[0m


In [3]:
model = RMTRetrievalAugmentedGenerator(
    backbone_model_name="kaiyuy/leandojo-lean3-retriever-byt5-small",
    num_memory_tokens=10,
    lr=5e-4,
    warmup_steps=2000,
    num_beams=1,
    length_penalty=0.0,
    ret_ckpt_path="../leandojo-pl-ckpts/retriever_random.ckpt",
    eval_num_retrieved=100,
    eval_num_cpus=12,
    eval_num_theorems=200,
    max_seq_len=2300,
    num_segments=2,
)

[32m2023-10-13 10:05:27.845[0m | [1mINFO    [0m | [36mgenerator.model[0m:[36m__init__[0m:[36m110[0m - [1mLoading the retriever from ../leandojo-pl-ckpts/retriever_random.ckpt[0m


Processing zero checkpoint '../leandojo-pl-ckpts/retriever_random.ckpt/checkpoint'
Detected checkpoint of type zero stage 2, world_size: 1
Parsing checkpoint created by deepspeed==0.9.2
Reconstructed fp32 state dict with 111 params 217657472 elements
Saving fp32 state dict to /tmp/tmpqk1igna0/lightning.cpkt


  rank_zero_warn(
Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at kaiyuy/leandojo-lean3-retriever-byt5-small and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.3.layer.0.SelfAttention.q.weight', 'decoder.block.3.layer.0.layer_norm.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.2.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.2.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.1.EncDecAttention.o.weight', 'decoder.block.1.layer.0.layer_norm.weight', 'decoder.block.0.l

In [None]:
ex = None
for t in train_dataloader:
    ex = t
    break

In [None]:
model.forward(
    ex["state_ids"],
    ex["state_mask"],
    ex["tactic_ids"],
)

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tactic_candidates_ids = model.generate(
    state="i : int\n⊢ gcd i i = nat_abs i",
    file_path="src/data/int/gcd.lean",
    theorem_full_name="int.gcd_self",
    theorem_pos=(195,1),
    num_samples=4,
)
tactic_candidates = tokenizer.batch_decode(
    tactic_candidates_ids, skip_special_tokens=True
)
for tac in tactic_candidates:
    print(tac)

[32m2023-10-13 01:49:10.392[0m | [1mINFO    [0m | [36mretrieval.model[0m:[36mreindex_corpus[0m:[36m172[0m - [1mRe-indexing the retrieval corpus[0m


AttributeError: 'PremiseRetriever' object has no attribute 'corpus'

In [7]:
type(model)

generator.model.RMTRetrievalAugmentedGenerator

In [11]:
type(data.preds)

dict

In [9]:
key = list(data.preds.keys())[0]
print(key)
prem = data.preds[key]['retrieved_premises'][0]

('src/analysis/bounded_variation.lean', 'variation_on_from_to.self', 'α : Type u_1,\n_inst_1 : linear_order α,\nE : Type u_3,\n_inst_3 : pseudo_emetric_space E,\nf : α → E,\ns : set α,\na : α\n⊢ variation_on_from_to f s a a = 0')


In [11]:
prem.start

(737, 1)

In [22]:
data.ds_train[4]

{'url': 'https://github.com/leanprover-community/mathlib',
 'commit': '32a7e535287f9c73f2e4d2aef306a39190f0b504',
 'file_path': 'src/analysis/calculus/diff_cont_on_cl.lean',
 'full_name': 'diff_cont_on_cl.continuous_on_ball',
 'state': ["def <a>metric.ball</a> (x : α) (ε : ℝ) : set α := {y | dist y x < ε}\n\nlemma <a>balanced_ball_zero</a> : balanced 𝕜 (metric.ball (0 : E) r)\n\n@[simp] theorem <a>metric.closed_ball_diff_sphere</a> : closed_ball x ε \\ sphere x ε = ball x ε\n\ndef <a>metric.sphere</a> (x : α) (ε : ℝ) := {y | dist y x = ε}\n\ntheorem <a>int.preimage_closed_ball</a> (x : ℤ) (r : ℝ) :\n  coe ⁻¹' (closed_ball (x : ℝ) r) = closed_ball x r\n\nlemma <a>metric.closed_ball_eq_bInter_ball</a> : closed_ball x ε = ⋂ δ > ε, ball x δ\n\nlemma <a>real.closed_ball_eq_Icc</a> {x r : ℝ} : closed_ball x r = Icc (x - r) (x + r)\n\ntheorem <a>metric.closed_ball_eq_sphere_of_nonpos</a> (hε : ε ≤ 0) : closed_ball x ε = sphere x ε\n\nlemma <a>seminorm.preimage_metric_closed_ball</a> {r : ℝ} :

In [7]:
import torch

In [40]:
input_ids = torch.arange(5).unsqueeze(0)
labels = torch.arange(5, 12).unsqueeze(0)
attention_mask = torch.ones(5)
attention_mask[[0, 2, 4]] = 0
attention_mask = attention_mask.unsqueeze(0)

In [41]:
out = model.generator.forward(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
    #output_hidden_states=True,
)

In [42]:
out.loss

tensor(105.0325, grad_fn=<NllLossBackward0>)

In [39]:
out.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'decoder_hidden_states', 'encoder_last_hidden_state', 'encoder_hidden_states'])

In [44]:
out = model.generator.forward(
    encoder_outputs=(enc_out,),
    attention_mask=attention_mask,
    labels=labels
)

In [45]:
out.loss

tensor(105.0325, grad_fn=<NllLossBackward0>)