# `init_state -> actions -> final_state`

Gather a dataset of triplets `(initial state, actions, final state)` for each pair of initial and final states

In [1]:
from pathlib import Path

import pandas as pd

# Directory that the input parquet file is read from and the resulting
# triplets parquet file is written to.
DATA_DIRECTORY = Path("./data/")  # change as necessary

## 1. Load tactics with states before and after

In [2]:
# Load the tactic table from parquet and drop fully identical rows.
goals = (
    pd.read_parquet(DATA_DIRECTORY / "goals_before_after.parquet")
    .drop_duplicates()
)

## 2. Group tactics by `parent_key`

### 2.1. Build triplets

1. Group tactics by `parent_key`.
2. In each group do the following:  
    2.1. Sort by `tactic_instance` (treating it as a vector of ints).  
    2.2. For each run of consecutive elements with `goal_before[i] == goal_after[i - 1]`, construct all triplets.

In [3]:
def _triplets(gg):
    """Build all multi-step triplets from one chain of consecutive tactics.

    `gg` is expected to hold rows that are already ordered and chained (the
    caller passes runs where `goal_after` of a row equals `goal_before` of
    the next one). For every pair of positions `i < j` one row is produced:

    * `goal_before` -- `goal_before` of row `i`;
    * `tactic`      -- `human_tactic_code` of rows `i..j` joined with
      `' [PROOFSTEP] '`;
    * `goal_after`  -- `goal_after` of row `j`;
    * `decl_name`, `split` -- taken from row `i`.

    Single-step triplets (`i == j`) are NOT produced here.

    Returns a DataFrame with columns
    `['goal_before', 'tactic', 'goal_after', 'decl_name', 'split']`; it is
    empty when `gg` has fewer than two rows.
    """
    # Extract the column arrays once instead of on every loop iteration.
    goals_before = gg['goal_before'].values
    goals_after = gg['goal_after'].values
    tactic_codes = gg['human_tactic_code'].values
    decl_names = gg['decl_name'].values
    splits = gg['split'].values

    n_rows = len(gg)
    rows = [
        (
            goals_before[first],
            ' [PROOFSTEP] '.join(tactic_codes[first:last + 1]),
            goals_after[last],
            decl_names[first],
            splits[first],
        )
        for first in range(n_rows)
        for last in range(first + 1, n_rows)
    ]
    return pd.DataFrame(
        rows, columns=['goal_before', 'tactic', 'goal_after', 'decl_name', 'split']
    )


def get_triplets(group, n_key_parts=4):
    """Build single- and multi-step triplets for one group of tactics.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single `parent_key` group. Must contain the columns
        `tactic_instance`, `goal_before`, `human_tactic_code`, `goal_after`,
        `decl_name` and `split`. The frame is NOT modified.
    n_key_parts : int, default 4
        Number of leading `':'`-separated integer components of
        `tactic_instance` used as the sort key.

    Returns
    -------
    pd.DataFrame
        Columns `['goal_before', 'tactic', 'goal_after', 'decl_name', 'split']`
        with a fresh `RangeIndex`. The single-step rows come first (one per
        input row, in sorted order), followed by the multi-step rows of every
        run of consecutive tactics whose states chain together.

    Raises
    ------
    IndexError
        If a `tactic_instance` has fewer than `n_key_parts` components.
    ValueError
        If one of the used components is not an integer.
    """
    def sort_key(tactic_instance):
        parts = tactic_instance.split(':')
        # Index explicitly (rather than slicing) so that an identifier with
        # too few components raises instead of being silently mis-sorted.
        return tuple(int(parts[k]) for k in range(n_key_parts))

    # 1. Sort by tactic_instance, compared as a vector of ints.  The order is
    #    computed on the side so no helper columns are written into `group`;
    #    `sorted` is stable, so ties keep their original relative order.
    keys = [sort_key(instance) for instance in group['tactic_instance']]
    order = sorted(range(len(keys)), key=keys.__getitem__)
    ordered = group.iloc[order]

    # 2. Every row on its own is a single-step triplet.
    frames = [
        ordered[['goal_before', 'human_tactic_code', 'goal_after', 'decl_name', 'split']]
        .rename(columns={'human_tactic_code': 'tactic'})
    ]

    # 3. Split the sorted rows into maximal runs in which the state after one
    #    tactic is the state before the next, and expand each run.
    goals_before = ordered['goal_before'].values
    goals_after = ordered['goal_after'].values
    n_rows = len(ordered)
    start = 0
    while start < n_rows:
        stop = start + 1
        while stop < n_rows and goals_after[stop - 1] == goals_before[stop]:
            stop += 1
        # A run of length one yields no multi-step triplets, so skip it
        # rather than building and concatenating an empty frame.
        if stop - start > 1:
            frames.append(_triplets(ordered.iloc[start:stop]))
        start = stop

    return pd.concat(frames, ignore_index=True)

In [4]:
import tqdm
from joblib import Parallel, delayed


# Build the triplets of every `parent_key` group in parallel, using all
# available workers (`n_jobs=-1`); the result is a list with one frame per group.
grouped_goals = goals.groupby('parent_key')
triplets = Parallel(n_jobs=-1)(
    delayed(get_triplets)(group)
    for _, group in tqdm.tqdm(grouped_goals)
)

100%|██████████| 98627/98627 [03:04<00:00, 534.23it/s]


In [5]:
# Merge the per-group frames into one table with a fresh index.
# NOTE(review): this rebinds `triplets` from the list produced by `Parallel`
# to a DataFrame, so re-running this cell on its own will presumably fail
# (`pd.concat` expects a collection of frames) -- re-run the previous cell first.
triplets = pd.concat(triplets, ignore_index=True)
print(triplets.shape)
triplets.head()

(613721, 5)


Unnamed: 0,goal_before,tactic,goal_after,decl_name,split
0,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b; unfold read read'; simp [array.read_e...,,buffer.read_eq_read',test
1,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b,"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",buffer.read_eq_read',test
2,"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",unfold read read',"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",buffer.read_eq_read',test
3,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b [PROOFSTEP] unfold read read',"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",buffer.read_eq_read',test
4,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b; unfold read read',"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",buffer.read_eq_read',test


In [6]:
# Persist the triplets next to the input data (row index is not stored).
triplets.to_parquet(
    DATA_DIRECTORY / 'triplets_by_steps.parquet',
    engine='pyarrow',
    index=False,
)

In [7]:
# Collect the number of rows of each `parent_key` group whose first row has
# `parent_type == 'proof'`.
# NOTE(review): the loop stops as soon as a group with exactly 10 rows has
# been recorded, so `proof_length` only covers the groups visited up to that
# point -- confirm whether this early exit is intentional or a debugging
# leftover before relying on the statistics computed from it.
proof_length = []
for name, group in goals.groupby('parent_key'):
    if group.parent_type.iloc[0] == 'proof':
        proof_length.append(len(group))
        if proof_length[-1] == 10:
            break

In [None]:
# Recompute the triplets of a single declaration for manual inspection.
decl_goals = goals[goals.decl_name == 'int.neg_succ_of_nat_mul_sub_nat_nat'].copy()
tr = pd.concat(
    [get_triplets(gg) for _, gg in decl_goals.groupby('parent_key')]
)
tr

In [None]:
# The same declaration, taken from the triplets table built above.
is_inspected_decl = triplets.decl_name == 'int.neg_succ_of_nat_mul_sub_nat_nat'
df = triplets[is_inspected_decl]
df

In [None]:
from matplotlib import pyplot as plt

# Histogram of the collected group sizes, drawn on an explicit axes object.
fig, ax = plt.subplots()
ax.hist(proof_length, bins=40);

In [None]:
# Summary statistics of the collected group sizes.
# NOTE: rebinds `proof_length` (a list when built by the loop earlier in the
# notebook) to a one-column DataFrame.
proof_length = pd.DataFrame(proof_length)
proof_length.describe()