# The datasets

This is a quick look at the datasets generated in this project.

In [1]:
from pathlib import Path

import pandas as pd

# Root folder for every input file read and the parquet file written by this notebook.
DATA_DIRECTORY = Path("./data/")  # change as necessary

# Combine data

1. Load cleaned training data
2. Load raw tactics and tactic states data
3. Join them to get, for each tactic, the states before and after it together with the train/val/test split

## 1. Load cleaned training data

In [2]:
# Cleaned training table; it is merged into the raw tactics table further down
# on (filename, line, column, proof_key).
data_and_metadata = pd.read_csv(
    DATA_DIRECTORY.joinpath("cleaned_training_data", "data_and_metadata.csv")
)

## 2. Load raw tactics and tactic states data

In [3]:
# Traced data: both files are JSON Lines (one record per line).
raw_traced_dir = DATA_DIRECTORY / "raw_traced_data"
tactic_state_goal_raw = pd.read_json(
    raw_traced_dir / "tactic_state_goal.jsonl", orient="records", lines=True
)
tactic_state_raw = pd.read_json(
    raw_traced_dir / "tactic_state.jsonl", orient="records", lines=True
)

# Extracted tactics, inner-joined with the cleaned training table so that only
# tactics present in both sources are kept.
tactics_raw = pd.read_json(
    DATA_DIRECTORY / "extracted_proof_data" / "tactics.jsonl",
    orient="records",
    lines=True,
).merge(data_and_metadata, on=["filename", "line", "column", "proof_key"])

## 3. Join data

Here we combine the above data sources to make a table containing:
* First goals in the goal stack before and after the tactic is executed
* The human written tactic command
* Metadata about the type of tactic, the declaration and the train/val/test split

In [4]:
# Keep only the first goal (ix == 0) of each traced tactic state.
# Filtering with `.loc` and adding the key with `.assign` always builds a new
# frame, so no column is written into a filtered slice of the raw table (the
# pattern that can trigger pandas' SettingWithCopyWarning) and the raw table
# does not need to be copied in full beforehand.
first_goal_rows = tactic_state_goal_raw.loc[tactic_state_goal_raw['ix'] == 0]

# Index the goals by "<filename>:<tactic_state>" so they can be joined with the
# tactic state table, which builds its key the same way.
goals = (
    first_goal_rows
    .assign(
        tactic_state_key=first_goal_rows['filename'] + ":" + first_goal_rows['tactic_state']
    )
    .loc[:, ['tactic_state_key', 'goal_pp']]
    .set_index('tactic_state_key')
)

#### Add `tactic_key` from `tactic_state` table

In [5]:
# Build the lookup from a tactic state to the tactic (instance) it belongs to.
# Both keys are prefixed with the file name.
file_prefix = tactic_state_raw['filename'] + ":"
states = tactic_state_raw.assign(
    tactic_state_key=file_prefix + tactic_state_raw['key'],
    tactic_instance_key=file_prefix + tactic_state_raw['tactic_instance'],
)
# The tactic key is the instance key with its last ":"-separated field removed.
states['tactic_key'] = states['tactic_instance_key'].apply(
    lambda instance_key: instance_key.rpartition(":")[0]
)
state_columns = ['tactic_key', 'tactic_instance_key', 'before_after', 'tactic_instance']
states = states.set_index('tactic_state_key')[state_columns]

# Both tables are expected to have the same number of rows.
print(len(goals), len(states))
# Attach the state information to every goal, then re-index by tactic.
goals = goals.join(states).set_index('tactic_key')
goals.shape

481088 481088


(481088, 4)

#### Join rows by `tactic_key` and `tactic_instance`

In [6]:
def join_before_after(group):
    """Collapse the 'before' and 'after' rows of one group into a single row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group. Must have a ``before_after`` column
        holding ``'before'`` or ``'after'`` and a ``goal_pp`` column.

    Returns
    -------
    pandas.DataFrame or None
        A copy of the ``'before'`` row with two extra columns,
        ``goal_before`` and ``goal_after``. ``None`` when the group has fewer
        than two rows, i.e. there is nothing to pair.

    Raises
    ------
    ValueError
        If the group has more than two rows, or if its two rows are not
        exactly one ``'before'`` and one ``'after'``.
    """
    if len(group) < 2:
        return None
    if len(group) > 2:
        raise ValueError(
            f"expected at most 2 rows per group, got {len(group)}"
        )

    before = group[group['before_after'] == 'before'].copy()
    after = group[group['before_after'] == 'after']
    if len(before) != 1 or len(after) != 1:
        raise ValueError(
            "expected exactly one 'before' and one 'after' row, got "
            f"{len(before)} 'before' and {len(after)} 'after'"
        )

    before['goal_before'] = before['goal_pp']
    # Assign by position, not by index label: assigning the Series directly
    # would align on the index and silently yield NaN whenever the two rows do
    # not share the same label.
    before['goal_after'] = after['goal_pp'].to_numpy()
    return before

In [7]:
import tqdm
from joblib import Parallel, delayed


# Pair the before/after rows of every (tactic_key, tactic_instance) group,
# spreading the groups over all available cores (n_jobs=-1).
grouped_goals = goals.groupby(['tactic_key', 'tactic_instance'])
pairing_jobs = (
    delayed(join_before_after)(rows) for _key, rows in tqdm.tqdm(grouped_goals)
)
result = Parallel(n_jobs=-1)(pairing_jobs)

# pd.concat silently skips the None entries returned for unpaired groups.
# The per-row goal and before/after marker are replaced by the
# goal_before / goal_after columns and are no longer needed.
goals = pd.concat(result).drop(columns=['goal_pp', 'before_after'])

100%|██████████| 240974/240974 [02:13<00:00, 1801.91it/s]


In [8]:
# Index every tactic by "<filename>:<trace_key>", the key used to match it
# against the index of `goals`.
tactics = tactics_raw.assign(
    tactic_key=tactics_raw['filename'] + ":" + tactics_raw['trace_key']
).set_index('tactic_key')

In [9]:
# (shape of the tactics index, number of distinct tactic keys,
#  number of distinct tactic keys that also occur in the index of `goals`)
unique_tactic_keys = set(tactics.index)
tactics.index.shape, len(unique_tactic_keys), len(unique_tactic_keys.intersection(goals.index))

((240484,), 226835, 226819)

In [10]:
# Match each before/after goal pair with the tactic that has the same
# tactic_key and whose goal_pp equals the goal before the tactic ran.
# The inner join drops rows without a match on either side.
joined = goals.merge(
    tactics,
    how='inner',
    left_on=['tactic_key', 'goal_before'],
    right_on=['tactic_key', 'goal_pp'],
)
joined.shape

(240404, 27)

In [11]:
# Columns that are not kept in the exported table.
columns_to_drop = [
    'key', 'end_line', 'end_column',
    'code_string', 'index',
    'Unnamed: 0',
    'goal_pp', 'cleaned_goal',
    'filename',
]
# Index the final table by tactic instance and remove the unwanted columns.
_joined = joined.set_index('tactic_instance_key').drop(columns=columns_to_drop)

In [13]:
# Write the combined table next to the other data files.
output_path = DATA_DIRECTORY / 'goals_before_after.parquet'
_joined.to_parquet(output_path, engine='pyarrow')