# Convert dataset to Parquet 

In [1]:
import os
import json
import pandas as pd
import argparse
from pathlib import Path
from tqdm import tqdm

def convert_json_to_parquet(input_dir, output_dir):
    """
    Recursively find all JSON files in input_dir and convert them to Parquet,
    maintaining the same directory structure in output_dir.

    Args:
        input_dir: Directory searched recursively for ``*.json`` files.
        output_dir: Root directory for the Parquet files; created if missing.
            Each ``<sub/path>/<name>.json`` is written to
            ``<output_dir>/<sub/path>/<name>.parquet``.

    A JSON document whose top level is a list is handed to ``pd.DataFrame``
    unchanged; any other top-level value is wrapped in a one-element list.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Sort so the conversion order (and the log) is the same on every run;
    # glob order otherwise depends on the file system.
    json_files = sorted(Path(input_dir).glob('**/*.json'))

    print(f"Found {len(json_files)} JSON files to convert")

    for json_path in tqdm(json_files):
        # Relative path under input_dir, reused under output_dir with a new suffix
        rel_path = json_path.relative_to(input_dir)
        output_path = Path(output_dir) / rel_path.with_suffix('.parquet')

        # Create the parent directories if they don't exist
        os.makedirs(output_path.parent, exist_ok=True)

        # JSON text is UTF-8 by specification; do not rely on the platform's
        # default locale encoding when decoding it.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # A list is used as the rows directly; a single record is wrapped in a list
        records = data if isinstance(data, list) else [data]
        df = pd.DataFrame(records)

        # Write to Parquet
        df.to_parquet(output_path, index=False)

        # tqdm.write keeps the progress bar intact, unlike a bare print()
        tqdm.write(f"Converted {json_path} to {output_path}")

if __name__ == "__main__":
    # Convert the whole raw dataset tree in a single pass.
    convert_json_to_parquet(
        input_dir="before_arc_datasets/",
        output_dir="before-arc-parquet/",
    )
    # Alternative kept for reference (in-place conversion of one sub-tree):
    # convert_json_to_parquet("before-arc-parquet/compositionality/", "before-arc-parquet/compositionality/")
    print("Conversion complete!")

Found 4 JSON files to convert


 50%|█████     | 2/4 [00:00<00:00, 15.66it/s]

Converted before_arc_datasets/compositionality/exp_setting_2/experiment_4/test_ood.json to before-arc-parquet/compositionality/exp_setting_2/experiment_4/test_ood.parquet
Converted before_arc_datasets/compositionality/exp_setting_2/experiment_4/val_ood.json to before-arc-parquet/compositionality/exp_setting_2/experiment_4/val_ood.parquet
Converted before_arc_datasets/compositionality/exp_setting_1/experiment_4/test_ood.json to before-arc-parquet/compositionality/exp_setting_1/experiment_4/test_ood.parquet


100%|██████████| 4/4 [00:00<00:00, 19.93it/s]

Converted before_arc_datasets/compositionality/exp_setting_1/experiment_4/val_ood.json to before-arc-parquet/compositionality/exp_setting_1/experiment_4/val_ood.parquet
Conversion complete!





# Upload dataset to Hugging Face

In [2]:
from huggingface_hub import HfApi
import os
# The access token is taken from the HF_TOKEN environment variable so that it
# never has to be written into the notebook.
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Push the local Parquet tree to the dataset repository as one commit.
api.upload_folder(
    repo_id="taratataw/before-arc",
    repo_type="dataset",
    folder_path="before-arc-parquet/",
    ignore_patterns=["*.db"],
    commit_message="Upload new version of experiments",
)

  from .autonotebook import tqdm as notebook_tqdm
test_ood.parquet:   0%|          | 0.00/89.4k [00:00<?, ?B/s]
[A

[A[A


test_ood.parquet:  18%|█▊        | 16.4k/89.4k [00:00<00:00, 79.1kB/s]


[A[A[A
val_ood.parquet: 100%|██████████| 89.6k/89.6k [00:00<00:00, 102kB/s] 
val_ood.parquet: 100%|██████████| 89.2k/89.2k [00:00<00:00, 99.7kB/s]
test_ood.parquet: 100%|██████████| 89.1k/89.1k [00:00<00:00, 95.7kB/s]
test_ood.parquet: 100%|██████████| 89.4k/89.4k [00:00<00:00, 92.9kB/s]


Upload 4 LFS files: 100%|██████████| 4/4 [00:01<00:00,  3.19it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/taratataw/before-arc/commit/7432e48cffc9f5de28b5caa3894adcc091f92d1b', commit_message='Upload new version of experiments', commit_description='', oid='7432e48cffc9f5de28b5caa3894adcc091f92d1b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/taratataw/before-arc', endpoint='https://huggingface.co', repo_type='dataset', repo_id='taratataw/before-arc'), pr_revision=None, pr_num=None)

# Verify Croissant Metadata

In [9]:
import requests
# headers = {"Authorization": f"Bearer {API_TOKEN}"}
# NOTE(review): this URL targets "yassinetb/cogitao", while the upload cell of
# this notebook pushes to "taratataw/before-arc" - confirm this is intended.
API_URL = "https://huggingface.co/api/datasets/yassinetb/cogitao/croissant"


def query(timeout=30):
    """Fetch the document served at ``API_URL`` and return its decoded JSON body.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error payload is never mistaken for valid metadata.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(API_URL, timeout=timeout)
    response.raise_for_status()
    return response.json()


data = query()
print(data)

{'@context': {'@language': 'en', '@vocab': 'https://schema.org/', 'arrayShape': 'cr:arrayShape', 'citeAs': 'cr:citeAs', 'column': 'cr:column', 'conformsTo': 'dct:conformsTo', 'cr': 'http://mlcommons.org/croissant/', 'data': {'@id': 'cr:data', '@type': '@json'}, 'dataBiases': 'cr:dataBiases', 'dataCollection': 'cr:dataCollection', 'dataType': {'@id': 'cr:dataType', '@type': '@vocab'}, 'dct': 'http://purl.org/dc/terms/', 'extract': 'cr:extract', 'field': 'cr:field', 'fileProperty': 'cr:fileProperty', 'fileObject': 'cr:fileObject', 'fileSet': 'cr:fileSet', 'format': 'cr:format', 'includes': 'cr:includes', 'isArray': 'cr:isArray', 'isLiveDataset': 'cr:isLiveDataset', 'jsonPath': 'cr:jsonPath', 'key': 'cr:key', 'md5': 'cr:md5', 'parentField': 'cr:parentField', 'path': 'cr:path', 'personalSensitiveInformation': 'cr:personalSensitiveInformation', 'recordSet': 'cr:recordSet', 'references': 'cr:references', 'regex': 'cr:regex', 'repeated': 'cr:repeated', 'replace': 'cr:replace', 'sc': 'https://

In [11]:
# Save the data to a JSON file
import json
# Serialise the metadata to an indented JSON string and write it in one call.
with open("cogitao.croissant.json", mode="w") as f:
    f.write(json.dumps(data, indent=4))

# Verify All Is Well with The Data

In [None]:
from datasets import load_dataset
# Load a single Parquet file from the hub under the split name "data".
dataset = load_dataset(
    "taratataw/before-arc",
    data_files={
        "data": "generalization/exp_setting_1/experiment_1/test.parquet",
    },
)
# Show which fields the first example exposes.
print(dataset["data"][0].keys())

  from .autonotebook import tqdm as notebook_tqdm
Generating data split: 1001 examples [00:00, 77636.39 examples/s]


# DEPRECATED - Utils to move things around

In [2]:
# Original request: rename every .parquet file inside the before-arc-parquet folder so that "train_val" is renamed to "val" and "test_val" is renamed to "val_ood".
# NOTE: the code below currently does something different - it replaces "test" with "test_ood" in each file name.

import os
from pathlib import Path

def rename_parquet_files(input_dir):
    """
    Rename .parquet files under input_dir (recursively) so that "test" in the
    file name becomes "test_ood" (e.g. "test.parquet" -> "test_ood.parquet").

    Args:
        input_dir: Root directory searched recursively for ``*.parquet`` files.

    The operation is safe to repeat: a name that already contains "test_ood"
    is left alone, and a file is never renamed onto an existing file.
    """
    parquet_files = list(Path(input_dir).rglob("*.parquet"))
    print(f"Found {len(parquet_files)} .parquet files to rename.")

    for file_path in parquet_files:
        # Collapse an existing "test_ood" back to "test" before substituting, so
        # an already-renamed file maps to its own name instead of "test_ood_ood".
        new_name = file_path.name.replace("test_ood", "test").replace("test", "test_ood")
        if new_name == file_path.name:
            continue

        new_path = file_path.with_name(new_name)

        # Path.rename can silently replace an existing destination; refuse
        # rather than destroy data.
        if new_path.exists():
            print(f"Skipped: {file_path} -> {new_path} (target already exists)")
            continue

        file_path.rename(new_path)
        print(f"Renamed: {file_path} -> {new_path}")

# Apply the renaming to the whole converted dataset tree.
rename_parquet_files(input_dir="before-arc-parquet/")

Found 133 .parquet files to rename.
Renamed: before-arc-parquet/compositionality/exp_setting_1/experiment_3/test.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_3/test_ood.parquet
Renamed: before-arc-parquet/compositionality/exp_setting_1/experiment_4/test.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_4/test_ood.parquet
Renamed: before-arc-parquet/compositionality/exp_setting_1/experiment_2/test.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_2/test_ood.parquet
Renamed: before-arc-parquet/compositionality/exp_setting_1/experiment_1/test.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_1/test_ood.parquet
Renamed: before-arc-parquet/compositionality/exp_setting_4/experiment_1/test.parquet -> before-arc-parquet/compositionality/exp_setting_4/experiment_1/test_ood.parquet
Renamed: before-arc-parquet/compositionality/exp_setting_2/experiment_3/test.parquet -> before-arc-parquet/compositionality/

In [None]:
import os
from pathlib import Path
import shutil

def move_parquet_files(source_dir, target_dir, file_name):
    """
    Relocate every file called ``file_name`` found anywhere below ``source_dir``
    into ``target_dir``, keeping each file's path relative to ``source_dir``.
    """
    matches = [found for found in Path(source_dir).rglob(f"**/{file_name}")]
    print(f"Found {len(matches)} files named '{file_name}' to move.")

    for origin in matches:
        # Same sub-path as in the source tree, rooted at the target directory
        destination = Path(target_dir).joinpath(origin.relative_to(source_dir))

        # The mirrored folder may not exist yet on the target side
        os.makedirs(destination.parent, exist_ok=True)

        shutil.move(str(origin), str(destination))
        print(f"Moved: {origin} -> {destination}")

# Generalization experiments: relocate their train_iid.parquet files
move_parquet_files(
    source_dir="before_arc_datasets/generalization",
    target_dir="before-arc-parquet/generalization",
    file_name="train_iid.parquet",
)

# Compositionality experiments: same operation on the sibling folder
move_parquet_files(
    source_dir="before_arc_datasets/compositionality",
    target_dir="before-arc-parquet/compositionality",
    file_name="train_iid.parquet",
)

In [10]:
import os
from pathlib import Path

def delete_non_train_parquet_files(directory, patterns=("train_val", "test_val")):
    """
    Delete every .parquet file under ``directory`` (and its subdirectories)
    whose file name contains at least one of ``patterns``.

    Despite the function's name, files are NOT selected by the absence of
    "train": with the default patterns only names containing "train_val" or
    "test_val" are removed, and every other .parquet file is kept.

    Args:
        directory: Root directory searched recursively for ``*.parquet`` files.
        patterns: Substrings that mark a file name for deletion.
    """
    parquet_files = list(Path(directory).rglob("*.parquet"))
    print(f"Found {len(parquet_files)} .parquet files to check.")

    for file_path in parquet_files:
        # Every inspected path is echoed, whether or not it gets deleted
        print(file_path)
        if any(pattern in file_path.name for pattern in patterns):
            file_path.unlink()  # Delete the file
            print(f"Deleted: {file_path}")

# Run the clean-up over the whole converted dataset tree (destructive).
delete_non_train_parquet_files(directory="before-arc-parquet/")

Found 165 .parquet files to check.
before-arc-parquet/compositionality/exp_setting_1/experiment_3/test.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_3/val_ood.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_3/val.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_3/train.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_3/test_ood.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_4/test.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_4/val_ood.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_4/val.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_4/train.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_4/test_ood.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_2/test.parquet
before-arc-parquet/compositionality/exp_setting_1/experiment_2/val_ood.parquet
before-arc-parquet/compositionality/

In [4]:
import os
from pathlib import Path
import shutil

def move_all_parquet_files(source_dir, target_dir):
    """
    Move all .parquet files from source_dir to the corresponding subdirectories
    in target_dir.

    Args:
        source_dir: Root directory searched recursively for ``*.parquet`` files.
        target_dir: Root directory that receives the files; each file keeps its
            path relative to ``source_dir``.

    Destination subdirectories that do not exist yet are created, so the call
    no longer fails when the target tree is incomplete.
    """
    parquet_files = list(Path(source_dir).rglob("*.parquet"))
    print(f"Found {len(parquet_files)} .parquet files to move.")

    for file_path in parquet_files:
        # Calculate the relative path to maintain directory structure
        rel_path = file_path.relative_to(source_dir)
        target_path = Path(target_dir) / rel_path

        # shutil.move raises FileNotFoundError if the destination folder is
        # missing, so create it first.
        target_path.parent.mkdir(parents=True, exist_ok=True)

        # Move the file
        shutil.move(str(file_path), str(target_path))
        print(f"Moved: {file_path} -> {target_path}")

# Move everything from the raw dataset tree into the Parquet tree.
move_all_parquet_files(source_dir="before_arc_datasets", target_dir="before-arc-parquet")


Found 132 .parquet files to move.
Moved: before_arc_datasets/compositionality/exp_setting_1/experiment_3/test.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_3/test.parquet
Moved: before_arc_datasets/compositionality/exp_setting_1/experiment_3/val_ood.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_3/val_ood.parquet
Moved: before_arc_datasets/compositionality/exp_setting_1/experiment_3/val.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_3/val.parquet
Moved: before_arc_datasets/compositionality/exp_setting_1/experiment_3/test_ood.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_3/test_ood.parquet
Moved: before_arc_datasets/compositionality/exp_setting_1/experiment_4/test.parquet -> before-arc-parquet/compositionality/exp_setting_1/experiment_4/test.parquet
Moved: before_arc_datasets/compositionality/exp_setting_1/experiment_4/val_ood.parquet -> before-arc-parquet/compositionality/exp_setting_1

In [6]:
import os
from pathlib import Path

def delete_json_files(directory):
    """
    Remove each .json file located anywhere beneath ``directory``.
    """
    # Collect the matches up front so the tree is not modified while it is walked
    doomed = [candidate for candidate in Path(directory).rglob("*.json")]
    print(f"Found {len(doomed)} .json files to delete.")

    for target in doomed:
        target.unlink()
        print(f"Deleted: {target}")

# Remove leftover JSON files from the Parquet tree (destructive).
delete_json_files(directory="before-arc-parquet/")

Found 4 .json files to delete.
Deleted: before-arc-parquet/compositionality/exp_setting_4/experiment_1/train.json
Deleted: before-arc-parquet/compositionality/exp_setting_4/experiment_1/test_val.json
Deleted: before-arc-parquet/compositionality/exp_setting_4/experiment_1/train_val.json
Deleted: before-arc-parquet/compositionality/exp_setting_4/experiment_1/test.json
