In [23]:
# Get Hugging Face file names

from huggingface_hub import HfApi
import os
import requests

# Hugging Face dataset repository that is listed and downloaded from.
repo_id = "Open-Orca/FLAN"
# Folder inside the repository whose files are of interest.
subfolder = "flan_fsnoopt_data"
# Local directory handed to the download call as `local_dir`.
local_base_path = "datasets"

# Single Hub client shared by the listing and download helpers in later cells.
api = HfApi()

def get_subfolder_files(repo_id, subfolder):
    """List the files of a dataset repository that sit under ``subfolder``.

    Uses the module-level ``api`` client to fetch every file path of the
    dataset repository ``repo_id`` and keeps the paths that begin with
    ``"<subfolder>/"``. The order returned by the client is preserved.
    """
    matching_paths = []
    for path in api.list_repo_files(repo_id, repo_type="dataset"):
        if path.startswith(f"{subfolder}/"):
            matching_paths.append(path)
    return matching_paths

subfolder_files = get_subfolder_files(repo_id, subfolder)
len(subfolder_files)

165

In [24]:
# Evenly spaced indices for download
import numpy as np

# Ten evenly spaced positions between 0 and the last listing index, cast to integers.
indices = np.linspace(0, len(subfolder_files)-1, 10, dtype=int)

# Everything before the first "." of the first listed path; each selected file name is
# rebuilt as "<prefix>.<index>.parquet" rather than taken from the listing itself.
# NOTE(review): this assumes the parts are numbered contiguously from 0 and that the
# path contains no earlier "." — confirm against the repository listing.
subfolder_file_prefix = subfolder_files[0].split(".")[0]
selected_subfolder_files = [f"{subfolder_file_prefix}.{i}.parquet" for i in indices] 
selected_subfolder_files

['flan_fsnoopt_data/part.0.parquet',
 'flan_fsnoopt_data/part.18.parquet',
 'flan_fsnoopt_data/part.36.parquet',
 'flan_fsnoopt_data/part.54.parquet',
 'flan_fsnoopt_data/part.72.parquet',
 'flan_fsnoopt_data/part.91.parquet',
 'flan_fsnoopt_data/part.109.parquet',
 'flan_fsnoopt_data/part.127.parquet',
 'flan_fsnoopt_data/part.145.parquet',
 'flan_fsnoopt_data/part.164.parquet']

In [25]:
# Download parquet files

import concurrent.futures

def download_file(api, repo_id, file, local_base_path):
    """Download one file of a dataset repository into ``local_base_path``.

    Args:
        api: Client object exposing ``hf_hub_download`` (an ``HfApi`` instance
            in this notebook).
        repo_id: Identifier of the dataset repository.
        file: Path of the file inside the repository.
        local_base_path: Directory passed to the client as ``local_dir``.

    Returns:
        The value returned by ``api.hf_hub_download``, so callers can see
        where the file was stored. Previously the result was discarded and
        the function always returned ``None``.
    """
    return api.hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=file,
        local_dir=local_base_path,
    )

# Download the selected files on a small thread pool (at most 5 at a time).
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(download_file, api, repo_id, file, local_base_path)
               for file in selected_subfolder_files]

    # `concurrent.futures.wait` only blocks; it never re-raises what a worker
    # raised, so a failed download would go unnoticed. `Future.result()` both
    # waits for the task and surfaces its exception.
    failed_downloads = []
    for file, future in zip(selected_subfolder_files, futures):
        try:
            future.result()
        except Exception as exc:
            failed_downloads.append((file, exc))

# Report every failure at once instead of stopping at the first one.
if failed_downloads:
    details = "; ".join(f"{file}: {exc!r}" for file, exc in failed_downloads)
    raise RuntimeError(f"{len(failed_downloads)} download(s) failed: {details}")

part.54.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.18.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.0.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.36.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.72.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.91.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.109.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.127.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.145.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

part.164.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

In [26]:
# Select the dataset
# Name of the local dataset folder to analyse.
dataset_name = "flan_fsnoopt_data"
# Folder expected to hold that dataset's parquet files.
folder_path = os.path.join(local_base_path, dataset_name)
# Root folder under which the per-file JSON statistics are written.
stats_folder_name = "stats_by_dataset"
# Text before the first "_" of the dataset name; used to group stats and distribution outputs.
dataset_prefix = dataset_name.split("_")[0]

In [27]:
# Check all files are downloaded
def get_parquet_files_by_folder(folder_path):
    """Return the names of the regular ``.parquet`` files directly in ``folder_path``.

    Only the entry names are returned (no folder component), in the order
    produced by ``os.listdir``. Entries that are not regular files are
    skipped even when their name ends with ``.parquet``.
    """
    parquet_names = []
    for entry in os.listdir(folder_path):
        if not entry.endswith('.parquet'):
            continue
        if os.path.isfile(os.path.join(folder_path, entry)):
            parquet_names.append(entry)
    return parquet_names

parquet_files = get_parquet_files_by_folder(folder_path)
print(f"Total files '{folder_path}': {len(parquet_files)}")

# A matching count alone does not show that the *selected* files are present
# (stale or partial downloads can produce the same number), so compare the
# file names that were requested against what is actually on disk.
expected_files = [os.path.basename(file) for file in selected_subfolder_files]
missing_files = [file for file in expected_files if file not in parquet_files]
if missing_files:
    print(f"Missing files in '{folder_path}': {missing_files}")


Total files 'datasets/flan_fsnoopt_data': 10


In [28]:
# Save Stats
import os
import pandas as pd
import json
import numpy as np


def append_columns_to_dict(file_path, columns):
    """Count how often each value occurs in selected columns of a parquet file.

    Only ``columns`` are read from ``file_path``. The result maps every
    column name to a ``{value: count}`` dictionary obtained from that
    column's ``value_counts()`` (called with its default arguments).
    """
    frame = pd.read_parquet(file_path, columns=columns)
    return {name: frame[name].value_counts().to_dict() for name in columns}

def save_dict_to_json(data_dict, json_file_path):
    """Write ``data_dict`` to ``json_file_path`` as indented JSON.

    Missing parent directories are created first. An existing file at the
    target path is overwritten.

    Args:
        data_dict: JSON-serialisable object to store.
        json_file_path: Destination path; may be a bare file name.
    """
    parent_dir = os.path.dirname(json_file_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(json_file_path, 'w') as json_file:
        json.dump(data_dict, json_file, indent=4)

# Columns whose value counts are recorded for every sampled parquet file.
columns = ["_task_name", "_template_type", "_task_source", "_template_idx"]

if not parquet_files:
    raise FileNotFoundError(f"No parquet files found in '{folder_path}'")

# File-name prefix shared by the downloaded parts (text before the first ".").
subfolder_file_prefix = parquet_files[0].split(".")[0]

for i in indices:
    file = f"{subfolder_file_prefix}.{i}.parquet"
    print(file)
    # Read from the folder of the *selected* dataset. The path used to be
    # hard-coded to "datasets/flan_fsopt_data/", which is a different dataset
    # from the one named in `dataset_name`, so the saved stats were mislabelled.
    data_dict = append_columns_to_dict(os.path.join(folder_path, file), columns)
    json_file_path = f"{stats_folder_name}/{dataset_prefix}/{dataset_name}-{i}.json"
    save_dict_to_json(data_dict, json_file_path)

part.0.parquet
part.18.parquet
part.36.parquet
part.54.parquet


part.72.parquet
part.91.parquet
part.109.parquet
part.127.parquet
part.145.parquet
part.164.parquet


In [29]:
# Generate Task Distribution Sheet
import os
import json
import csv

# Folder that receives the task-distribution CSV files.
distribution_folder = "task_distributions"

# Create the output folder up front; a no-op when it already exists.
os.makedirs(distribution_folder, exist_ok=True)
# One CSV per dataset prefix, named "<prefix>.csv".
distribution_file_path = f"{distribution_folder}/{dataset_prefix}.csv"


def get_json_files_by_folder(folder_path):
    """Return the names of the regular ``.json`` files directly in ``folder_path``.

    Names come back without the folder component and in ``os.listdir``
    order; anything that is not a regular file is ignored.
    """
    def _is_json_file(entry):
        # Check the suffix first so non-matching names never touch the file system.
        return entry.endswith('.json') and os.path.isfile(os.path.join(folder_path, entry))

    return list(filter(_is_json_file, os.listdir(folder_path)))

json_files = get_json_files_by_folder(f"{stats_folder_name}/{dataset_prefix}")


def _stats_file_sort_key(file_name):
    """Sort key for "<prefix>-<index>.json" names: prefix first, then numeric index."""
    stem = file_name.rsplit(".", 1)[0]
    prefix, _, index = stem.rpartition("-")
    if prefix and index.isdigit():
        return (prefix, int(index))
    # Names that do not follow the pattern sort by their stem, ahead of indexed files.
    return (stem, -1)


# Sort files by prefix, then by numeric index. The previous key used only the
# prefix, which left files of one dataset in arbitrary os.listdir order and
# made the CSV column order unpredictable.
json_files.sort(key=_stats_file_sort_key)

# tasks_data: task name -> {stats file name -> count}
# dataset_total: stats file name -> total number of rows counted in that file
tasks_data = {}
dataset_total = {}

for file_name in json_files:
    with open(f"{stats_folder_name}/{dataset_prefix}/{file_name}") as json_file:
        data_dict = json.load(json_file)

    task_counts = data_dict["_task_name"]
    dataset_total[file_name] = sum(task_counts.values())

    for task, count in task_counts.items():
        tasks_data.setdefault(task, {})[file_name] = count


# One column per stats file, labelled with the file name up to its first ".".
header = ["Task Name"] + [file_name.split(".")[0] for file_name in json_files]

with open(distribution_file_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # One row per task; a file in which the task never occurs contributes 0.
    # (The per-task total that used to be computed here was never written or
    # read, so that dead computation has been dropped.)
    for task, counts in tasks_data.items():
        row = [task] + [counts.get(file_name, 0) for file_name in json_files]
        writer.writerow(row)

    # Final row: number of counted rows per stats file.
    writer.writerow(["Total"] + [dataset_total[file_name] for file_name in json_files])
    

In [None]:
# Generate Diffs

import difflib

def compare_files(file1, file2):
    """Return the unified diff between two text files as a list of lines.

    Both files are read completely; ``file1`` and ``file2`` are also used as
    the "from" and "to" labels in the diff header. Identical files yield an
    empty list.
    """
    with open(file1, 'r') as left, open(file2, 'r') as right:
        left_lines = left.readlines()
        right_lines = right.readlines()
    delta = difflib.unified_diff(left_lines, right_lines, fromfile=file1, tofile=file2)
    return list(delta)

def compare_multiple_files(file_list):
    """Diff every unordered pair of files in ``file_list``.

    Each file is compared with every file that follows it in the list, so
    ``n`` files produce ``n * (n - 1) / 2`` comparisons. The result maps
    ``"<file1> vs <file2>"`` to the diff lines returned by ``compare_files``.
    """
    comparisons = {}
    for i, file1 in enumerate(file_list):
        for file2 in file_list[i + 1:]:
            comparisons[f"{file1} vs {file2}"] = compare_files(file1, file2)
    return comparisons

def save_diffs_to_html(diffs, output_file):
    """Render a set of diffs as one colour-coded HTML page.

    Args:
        diffs: Mapping of comparison title -> iterable of diff lines (each
            line is expected to carry its own trailing newline).
        output_file: Path of the HTML file to write; overwritten if present.

    Lines starting with ``+`` are shown in green, ``-`` in red and hunk
    markers (``@``, plus the previously handled ``^``) in blue. All text is
    HTML-escaped before being written so that ``<``, ``>`` and ``&`` in the
    compared files cannot break the page.
    """
    import html  # local import keeps this cell independent of the import cell

    with open(output_file, 'w') as f:
        f.write("<html><head><title>File Comparisons</title></head><body>")
        f.write("<h1>File Comparisons</h1>")
        for comparison, diff in diffs.items():
            f.write(f"<h2>Comparing {html.escape(str(comparison), quote=False)}</h2>")
            f.write("<pre>")
            for line in diff:
                # Classify on the raw line, but always emit the escaped text.
                safe_line = html.escape(line, quote=False)
                if line.startswith('+'):
                    f.write(f'<span style="color: green;">{safe_line}</span>')
                elif line.startswith('-'):
                    f.write(f'<span style="color: red;">{safe_line}</span>')
                elif line.startswith(('@', '^')):
                    # Unified diffs mark hunks with "@@ ... @@"; "^" is kept
                    # for callers that pass other diff flavours.
                    f.write(f'<span style="color: blue;">{safe_line}</span>')
                else:
                    f.write(safe_line)
            f.write("</pre><hr>")
        f.write("</body></html>")

# The JSON stats are written to "<stats_folder_name>/<dataset_prefix>/", so
# list that folder. The previous code referenced an undefined `stats_folder`
# and listed the stats root, which holds only sub-folders.
stats_dir = f"{stats_folder_name}/{dataset_prefix}"
# Sorted so the pairwise comparisons come out in a stable order.
file_list = sorted(
    f"{stats_dir}/{file}" for file in os.listdir(stats_dir) if file.endswith('.json')
)
diffs = compare_multiple_files(file_list)
output_file = f"comparison_output-{dataset_name}.html"
save_diffs_to_html(diffs, output_file)