In [3]:
import os
import hashlib
import uuid
import csv
import json
import yaml  
from typing import Any, Callable

def generate_uuid(path: str) -> uuid.UUID:
    """Derive a deterministic UUID from the bytes currently stored at *path*.

    The file is hashed with SHA-256 and the hex digest is used as the name of
    a version-5 UUID in the URL namespace, so identical content always maps
    to the same UUID.

    Args:
        path: File whose raw bytes are hashed.

    Returns:
        The UUID derived from the file's content.
    """
    hasher = hashlib.sha256()
    with open(path, 'rb') as stream:
        # Feeding the hash piece by piece yields the same digest as hashing
        # the whole content in one call.
        for piece in iter(lambda: stream.read(65536), b''):
            hasher.update(piece)
    return uuid.uuid5(uuid.NAMESPACE_URL, hasher.hexdigest())

def prepend_to_file(path: str, text: str):
    """Insert *text* at the very start of the UTF-8 file at *path*.

    The existing content is kept exactly as stored: the file is opened with
    ``newline=''`` so line endings are translated neither on read nor on
    write.  With the default translation, a CRLF file read on a POSIX system
    becomes shorter when written back, and because the file was never
    truncated the leftover bytes of the old content stayed at the end.

    Args:
        path: File to modify in place; it must already exist.
        text: Text written verbatim before the current content.
    """
    with open(path, 'r+', encoding='utf-8', newline='') as f:
        original = f.read()
        f.seek(0)
        f.write(text + original)
        # Cut the file at the end of what was just written so no bytes of
        # the previous content can survive past it.
        f.truncate()

def insert_structured(
    path: str,
    file_uuid: uuid.UUID,
    loader: Callable[..., Any],
    dumper: Callable[..., Any],
    is_yaml: bool = False
):
    """Add a leading ``uuid`` key to a structured file whose top level is a mapping.

    The file is parsed with *loader*; if the result is a dict without a
    ``uuid`` key, the file is rewritten with ``uuid`` as its first key
    followed by the original keys in their original order.  Files whose top
    level is not a mapping, or that already have a ``uuid`` key, are left
    untouched.

    Args:
        path: File to read and, if needed, rewrite (UTF-8).
        file_uuid: UUID to store; written as its string form.
        loader: Called as ``loader(file_obj)``; returns the parsed document.
        dumper: Called as ``dumper(data, file_obj, sort_keys=False)`` when
            *is_yaml* is true, otherwise ``dumper(data, file_obj, indent=2)``.
        is_yaml: Selects which keyword argument is passed to *dumper*.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = loader(f)

    # Only a missing document (loader returned None, e.g. an empty YAML file
    # or a JSON ``null``) counts as an empty mapping.  Other falsy documents
    # such as ``[]``, ``0`` or ``""`` are real content and must not be
    # replaced by a dict that holds nothing but the uuid.
    if data is None:
        data = {}
    if not isinstance(data, dict) or 'uuid' in data:
        return

    new_data = {'uuid': str(file_uuid), **data}

    with open(path, 'w', encoding='utf-8') as f:
        if is_yaml:
            dumper(new_data, f, sort_keys=False)
        else:
            dumper(new_data, f, indent=2)

    print(f"Generated key for {path}")

def insert_comment(path: str, file_uuid: uuid.UUID, marker: str):
    """Tag a text file with a ``<marker> <uuid>`` line at its top.

    The first five lines of the file are inspected; if any of them starts
    with *marker* the file is considered already tagged and is not modified.
    Otherwise the tag line is prepended and a confirmation is printed.

    Args:
        path: File to inspect and, if needed, modify (UTF-8).
        file_uuid: UUID written after the marker.
        marker: Prefix that identifies the tag line.
    """
    with open(path, 'r', encoding='utf-8') as src:
        leading_lines = src.read().splitlines()[:5]

    for candidate in leading_lines:
        if candidate.startswith(marker):
            # Already tagged near the top; leave the file as it is.
            return

    tag_line = f"{marker} {file_uuid}\n"
    prepend_to_file(path, tag_line)
    print(f"Generated key for {path}")

def _embedded_uuid(path: str, ext: str, comment_marker: str) -> str:
    """Return the UUID stored inside *path*, or ``''`` when it carries none."""
    with open(path, 'r', encoding='utf-8') as f:
        if ext == '.json':
            data = json.load(f)
        elif ext in ('.yml', '.yaml'):
            data = yaml.safe_load(f)
        else:
            # Comment-tagged file: same "first five lines" window the
            # comment handler uses to decide whether a tag exists.
            for line in f.read().splitlines()[:5]:
                if line.startswith(comment_marker):
                    return line[len(comment_marker):].strip()
            return ''
    if isinstance(data, dict) and data.get('uuid'):
        return str(data['uuid'])
    return ''


def process_files(repo_path: str, csv_output_path: str):
    """Tag every supported file under *repo_path* and write a UUID-to-path CSV.

    Files are selected by extension (case-insensitive): ``.json``, ``.yml``
    and ``.yaml`` get a ``uuid`` key, ``.adoc`` gets a ``// uuid:`` comment
    line.  For each selected file a UUID is derived from its content and the
    matching handler is invoked.  The CSV has a ``uuid,relative_path`` header
    and one row per selected file, with paths relative to *repo_path*.

    The UUID written to the CSV is the one actually embedded in the file
    whenever the file carries one.  This matters on repeated runs: a file
    tagged earlier hashes to a different value (its content now includes the
    tag) while the handler leaves the existing tag alone, so recording the
    freshly computed value would make the CSV disagree with the file.  Files
    that carry no UUID after handling (e.g. a JSON array) are still listed
    with their content-derived UUID.

    Args:
        repo_path: Root directory that is walked recursively.
        csv_output_path: Destination of the CSV file (overwritten).
    """
    adoc_marker = '// uuid:'

    handlers = {
        '.json': lambda p, u: insert_structured(p, u, json.load, json.dump, False),
        '.yml': lambda p, u: insert_structured(p, u, yaml.safe_load, yaml.safe_dump, True),
        '.yaml': lambda p, u: insert_structured(p, u, yaml.safe_load, yaml.safe_dump, True),
        '.adoc': lambda p, u: insert_comment(p, u, adoc_marker),
    }

    mapping = []
    for root, dirs, files in os.walk(repo_path):
        # Sorting in place fixes the traversal order, so the CSV rows come
        # out in the same order on every run and on every filesystem.
        dirs.sort()
        for fname in sorted(files):
            ext = os.path.splitext(fname)[1].lower()
            handler = handlers.get(ext)
            if handler is None:
                continue
            full = os.path.join(root, fname)
            u = generate_uuid(full)
            handler(full, u)
            # Prefer what the file really contains over the fresh hash.
            stored = _embedded_uuid(full, ext, adoc_marker)
            rel = os.path.relpath(full, repo_path)
            mapping.append((stored or str(u), rel))

    with open(csv_output_path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerow(['uuid', 'relative_path'])
        writer.writerows(mapping)

    print(f"\n✅ Processed {len(mapping)} files → {csv_output_path}")

# Run the tagging pass over the dataset checkout in the parent directory and
# write the UUID-to-path map beside it.
process_files(repo_path='../bluexp-dataset', csv_output_path='../uuid_file_map.csv')


✅ Processed 193 files → ../uuid_file_map.csv
