# GreenDIGIT RO-Crate Publishing Workflow

This notebook demonstrates how to:

- Read an RO-Crate containing an experiment
- Extract and format metadata according to GreenDIGIT requirements
- Publish the dataset metadata to the gCat API
- Link to a manually uploaded archive (.zip) stored in D4Science Workspace

Note: File upload is **not automated** in this version - the files need to be uploaded manually.

In [None]:
"""Communicate with SoBigData API"""
import requests
from rocrate.rocrate import ROCrate

# === CONFIG ===
# Token placed in the "Authorization: Bearer ..." header when publishing.
BEARER_TOKEN = None  # Will be prompted if not set
# Endpoint that publish_to_gcat() POSTs the catalogue entry to.
GCAT_PUBLISH_URL = "https://api.d4science.org/gcat/items"
# Public link to the manually uploaded RO-Crate ZIP (None = not configured).
PUBLIC_ZIP_URL = None
# Timeout handed to requests.post(), in seconds.
TIMEOUT = 10


def extract_gcat_metadata(crate_path: str,
                          uploaded_file_url: str) -> dict:
    """Build a gCat catalogue entry from the root dataset of an RO-Crate.

    Args:
        crate_path: Location of the RO-Crate, passed straight to ``ROCrate``.
        uploaded_file_url: Public link to the uploaded ZIP archive; it becomes
            the URL of the entry's single resource.

    Returns:
        A dict with the keys ``name``, ``title``, ``license_id``, ``private``,
        ``notes``, ``url``, ``tags``, ``resources`` and ``extras``.

    Note:
        Prompts on stdin for an e-mail address when no creator carrying an
        ``email`` property is found in the crate.
    """
    crate = ROCrate(crate_path)
    dataset = crate.dereference("./")

    # The catalogue name is the dataset name lower-cased with spaces
    # replaced by dashes; the title keeps the original spelling.
    name = dataset.get("name", "unnamed-dataset").lower().replace(" ", "-")
    title = dataset.get("name", "")
    notes = dataset.get("description", "")

    # RO-Crate commonly stores "keywords" as ONE comma-separated string.
    # Iterating over that string directly would yield one tag per character,
    # so normalise both the string and the list form to a list of keywords.
    raw_keywords = dataset.get("keywords") or []
    if isinstance(raw_keywords, str):
        raw_keywords = raw_keywords.split(",")
    keywords = [str(kw).strip() for kw in raw_keywords]

    # The creator is the first Person entity whose "tags" contain "author".
    creators = crate.get_by_type("Person")
    creator = next(
        (c for c in creators if "author" in c.get("tags", [])), None
    )
    creator_name = creator.get("name", "") if creator else ""
    # Fall back to asking the user when the crate provides no e-mail.
    creator_email = (
        creator.get("email", "") if creator and "email" in creator
        else input("Provide your Mail for the Metadata: ")
    )

    # Empty keywords (e.g. from a trailing comma) are not turned into tags.
    tags = [{"name": kw} for kw in keywords if kw]
    resources = [{
        "name": "RO-Crate ZIP Archive",
        "url": uploaded_file_url,
        "format": "zip"
    }]

    # NOTE(review): apart from the creator fields, the values below
    # (creation date, experiment ID, node, project ID, ...) are hard-coded
    # and not read from the crate - confirm this is intended.
    extras = [
        {"key": "Creation Date", "value": "2025-06-13"},
        {"key": "Creator", "value": creator_name},
        {"key": "Creator Email", "value": creator_email},
        {"key": "Creator Name PI (Principal Investigator)",
         "value": creator_name},
        {"key": "Environment OS", "value": "Linux"},
        {"key": "Environment Platform", "value": "D4Science GreenDIGIT"},
        {"key": "Experiment Dependencies", "value": "none"},
        {"key": "Experiment ID", "value": "exp-green-digit-001"},
        {"key": "GreenDIGIT Node", "value": "D4Science Pisa"},
        {"key": "Programming Language", "value": "Python"},
        {"key": "Project ID", "value": "GD-T5.2"},
        {"key": "Session reading metrics", "value": "enabled"},
        {"key": "system:type", "value": "Experiment"}
    ]

    return {
        "name": name,
        "title": title,
        "license_id": "CC-BY-4.0",
        "private": False,
        "notes": notes,
        "url": None,
        "tags": tags,
        "resources": resources,
        "extras": extras
    }


def publish_to_gcat(entry: dict, token: str) -> dict:
    """Send a catalogue entry to gCat and return the decoded JSON reply.

    Args:
        entry: Catalogue entry, sent as the JSON body of the request.
        token: Bearer token placed in the ``Authorization`` header.

    Returns:
        The parsed JSON body of the response.

    Raises:
        RuntimeError: If the response status is neither 200 nor 201; the
            message carries the status code and the response text.
    """
    response = requests.post(
        GCAT_PUBLISH_URL,
        json=entry,
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        },
        timeout=TIMEOUT,
    )

    # Guard clause: anything other than 200/201 is reported as a failure.
    if response.status_code not in (200, 201):
        raise RuntimeError(
            f"gCat publish failed: {response.status_code}\n{response.text}"
        )

    return response.json()

## Step 1 - Upload your RO-Crate `.zip` to the Workspace

Before registering the dataset, you need to upload the RO-Crate `.zip` file manually to your Workspace.

1. Go to the GreenDIGIT VRE Workspace:
   -> [https://sobigdata.d4science.org/group/greendigit/workspace](https://sobigdata.d4science.org/group/greendigit/workspace)

2. Upload your `.zip` RO-Crate to a folder (e.g. "MyResults")

3. After upload:
   - Click the file
   - Choose “Share”
   - Enable public access
   - Copy the **public link** to the file

This link is required in the next step.

## Step 2 - Provide your Bearer Token and the public link

Provide your personal Bearer Token, which is required to authenticate your publishing request, and paste the public link to the uploaded `.zip` file that you copied in Step 1.

You can retrieve the token by logging into the GreenDIGIT VRE:
https://sobigdata.d4science.org/group/greendigit

In [None]:
# Ask for the token only when it was not set in the config cell.
if not BEARER_TOKEN:
    # getpass keeps the secret from being echoed on screen; strip() removes
    # whitespace that is easily picked up when pasting.
    from getpass import getpass
    BEARER_TOKEN = getpass("Provide Bearer Token from Website: ").strip()

# Prefer a link pre-set in the config; prompt only when none is configured,
# so that zip_url is defined in both cases for the publishing step.
if PUBLIC_ZIP_URL:
    zip_url = PUBLIC_ZIP_URL.strip()
else:
    user_input_url = input("Paste public ZIP link: ")
    zip_url = user_input_url.strip()

# The link becomes the resource URL of the catalogue entry, so it is required.
if not zip_url:
    raise ValueError("A public ZIP link is required to publish the dataset.")

## Step 3 - Specify your local RO-Crate folder path

This should point to the RO-Crate folder **on your local system** - the same crate that you uploaded (possibly as a `.zip` archive) in Step 1.

If you are using the default GreenDIGIT example crate, you can simply **press Enter** to use the pre-filled path.

In [None]:
# Ask for the local RO-Crate folder; an empty answer selects the example crate.
default_crate_folder = "./result_folder_examples/2025-04-25_17-04-08_154131"
user_input_path = input(
    f"Path to local RO-Crate folder [{default_crate_folder}]: "
)

# Blank (or whitespace-only) input falls back to the default folder.
crate_folder = user_input_path.strip() or default_crate_folder

## Step 4 - Extract metadata and publish to the GreenDIGIT catalogue

This step performs the following actions:

1. Reads metadata from your local RO-Crate folder
2. Builds a metadata object compatible with the gCat API
3. Publishes the dataset metadata (including the link to the uploaded archive) to the GreenDIGIT catalogue

If successful, the Item URL will be provided.

In [None]:
# Run the workflow end to end: build the entry, publish it, report the URL.
try:
    print("Extracting metadata from RO-Crate...")
    entry = extract_gcat_metadata(crate_folder, zip_url)
    print("Metadata extracted.\n")

    print("Publishing to GreenDIGIT catalogue (gCat)...")
    result = publish_to_gcat(entry, BEARER_TOKEN)
    print("Published successfully.\n")

    # Try to extract the published item URL from the response.
    # "extras" may be missing or null, and an entry may lack "value",
    # so both are read defensively instead of raising after a successful
    # publish.
    extras = result.get("extras") or []
    item_url = next(
        (e.get("value") for e in extras if e.get("key") == "Item URL"), None
    )

    if item_url:
        print(f"Your dataset is now available at:\n{item_url}")
    else:
        print("The response did not include an Item URL.")

except Exception as e:
    # Top-level notebook boundary: show a readable message, not a traceback.
    print("An error occurred during publishing:")
    print(str(e))