#### From
follow https://developer.nvidia.com/docs/nemo-microservices/evaluation/source/getting-started.html


#### Requirements
- created the `dataset_output` folder
- connected to VPN

In [None]:
import requests
import huggingface_hub as hh
import os
import json
import glob
import pandas as pd

In [None]:
# DS_URL = "https://datastore.stg.llm.ngc.nvidia.com"
# DS_URL = "http://nemo-datastore:8000" 

# run: kubectl port-forward svc/nemo-datastore 8000 -n nemo-evaluation
DS_URL = "http://localhost:8000"   # internally on cluster: "http://nemo-datastore:8000" 
# TODO: kubectl command here

creata_dataset_endpoint = f"{DS_URL}/v1/datasets"
get_dataset_endpoint = f"{DS_URL}/v1/datasets/"
get_datasets_endpoints = f"{DS_URL}/v1/datasets"
HEADERS = { "Accept": "application/json" }

In [None]:
try:
    requests.get(f"{DS_URL}/health", headers=HEADERS).raise_for_status()
except Exception as e:
    print("Error: please ensure VPC is connectedl")
    # run: `kubectl port-forward svc/nemo-datastore 8000 -n nemo-evaluation` after setting up AWS connection

In [6]:
#repo_name = "<your-repo-name>"
repo_name = "tsalim_simple_july_31"
params = {
    "name": repo_name,
    "description": "Description of your dataset",
}

assert repo_name != "<your-repo-name>", "Please set your repo name in the script"

In [7]:
response = requests.post(creata_dataset_endpoint, json=params).json()
DATASET_ID = response['id']
DATASET_ID

'dataset-NmCByuHrrkNZdiYMe5Gjsa'

In [30]:
resp = requests.get(get_dataset_endpoint + DATASET_ID)
resp.content

b'{"created_at":"2024-07-31T18:18:26.328346","updated_at":"2024-07-31T18:18:26.328354","name":"tsalim_simple_july_31","description":"Description of your dataset","id":"dataset-NmCByuHrrkNZdiYMe5Gjsa","files":[{"path":"correct_answer.jsonl","size":1285,"sha":"28604de68155b12a7346fb8e878bc7e684dfbecc962493d2ae737d2dc5d8017a"},{"path":"judge_prompt_parameters/correct_answer.jsonl","size":1285,"sha":"28604de68155b12a7346fb8e878bc7e684dfbecc962493d2ae737d2dc5d8017a"},{"path":"judge_prompts.jsonl","size":902,"sha":"cc866bd026ea8a7824aa61d0ec1a2a3ff62f482629fa1d558a57b6a5d6eae522"},{"path":"question.jsonl","size":466,"sha":"b1e7d1881b77930393c5acf6bfd6905c9cce3c70b8aebcb773281630df3ed375"},{"path":"reference_answer/reference.jsonl","size":1839,"sha":"f27fabeb46fffd9398a7df2b0d12e8dcac8a008e4423c681fc3e3cb5719c9629"}]}'

In [9]:
assert os.getcwd().endswith("user-guide"), "Please run this script from the root of your repo"
folder_to_upload = "dataset_output"
assert os.path.exists(folder_to_upload), f"Please go through the dataset folder creation first (see *.ipynb)"

In [10]:
files = glob.glob(f"{folder_to_upload}/**/*", recursive=True)
files_upload_pairs = [(f[len(folder_to_upload)+1:],f) for f in files if os.path.isfile(f)]

In [11]:
len(files), DS_URL, repo_name

(7, 'http://localhost:8000', 'tsalim_simple_july_31')

In [12]:
TOKEN = "token"  # use the default value of "token".
# The identifier for the repository where the dataset will be uploaded.
repo_id = f"nvidia/{repo_name}"
# Create the Hugging Face API client with the upload token and the Data Store endpoint URL.
api = hh.HfApi(endpoint=DS_URL, token=TOKEN)

# for fn,fp in files_upload_pairs:
for fn,fp in files_upload_pairs:
    # Upload the dataset file to the Data Store.
    upload_url = api.upload_file(
        path_or_fileobj=fp,
        path_in_repo=fn,
        repo_id=repo_id,
        repo_type="dataset"
    )
    print(f"Uploaded File to {upload_url}")

correct_answer.jsonl:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Uploaded File to http://localhost:8000/datasets/nvidia/tsalim_simple_july_31/blob/main/correct_answer.jsonl


question.jsonl:   0%|          | 0.00/466 [00:00<?, ?B/s]

Uploaded File to http://localhost:8000/datasets/nvidia/tsalim_simple_july_31/blob/main/question.jsonl


judge_prompts.jsonl:   0%|          | 0.00/902 [00:00<?, ?B/s]

Uploaded File to http://localhost:8000/datasets/nvidia/tsalim_simple_july_31/blob/main/judge_prompts.jsonl
Uploaded File to http://localhost:8000/datasets/nvidia/tsalim_simple_july_31/blob/main/judge_prompt_parameters/correct_answer.jsonl


reference.jsonl:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Uploaded File to http://localhost:8000/datasets/nvidia/tsalim_simple_july_31/blob/main/reference_answer/reference.jsonl


### Others

In [26]:
# get all datasets lists
resp = requests.get(get_datasets_endpoints + "?page_size=20&page=1")
pd.DataFrame(json.loads(resp.content)['datasets'])

Unnamed: 0,created_at,updated_at,name,description,id
0,2024-07-31T18:18:26.328346,2024-07-31T18:18:26.328354,tsalim_simple_july_31,Description of your dataset,dataset-NmCByuHrrkNZdiYMe5Gjsa


## References

In [None]:
# datastore swagger:
# https://datastore.stg.llm.ngc.nvidia.com/docs
# datastore api: https://dl.gitlab-master-pages.nvidia.com/-/ai-services/microservices/nemo-documentation/-/jobs/84816304/artifacts/_build/docs/nemo-microservices/latest-internal/datastore/source/example-dataset.html

# eval swagger:
# https://evaluation.stg.llm.ngc.nvidia.com/docs

# doc: https://dl.gitlab-master-pages.nvidia.com/-/ai-services/microservices/nemo-documentation/-/jobs/84816304/artifacts/_build/docs/nemo-microservices/latest-internal/evaluation/source/llm_as_judge.html#using-a-custom-judge-model
# model: https://developer.nvidia.com/docs/nemo-microservices/inference/models.html

In [23]:
fp, fn, repo_id

('dataset_output/correct_answer.jsonl',
 'correct_answer.jsonl',
 'nvidia/tsalim_simple_july_10')

In [None]:
upload_url = api.upload_file(
    path_or_fileobj=fp,
    path_in_repo=fn,
    repo_id=repo_id,
    repo_type="dataset"
)