# Example Script: Manually Creating a Single DOI and Persisting the Metadata in Gen3 (Specifically MIDRC)

## Prerequisites

Before running this example script, make sure you have the following prerequisites:

- An environment variable `DATACITE_USERNAME` set as a valid DataCite username for interacting with their API.
- An environment variable `DATACITE_PASSWORD` set as a valid DataCite password for interacting with their API.

## Overview

This example walks through the following steps:

1. Setting up the necessary classes for interacting with Gen3 and DataCite.
2. Getting the DOI metadata (ideally from some external source like a file or another API, but here we've hard-coded it).
3. Creating/Minting the DOI in DataCite.
4. Persisting the DOI metadata into a Gen3 Discovery record in the metadata service.


In [None]:
import pandas as pd
import sys, os
from requests.auth import HTTPBasicAuth

## import Gen3 SDK
import gen3
from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
from gen3.index import Gen3Index
from gen3.query import Gen3Query
from cdislogging import get_logger
from gen3.doi import (
  DataCite,
  DigitalObjectIdentifier,
  DigitalObjectIdentifierCreator,
  DigitalObjectIdentifierTitle,
)

In [None]:
username = 'your_username'  # Replace 'your_username' with the actual username
git_dir = f'/Users/{username}/Documents/GitHub'
sdk_dir = '/cgmeyer/gen3sdk-python'
sys.path.insert(1, f'{git_dir}{sdk_dir}')
from expansion.expansion import Gen3Expansion
%run f'/Users/{username}/Documents/GitHub/cgmeyer/gen3sdk-python/expansion/expansion.py'

In [None]:
## Production is normally NOT used for this step: the DOI is minted against the
## staging MDS and released to prod on the monthly cycle (note that the commons
## URLs will point to prod!).
## The commented-out lines below are the production equivalent of the staging
## setup that follows; uncomment them only when working against production.
# api = 'https://data.midrc.org'
# cred = '/Users/christopher/Downloads/midrc-credentials.json'
# auth = Gen3Auth(api, refresh_file=cred)
# sub = Gen3Submission(api, auth)
# query = Gen3Query(auth)
# index = Gen3Index(auth)
# exp = Gen3Expansion(api,auth,sub)
# exp.get_project_ids()

## Initialize Gen3 SDK for MIDRC Staging
## (names carry an "s" prefix to distinguish them from the production ones above)
sapi = 'https://staging.midrc.org'
# Credentials JSON handed to Gen3Auth as its refresh file
scred = f'/Users/{username}/Downloads/midrc-staging-credentials.json'
sauth = Gen3Auth(sapi, refresh_file=scred)
ssub = Gen3Submission(sapi, sauth)
sindex = Gen3Index(sauth)
squery = Gen3Query(sauth)

In [None]:
### download and import some custom Python scripts from https://github.com/cgmeyer/gen3sdk-python
doi_dir = f"/Users/{username}/Documents/Notes/MIDRC/DOI" #set your working dir
os.chdir(doi_dir)
os.system("wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py -O expansion.py")
%run expansion.py # Some additional functions in Gen3Expansion class
sexp = Gen3Expansion(sapi,sauth,ssub)
sexp.get_project_ids()


In [None]:
# Get existing discovery metadata records from the metadata service (MDS)
# through the Gen3Expansion helper created for staging.
# Example of the corresponding raw MDS query (this URL is for a different commons, ibdgc):
# https://ibdgc.datacommons.io/mds/metadata?_guid_type=discovery_metadata&limit=9999
# NOTE(review): `mds` is indexed by GUID in the next cell, so get_mds presumably
# returns a dict keyed by GUID - confirm against the Gen3Expansion implementation.

mds = sexp.get_mds(limit=9999,args="_guid_type=discovery_metadata")


In [None]:
# Isolate the MDS record of the test DOI in its own single-entry dict, keyed by
# its GUID (the expected contents are reproduced in the string below).
mguid = "P67C-YW54"
m = {mguid: mds[mguid]}

"""
{'P67C-YW54': {'_guid_type': 'discovery_metadata',
  'gen3_discovery': {'authz': '/programs/Open',
   'commons': 'MIDRC',
   'doi_url': 'https://staging.midrc.org/discovery/P67C-YW54/',
   'abstract': 'Data used in the development and evaluation of AI models play a significant role in the robustness and generalizability of the model performance. To enable independent evaluation of many algorithms using data from MIDRC, a sequestered commons was created using a developed method of multi-dimensional stratified sampling. The sequestered commons is restricted from public download access but will be used for the formal evaluation of finalized algorithms. To draw from the commons an independent sample for performance evaluation, a method of task-based distribution sampling by optimized quota sampling was also developed.',
   'data_url': 'https://github.com/MIDRC/Stratified_Sampling/',
   'doi_dates': [],
   'doi_sizes': [],
   'doi_title': 'MIDRC Stratified Sampling Manuscript Cohort',
   'full_name': 'MIDRC Stratified Sampling Manuscript Cohort',
   '_unique_id': 'P67C-YW54',
   'doi_titles': 'MIDRC Stratified Sampling Manuscript Cohort',
   'short_name': 'MIDRC10.82666/P67C-YW54',
   'cases_count': 5000,
   'doi_contact': 'https://www.midrc.org/midrc-contact',
   'doi_formats': [],
   'doi_version': '1',
   'files_count': '1',
   'doi_citation': 'Baughan, Natalie; Whitney, Heather; Myers, Kyle J.; Giger, Maryellen L. (2023). MIDRC Stratified Sampling Manuscript Cohort (Version 1) [Data set]. Medical Imaging and Data Resource Center (MIDRC). https://doi.org/10.82666/P67C-YW54',
   'doi_creators': 'Natalie Baughan ORCID: https://orcid.org/0000-0002-3219-9291, Heather M. Whitney ORCID: https://orcid.org/0000-0002-7258-1102, Kyle J. Myers ORCID: https://orcid.org/0000-0001-7394-4932, Maryellen L. Giger ORCID: https://orcid.org/0000-0001-5482-9728',
   'doi_language': None,
   'doi_subjects': [],
   'doi_publisher': 'Medical Imaging and Data Resource Center (MIDRC)',
   'doi_identifier': '10.82666/P67C-YW54',
   'doi_rightsList': [],
   'doi_description': '',
   'doi_identifiers': '',
   'data_description': 'Spreadsheet of metadata for a cohort of 5000 MIDRC subjects.',
   'doi_contributors': '',
   'doi_geoLocations': [],
   'doi_is_available': 'Yes',
   'doi_resource_type': 'Dataset',
   'data_download_links': [{'guid': 'dg.MD1R/1cad1ea9-3749-4484-bbea-1787f6d4006e',
     'title': 'MIDRC Stratified Sampling Manuscript Cohort',
     'description': 'Spreadsheet of metadata for the DOI cohort.'}],
   'doi_resolvable_link': 'https://doi.org/10.82666/P67C-YW54',
   'doi_publication_year': 2023,
   'doi_access_information': "Use the 'Download File' button below in the 'Data Download Links' section to access the spreadsheet of data for this cohort.",
   'doi_funding_references': 'National Institute of Biomedical Imaging and Bioengineering',
   'doi_relatedIdentifiers': [],
   'doi_version_information': 'This is version 1 of this Dataset.',
   'doi_alternateIdentifiers': '',
   'doi_access_information_link': 'https://staging.midrc.org/discovery/P67C-YW54/'}}}
"""


In [None]:
### Create DOI following SDK instructions using production DataCite site
# https://github.com/uc-cdis/gen3sdk-python/blob/master/docs/howto/discoveryMetadataTools.md#dois-in-gen3-discovery-metadata-and-page-for-visualizing-public-doi-metadata
##############################################################################

# SET VARIABLES
# DataCite credentials are read from the environment variables listed under
# Prerequisites (the actual values are stored in keeper), so they are never
# hard-coded in the notebook. A missing variable raises KeyError right here
# instead of surfacing later as an authentication failure.
DATACITE_USERNAME = os.environ["DATACITE_USERNAME"]
DATACITE_PASSWORD = os.environ["DATACITE_PASSWORD"]

# This prefix is provided by DataCite and is specific to MIDRC
#PREFIX = "10.82666" # this is the test DataCite prefix for MIDRC
PREFIX = "10.60701" # this is the production DataCite prefix for MIDRC
# TODO: placeholder GUID made up for this example; how the GUID should be
# generated properly is still an open question.
gen3_metadata_guid = "P67C-YW55"

PUBLISHER = "Medical Imaging and Data Resource Center (MIDRC)"
COMMONS_DISCOVERY_PAGE = "https://data.midrc.org/discovery"
DOI_DISCLAIMER = ""
DOI_ACCESS_INFORMATION = "Use the 'Download File' button below in the 'Data Download Links' section to access the spreadsheet of data for this cohort."
DOI_CONTACT = "https://www.midrc.org/midrc-contact"

# Creators in "Family, Given" form; every entry is a person, so the DataCite
# creator dicts are all built the same way from this list of names.
creator_names = [
  "Baughan, Natalie",
  "Whitney, Heather",
  "Myers, Kyle J.",
  "Giger, Maryellen L.",
]
creators = [
  DigitalObjectIdentifierCreator(
    name=creator_name,
    name_type=DigitalObjectIdentifierCreator.NAME_TYPE_PERSON,
  ).as_dict()
  for creator_name in creator_names
]

doi_title = 'MIDRC Stratified Sampling Manuscript Cohort'
# Same value as PUBLISHER; derived from it so the two cannot drift apart.
publisher = PUBLISHER
publication_year = 2023
doi_type = "Dataset"
version = 1


In [None]:
def mint_doi_prod(self,
  DATACITE_USERNAME,
  DATACITE_PASSWORD,
  PREFIX,
  gen3_metadata_guid,
  PUBLISHER,
  COMMONS_DISCOVERY_PAGE,
  DOI_CONTACT,
  creators,
  doi_title,
  publisher,
  publication_year,
  doi_type,
  version,
  update = False,
  publish_dois=True, 
  use_prod=True,
  DOI_ACCESS_INFORMATION = "Use the 'Download File' button below in the 'Data Download Links' section to access the spreadsheet of data for this cohort.",
  DOI_DISCLAIMER = "" # This is for the DOI tombstone, so can leave blank
  ):

  logging = get_logger("__name__", log_level="info")
  identifier = "{}/{}".format(PREFIX,gen3_metadata_guid)
  titles = [DigitalObjectIdentifierTitle(doi_title).as_dict()]

  datacite = DataCite(
    use_prod=use_prod,
    auth_provider=HTTPBasicAuth(DATACITE_USERNAME,DATACITE_PASSWORD))


  doi_metadata = {
    "identifier": identifier,
    "creators": creators,
    "titles": titles,
    "publisher": publisher,
    "publication_year": publication_year,
    "doi_type": doi_type,
    "version": version,
  }

  url = COMMONS_DISCOVERY_PAGE.rstrip("/") + f"/{gen3_metadata_guid}/"
  DOI_ACCESS_INFORMATION_LINK = url

  doi = DigitalObjectIdentifier(url=url, **doi_metadata)

  if publish_dois:
    logging.info(f"Publishing DOI `{identifier}`...")
    doi.event = "publish"

  if update:
    response = datacite.update_doi(doi) # UPDATE EXIST DOIs
  else:
    response = datacite.create_doi(doi) # CREATE NEW DOIs
  
  
  doi = DigitalObjectIdentifier.from_datacite_create_doi_response(response)
