# Zenodo Automation

-   Make sure to run the code cell below before testing out any functionalities.
-   To test functionalities, look at cell 3 and onwards

In [6]:
import argparse
import requests
import json
import os
import sys
from pathlib import Path
from typing import List, Optional, Dict, Any
import time
from dotenv import load_dotenv

#!/usr/bin/env python3
"""
Zenodo API Script

A script to search Zenodo based on keywords and upload/download files.
Usage: 
    - Search: python zenodo.py search keyword1 [keyword2 keyword3 ...]
    - Download: python zenodo.py download record_id [output_dir]
    - Download by Keywords: python zenodo.py download-via-keywords keyword1 [keyword2 ...] [output_dir]
    - Upload: python zenodo.py upload filename [--title "Title"] [--description "Description"]

Example: 
    python zenodo.py search climate
    python zenodo.py search "machine learning" biology
    python zenodo.py download 123456 ./downloads
    python zenodo.py download-via-keywords climate data ./climate-downloads
    python zenodo.py upload dataset.zip --title "My Dataset"
"""



# Load environment variables from a .env file so they are available to this process.
# NOTE(review): none of the functions defined in this cell read an environment
# variable themselves (access tokens are passed in as arguments) — confirm where
# the loaded values are actually consumed.
load_dotenv()


def search_zenodo(keywords: List[str], page: int = 1, page_size: int = 20, sort: str = "bestmatch", access_token: Optional[str] = None) -> dict:
        """
        Search Zenodo for records matching all of the given keywords.

        The keywords are combined with ``AND`` into one query string and sent
        to the Zenodo records endpoint with a single GET request.

        Args:
                keywords: Non-empty list of keywords to search for
                page: Page number to retrieve
                page_size: Number of results per page
                sort: Sorting method (bestmatch, mostrecent)
                access_token: Optional Zenodo API access token, sent as a Bearer token

        Returns:
                Dictionary containing the search results (the decoded JSON response)

        Raises:
                ValueError: If no keywords are given
                Exception: If the API answers with a status code other than 200
        """
        # Fail early with a clear message instead of an IndexError further down
        if not keywords:
                raise ValueError("At least one keyword is required to search Zenodo")

        # Zenodo API endpoint for searching
        zenodo_api_url = "https://zenodo.org/api/records"

        # join() returns a single keyword unchanged, so no special case is needed
        query = " AND ".join(keywords)

        # Set up the parameters for the API request
        params = {
                "q": query,
                "size": page_size,
                "page": page,
                "sort": sort
        }

        # Set up headers with access token if provided
        headers = {}
        if access_token:
                headers["Authorization"] = f"Bearer {access_token}"

        # Without a timeout, requests waits indefinitely on an unresponsive server
        response = requests.get(zenodo_api_url, params=params, headers=headers, timeout=30)

        # Anything other than 200 is reported together with the response body
        if response.status_code != 200:
                raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

        return response.json()


def display_results(results: dict) -> None:
        """
        Print a human-readable listing of Zenodo search results.

        For every hit the title, authors, publication date, DOI, record URL,
        keywords (when present) and a description cut to 200 characters are
        printed, with a dashed rule between entries.

        Args:
                results: Dictionary containing the search results
        """
        records = results.get("hits", {}).get("hits", [])
        total = results.get("hits", {}).get("total", 0)

        # The total may arrive wrapped in a dict holding the count under "value"
        if isinstance(total, dict):
                total = total.get("value", 0)

        rule = "-" * 80
        print(f"\nFound {total} results\n")
        print(rule)

        if not records:
                print("No results found for your search query.")
                return

        for position, record in enumerate(records, start=1):
                meta = record.get("metadata", {})

                heading = meta.get("title", "No title")
                authors = ", ".join(
                        person.get("name", "Unknown") for person in meta.get("creators", [])
                )
                published = meta.get("publication_date", "Unknown date")

                # Keep the listing compact: only the first 200 characters are shown
                summary = meta.get("description", "No description")
                if len(summary) > 200:
                        summary = summary[:200] + "..."

                identifier = meta.get("doi", "No DOI")
                link = f"https://zenodo.org/record/{record.get('id', '')}"

                print(f"{position}. {heading}")
                print(f"   Authors: {authors}")
                print(f"   Published: {published}")
                print(f"   DOI: {identifier}")
                print(f"   URL: {link}")

                # The keywords line is left out when the record has none
                tags = meta.get("keywords")
                if tags:
                        print(f"   Keywords: {', '.join(tags)}")

                print(f"   Description: {summary}")
                print(rule)


def save_results(results: dict, filename: str = "zenodo_results.json") -> None:
        """
        Write the search results to disk as indented JSON.

        The file is opened for writing as UTF-8 and a confirmation line is
        printed once it has been written.

        Args:
                results: Dictionary containing the search results
                filename: Name of the file to save the results to
        """
        # ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes
        with open(filename, mode="w", encoding="utf-8") as out_file:
                json.dump(results, out_file, ensure_ascii=False, indent=2)

        print(f"Results saved to {filename}")


def download_zenodo_record(record_id: str, output_dir: Optional[str] = None, access_token: Optional[str] = None) -> None:
        """
        Download all files associated with a Zenodo record.

        The record metadata is fetched first; every entry of its "files" list
        is then streamed to disk. A file that cannot be downloaded (no link in
        the metadata, or a non-200 answer) is reported and skipped so the
        remaining files are still fetched.

        Args:
                record_id: The ID of the record to download files from
                output_dir: Directory to save files to (default: current directory);
                        created, including parents, when it does not exist
                access_token: Optional Zenodo API access token, sent as a Bearer token

        Raises:
                Exception: If the record metadata request does not return status 200
        """
        # Set up the output directory
        if output_dir:
                output_path = Path(output_dir)
                output_path.mkdir(parents=True, exist_ok=True)
        else:
                output_path = Path.cwd()

        # Set up headers with access token if provided
        headers = {}
        if access_token:
                headers["Authorization"] = f"Bearer {access_token}"

        # Get record metadata; the timeout keeps a stalled server from hanging the call
        api_url = f"https://zenodo.org/api/records/{record_id}"
        response = requests.get(api_url, headers=headers, timeout=30)

        if response.status_code != 200:
                raise Exception(f"Failed to get record {record_id}: {response.status_code} - {response.text}")

        record_data = response.json()
        title = record_data.get("metadata", {}).get("title", "Unknown Title")
        print(f"Downloading files for record: {title}")

        # Extract file information
        files = record_data.get("files", [])
        if not files:
                print("No files found in this record.")
                return

        print(f"Found {len(files)} file(s).")

        # Download each file
        for file_info in files:
                file_url = file_info.get("links", {}).get("self", "")
                filename = file_info.get("key", "unknown_file")
                size = file_info.get("size", 0)

                # An empty URL would make requests raise and abort the remaining files
                if not file_url:
                        print(f"Failed to download {filename}: no download link in record metadata")
                        continue

                # Format the file size
                size_str = f"{size / 1024:.1f} KB" if size < 1024 * 1024 else f"{size / (1024 * 1024):.1f} MB"

                print(f"Downloading: {filename} ({size_str})")

                # Sanitize filename to remove path separators
                safe_filename = filename.replace('/', '_').replace('\\', '_')
                output_file = output_path / safe_filename

                # A streamed response holds its connection until it is closed; the
                # with-block releases it on every path, including the early continue
                with requests.get(file_url, headers=headers, stream=True, timeout=30) as file_response:
                        if file_response.status_code != 200:
                                print(f"Failed to download {filename}: {file_response.status_code}")
                                continue

                        # Save the file chunk by chunk so large files are never held in memory
                        with open(output_file, 'wb') as f:
                                for chunk in file_response.iter_content(chunk_size=8192):
                                        f.write(chunk)

                print(f"Saved to: {output_file}")

def download_via_keywords(
    keywords: List[str],
    output_dir: Optional[str] = None,
    access_token: Optional[str] = None,
    max_records: int = 10,
    page_size: int = 20,
    sort: str = "bestmatch"
) -> None:
    """
    Search for records matching keywords and download all files from those records.

    Each selected record gets its own sub-directory named
    ``<position>_<sanitized title>`` holding a ``metadata.json`` copy of the
    search hit plus the record's files. Result pages are fetched one after
    another until ``max_records`` hits have been collected or the results run
    out, so ``max_records`` may exceed ``page_size``.

    Args:
        keywords: List of keywords to search for
        output_dir: Directory to save files to (default: current directory)
        access_token: Zenodo API access token
        max_records: Maximum number of records to download; values below
            zero are treated as zero
        page_size: Number of results per page
        sort: Sorting method (bestmatch, mostrecent)

    Raises:
        Exception: Whatever ``search_zenodo`` raises for the first results
            page. Failures on later pages or on individual records are
            printed and do not abort the run.
    """
    # Set up the output directory
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()

    # A negative limit would turn hits[:max_records] into "all but the last N"
    max_records = max(0, max_records)

    print(f"Searching Zenodo for: {' AND '.join(keywords)}")

    # Perform the search (first page)
    results = search_zenodo(keywords, 1, page_size, sort, access_token)

    hits = list(results.get("hits", {}).get("hits", []))
    total = results.get("hits", {}).get("total", 0)

    if isinstance(total, dict):  # Handle newer Zenodo API format
        total = total.get("value", 0)

    if not hits:
        print("No results found for your search query.")
        return

    print(f"\nFound {total} results. Will download files from up to {max_records} records.\n")

    # One page holds at most page_size hits, so keep fetching pages until enough
    # hits are collected; the reported total bounds the loop when it is usable
    wanted = max_records
    if isinstance(total, int) and total > 0:
        wanted = min(max_records, total)

    page = 1
    while len(hits) < wanted:
        page += 1
        try:
            more_hits = search_zenodo(keywords, page, page_size, sort, access_token).get("hits", {}).get("hits", [])
        except Exception as e:
            # Best effort: continue with the hits gathered so far
            print(f"Error fetching results page {page}: {e}")
            break
        if not more_hits:
            break
        hits.extend(more_hits)

    # Limit the number of records to download
    records_to_download = min(len(hits), max_records)

    # Create a subdirectory for each record
    for i, hit in enumerate(hits[:records_to_download], 1):
        record_id = hit.get("id", "")
        if not record_id:
            continue

        # Build a filesystem-friendly directory name from the record title
        record_title = hit.get("metadata", {}).get("title", f"record_{record_id}")
        safe_title = "".join(c if c.isalnum() or c in "._- " else "_" for c in record_title)
        safe_title = safe_title[:50]  # Limit directory name length

        record_dir = output_path / f"{i}_{safe_title}"
        record_dir.mkdir(exist_ok=True)

        # Save record metadata
        with open(record_dir / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(hit, f, indent=2, ensure_ascii=False)

        # Download all files for this record; one failing record must not stop the rest
        try:
            print(f"\nDownloading record {i}/{records_to_download}: {record_title}")
            download_zenodo_record(record_id, str(record_dir), access_token)
        except Exception as e:
            print(f"Error downloading record {record_id}: {e}")

    print(f"\nDownload complete. Files saved to {output_path}")


# Searching Records

To search and display results:
-   Put the keywords you want to search by in the list called `keywords`
-   To search for records, call the function:
-   ```search_zenodo(keywords)``` function takes in a list of keywords and returns a dictionary of results

-   To display the searched results, call the function:
-   ```display_results(results)``` function takes in dictionary of all the results produced from the search and displays them

-   run the cell below to test search functionality; if cell produces error, make sure to run the first cell on the top of this notebook
-   the command line equivalent run in the terminal is:

        python zenodo.py search <keywords you want to search by>

-   Ex:

        python zenodo.py search climate data

In [9]:
# Search Zenodo for records matching every keyword (first page, default page size),
# then print the returned hits in a readable listing.
keywords = ["climate", "data"]
search_results = search_zenodo(keywords)
display_results(search_results)


Found 31039 results

--------------------------------------------------------------------------------
1. DDF_test_data
   Authors: Hartick, Carl
   Published: 2024-11-05
   DOI: 10.5281/zenodo.14041129
   URL: https://zenodo.org/record/14041129
   Description: No description
--------------------------------------------------------------------------------
2. The UK Climate Predicition 2009 (UKCP09) Outputs and metadata specification - Release 1.1
   Authors: Stephens, Ag
   Published: 2014-01-02
   DOI: 10.5281/zenodo.7356959
   URL: https://zenodo.org/record/7356959
   Keywords: data and information
   Description: The UK Climate Predicition 2009 (UKCP09) Project user interface (UI) documentation. The User Interface (UI) provides the access to project outputs. These outputs must also be consistent and well-descr...
--------------------------------------------------------------------------------
3. Data-4-Climate-Action-Edinburgh/Data4ClimateActionEdinburgh_Code_etc: Data from 2022 sur

# Saving Search Results

-   ```save_results(search_results, filename="fileName")``` function takes in a dictionary of results from the search and a JSON filename
-   if the filename doesn't exist, a file of that name will be created

In [10]:
# Write the dictionary returned by the search cell above to a JSON file;
# `search_results` must already exist, so run the search cell first.
save_results(search_results, filename="climate_data_results.json")

Results saved to climate_data_results.json


# Downloading Records via Record IDs

-   The record ID is the number at the end of each record URL
    -   Ex: if the url is https://zenodo.org/record/8030084 then the record ID is 8030084

-   Each record URL is displayed when a search is done and the results are displayed, as explained in the cells above
-   To download a record via record ID, call the function:
-   ```download_zenodo_record(record_id)``` where it takes in the record ID as a string for the first parameter 

Optional:
-   To store the downloaded record in a directory, pass it as a string as the second parameter in the function 
    -   Ex: 

            download_zenodo_record(record_id, output_dir=output_directory)

-   run the cell below to test download functionality; if cell produces error, make sure to run the first cell on the top of this notebook
-   the command line equivalent run in the terminal is:

        python zenodo.py download <record id> <optional: directory you want to download to>

-   Ex: 

        python zenodo.py download 13960343 ./downloads



In [None]:
# Record ID taken from the end of a record URL shown in the search results
record_id = "7356959"  
# Target directory; it is created by download_zenodo_record if it does not exist
output_directory = "./downloads" 

# Fetch every file attached to the record into the target directory
download_zenodo_record(record_id, output_dir=output_directory)

Downloading files for record: The UK Climate Predicition 2009 (UKCP09) Outputs and metadata specification - Release 1.1
Found 1 file(s).
Downloading: UKCP09_file_format_spec.pdf (450.3 KB)
Saved to: downloads/UKCP09_file_format_spec.pdf


# Downloading Records via Keywords

-   To download a record via keywords, call the function:
-   ```download_via_keywords(keywords)``` where it takes in a list of strings of keywords as the first parameter

Optional:
-   To store the downloaded records in a directory, pass it as a string via the `output_dir` keyword argument
-   To limit the number of records downloaded, pass the limit via the `max_records` keyword argument
-   To download the top results sorted by "bestmatch" or "mostrecent", pass either as a string via the `sort` keyword argument
    -   Ex: 

            download_via_keywords(keywords, output_dir=output_directory, max_records=5, sort="bestmatch")

-   run the cell below to test the keyword download functionality; if cell produces error, make sure to run the first cell on the top of this notebook

-   the command line equivalent run in the terminal is:

        python zenodo.py download-via-keywords <keyword1> <keyword2> <keyword3> <...> 

-   Ex: 

        python zenodo.py download-via-keywords "machine learning" climate temperature dataset

Optional (directory, max downloads, sort):

        python zenodo.py download-via-keywords "machine learning" climate temperature dataset --max-records 5 --sort bestmatch ./climate_data




In [13]:
# Search for records matching every keyword and download the files of the first
# two hits; each record is stored in its own sub-directory of ./downloads.
keywords = ["climate", "data"]
output_directory = "./downloads"
download_via_keywords(keywords, output_dir=output_directory, max_records=2,sort="bestmatch")

Searching Zenodo for: climate AND data

Found 31040 results. Will download files from up to 2 records.


Downloading record 1/2: DDF_test_data
Downloading files for record: DDF_test_data
Found 8 file(s).
Downloading: events_2021070800-2021083123_T5.csv (0.8 KB)
Saved to: downloads/1_DDF_test_data/events_2021070800-2021083123_T5.csv
Downloading: 2021_2021_DDF.nc (19.0 KB)
Saved to: downloads/1_DDF_test_data/2021_2021_DDF.nc
Downloading: objects_08.tar (23.6 MB)
Saved to: downloads/1_DDF_test_data/objects_08.tar
Downloading: 2021_2021_KO_params.nc (23.8 KB)
Saved to: downloads/1_DDF_test_data/2021_2021_KO_params.nc
Downloading: 2021_iams.nc (18.7 KB)
Saved to: downloads/1_DDF_test_data/2021_iams.nc
Downloading: dummy_data.nc (47.6 KB)
Saved to: downloads/1_DDF_test_data/dummy_data.nc
Downloading: objects_202107080000-202108312300_T5.csv (130.8 KB)
Saved to: downloads/1_DDF_test_data/objects_202107080000-202108312300_T5.csv
Downloading: objects_07.tar (19.8 MB)
Saved to: downloads/1_DDF_t