<a target="_blank" href="https://colab.research.google.com/github/umanitoba-meagher-projects/public-experiments/blob/main/jupyter-notebooks/Borealis_connection_test.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Borealis connection test

- This notebook tests downloading public content hosted on Borealis using Python.
- The first step is to print a list of the files in the dataset. This list provides the file ID that is needed to download a given file from the dataset.

In [None]:
import requests
import pandas as pd

# Base URL used to build every API request in this notebook.
# No API token needed for public datasets!
BOREALIS_SERVER = "https://borealisdata.ca"

def get_public_dataset_info(persistent_id, timeout=30):
    """
    List the files in the latest version of a public dataset.

    Despite the name, this returns the dataset's file list rather than the
    full dataset metadata. No API token is sent.

    Args:
        persistent_id: Persistent identifier of the dataset,
            e.g. "doi:10.5683/SP3/H3HGWF".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of {"file_id": ..., "filename": ...} dictionaries, one per
        file, or None if the request fails, the server does not answer with
        status 200, or the response does not have the expected structure.
    """
    url = f"{BOREALIS_SERVER}/api/datasets/:persistentId/"
    params = {"persistentId": persistent_id}

    # Without a timeout a stalled connection would hang the notebook cell forever.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Cannot access dataset: {exc}")
        return None

    if response.status_code != 200:
        print(f"Cannot access dataset: {response.status_code}")
        return None

    # The file entries live under data -> latestVersion -> files; each entry
    # carries its ID and name under "dataFile". Any missing piece is reported
    # the same way as an HTTP failure instead of raising.
    try:
        files_list = response.json()['data']['latestVersion']['files']
        return [
            {
                "file_id": entry['dataFile']['id'],
                "filename": entry['dataFile']['filename'],
            }
            for entry in files_list
        ]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"Cannot access dataset: unexpected response format ({exc!r})")
        return None


# Example usage
public_doi = "doi:10.5683/SP3/H3HGWF" # doi for 'Understanding Animals jupyter notebook data'
# NOTE: despite its name, dataset_info receives the function's return value,
# which is a list of {"file_id", "filename"} dictionaries (or None on failure).
dataset_info = get_public_dataset_info(public_doi)
# Printing the list shows the file id needed for the download step.
print(dataset_info)

## download a file

- The next step is to download a selected file from the dataset, using the file id acquired in the previous step.

In [None]:


def download_public_file(file_id, save_path="./", timeout=60):
    """
    Download a specific public file from a dataset by its file ID.

    No authentication is sent. The saved filename comes from the plain
    ``filename=`` parameter of the Content-Disposition response header when
    one is present; otherwise the last segment of the request URL (the file
    ID) is used. The extended ``filename*=`` form is not parsed, so a
    response carrying only that form is saved under the file ID.

    Args:
        file_id: ID of the file to download.
        save_path: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or None if the request fails or the
        server does not answer with status 200. If the connection breaks
        mid-download, a partially written file may remain on disk.
    """
    import os  # local import: the cell defining this function does not import os

    url = f"{BOREALIS_SERVER}/api/access/datafile/{file_id}"

    try:
        # The context manager releases the streamed connection even if
        # writing the file fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"❌ Error {response.status_code}: File may be restricted or not found")
                return None

            # Determine filename from headers or URL
            filename = None
            cd = response.headers.get("Content-Disposition", "")
            if "filename=" in cd:
                # Keep only this parameter's value: stop at the next ';' so
                # trailing parameters do not end up in the filename.
                raw_name = cd.split("filename=", 1)[1].split(";", 1)[0]
                raw_name = raw_name.strip().strip('"')
                # The header is server-controlled: drop any directory part so
                # the file can never be written outside save_path.
                filename = os.path.basename(raw_name.replace("\\", "/"))

            # Fallback to extracting from URL if header not available or malformed
            if not filename:
                filename = url.split("/")[-1]

            if save_path:
                os.makedirs(save_path, exist_ok=True)
            file_path = os.path.join(save_path, filename)

            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
    except requests.RequestException as exc:
        print(f"❌ Download failed: {exc}")
        return None

    print(f"✅ File downloaded to {file_path}")
    return file_path

# Download the selected file from the Borealis dataset
# file_id is a module-level name; the unzip step later rebuilds the
# downloaded file's path from it.
file_id = 965302 # file id for deer-100.zip
# The path returned by download_public_file is not captured here.
download_public_file(file_id, "./")


## download the data

After downloading the files that you need, the next step is to unzip any of them that have been zipped.

In [None]:
import os
import zipfile

def is_zip_file(filepath):
    """
    Report whether the file at the given path is a zip archive.

    Thin wrapper around zipfile.is_zipfile; returns its result unchanged.
    """
    looks_like_zip = zipfile.is_zipfile(filepath)
    return looks_like_zip

def _first_top_level_folder(member_names):
    """Return the first real top-level folder among archive member names, or None."""
    for name in member_names:
        # A leading "./" only refers to the extraction directory itself.
        while name.startswith("./"):
            name = name[2:]
        head, sep, _ = name.partition("/")
        if not sep or not head:
            # File stored at the archive root, or a name starting with "/".
            continue
        if head == "__MACOSX":
            # Metadata folder added by macOS archives, never the payload folder.
            continue
        return head
    return None


def unzip_file(filepath, extract_path="./"):
    """
    Unzips a zip file to a specified path and reports its top-level folder.

    Args:
        filepath: Path of the archive to extract.
        extract_path: Directory to extract into.

    Returns:
        The name of the first top-level folder found in the archive, exactly
        as stored in the archive (it is not joined with extract_path).
        Returns None if the file is not a zip file, if extraction fails, or
        if the archive was extracted but has no folder at its top level.
    """
    if not zipfile.is_zipfile(filepath):
        print(f"ℹ️ {filepath} is not a valid zip file.")
        return None

    try:
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            # Assumes the archive has a single top-level folder; the first
            # one encountered is reported.
            top_level_folder = _first_top_level_folder(zip_ref.namelist())
            zip_ref.extractall(extract_path)
    except Exception as e:
        # Deliberately broad: any extraction problem is reported and turned
        # into a None result rather than interrupting the notebook.
        print(f"❌ Error unzipping {filepath}: {e}")
        return None

    print(f"✅ Successfully unzipped {filepath} to {extract_path}")
    return top_level_folder

# Path of the downloaded file: the file id, used as a file name in the current directory
downloaded_file_path = f"./{file_id}"

if not is_zip_file(downloaded_file_path):
    print(f"The file {downloaded_file_path} is not a zip file.")
else:
    # Extract into the current directory and keep the reported folder name
    extracted_folder_name = unzip_file(downloaded_file_path, "./")
    if extracted_folder_name:
        print(f"Extracted folder name: {extracted_folder_name}")

## review zip file contents

- optionally review the contents of the zip file

In [41]:
import os

print("Contents of the unzipped directory (recursive listing):")

# The unzip step leaves extracted_folder_name undefined when the file was not
# a zip, and sets it to None when unzip_file had nothing to report. Handle
# both cases here instead of crashing with NameError / TypeError.
try:
    folder_to_list = extracted_folder_name
except NameError:
    folder_to_list = None

if not folder_to_list:
    print("No extracted folder to list - run the unzip step first.")
elif os.path.exists(folder_to_list):
    for root, _dirs, files in os.walk(folder_to_list):
        # Depth below the starting folder = number of separators left once
        # the starting folder's name is removed from the path.
        level = root.replace(folder_to_list, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print(f'{subindent}{f}')
else:
    print(f"The directory {folder_to_list} does not exist.")

Contents of the unzipped directory (recursive listing):
deer_100/
    deer_100/
        deer_139.jpg
        deer_287.jpg
        deer_030.jpg
        deer_051.jpg
        deer_036.jpg
        deer_169.jpg
        deer_033.jpg
        deer_194.jpg
        deer_170.jpg
        deer_339.jpg
        deer_185.jpg
        deer_035.jpg
        deer_342.jpg
        deer_198.jpg
        deer_072.jpg
        deer_065.jpg
        deer_462.jpg
        deer_012.jpg
        deer_283.jpg
        deer_187.jpg
        deer_471.jpg
        deer_207.jpg
        deer_324.jpg
        deer_093.jpg
        deer_270.jpg
        deer_048.jpg
        deer_372.jpg
        deer_498.jpg
        deer_055.jpg
        deer_162.jpg
        deer_205.jpg
        deer_239.jpg
        deer_257.jpg
        deer_264.jpg
        deer_016.jpg
        deer_217.jpg
        deer_047.jpg
        deer_496.jpg
        deer_241.jpg
        deer_238.jpg
        deer_289.jpg
        deer_204.jpg
        deer_301.jpg
        deer_223.