In [1]:
import os
import sys
import json
import hashlib
import requests


In [None]:
# Root URL of the HTTP directory listing; passed to get_files_from_url() below
# as the starting point of the crawl.
base_url = "https://web.corral.tacc.utexas.edu/setxuifl/"
from bs4 import BeautifulSoup
import urllib.parse

# Byte multipliers for the single-letter unit suffixes of the listing's size column.
_SIZE_MULTIPLIERS = {"B": 1, "K": 1024, "M": 1024 ** 2, "G": 1024 ** 3, "T": 1024 ** 4}


def _parse_listing_size(size_text):
    """Convert a human-readable size cell (e.g. ``"25.4M"``) into a byte count.

    Accepts a bare number (taken as bytes) or a number followed by one of the
    B/K/M/G/T suffixes, optionally written as ``KB``/``KiB`` etc.  Units are
    powers of 1024.  Anything that cannot be parsed (``"-"``, empty text, ...)
    yields 0.
    """
    text = size_text.strip().upper().replace(" ", "")

    # Reduce "KIB"/"KB"-style suffixes to the single unit letter.
    if text.endswith("IB"):
        text = text[:-2]
    elif text.endswith("B") and text[-2:-1] in ("K", "M", "G", "T"):
        text = text[:-1]

    unit = text[-1:]
    if unit in _SIZE_MULTIPLIERS:
        number, multiplier = text[:-1], _SIZE_MULTIPLIERS[unit]
    else:
        number, multiplier = text, 1

    try:
        return int(float(number) * multiplier)
    except (ValueError, OverflowError):
        return 0


def get_files_from_url(base_url, timeout=30, manifest_dir="manifests/"):
    """Recursively collect file records from an HTML table directory listing.

    Every ``<tr>`` with at least four ``<td>`` cells is treated as an entry:
    cell 0 holds the link, cell 2 the size and cell 3 the type.  Rows whose type
    is ``Directory`` are crawled recursively and a JSON manifest of their files
    is written to ``<manifest_dir>/<directory name>-manifest.json``.

    Args:
        base_url: URL of the listing page to crawl.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the crawl indefinitely.
        manifest_dir: Directory that receives the per-directory manifests; it is
            created if it does not exist.

    Returns:
        A list of dicts with the keys ``sha512`` (always the literal
        ``"placeholder_hash"``), ``filename``, ``url`` and ``length`` (bytes,
        derived from the listing's rounded size column, 0 when unparseable).

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``response.raise_for_status()``).
        OSError: if a manifest file cannot be written.
    """
    files = []
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 4:  # header / separator rows do not have enough columns
            continue

        link = cols[0].find('a')
        if link is None:
            continue

        href = link.get('href')
        # Tolerate surrounding whitespace and a trailing slash in the link text
        # ("sub/"), both for the parent-directory check and the manifest name.
        clean_name = link.text.strip().rstrip('/')
        if not href or clean_name == '..':
            continue

        entry_url = urllib.parse.urljoin(base_url, href)
        file_type = cols[3].text.strip()

        if file_type == 'Directory':
            subdir_files = get_files_from_url(
                entry_url, timeout=timeout, manifest_dir=manifest_dir
            )
            files.extend(subdir_files)

            # NOTE(review): the manifest is keyed by the directory name only, so
            # two directories with the same name at different depths write to the
            # same file and the later one wins.
            os.makedirs(manifest_dir, exist_ok=True)
            manifest_path = os.path.join(manifest_dir, f"{clean_name}-manifest.json")
            with open(manifest_path, "w") as f:
                json.dump(subdir_files, f, indent=4)
        else:
            files.append({
                "sha512": "placeholder_hash",
                "filename": link.text,
                "url": entry_url,
                "length": _parse_listing_size(cols[2].text),
            })
    return files


# Example usage: crawl the listing rooted at base_url
file_list = get_files_from_url(base_url)

# Echo every collected record, one per line
for file in file_list:
    print(file)

# Save the combined listing as manifest.json in the current directory
manifest_path = os.path.join(".", "manifest.json")
with open(manifest_path, "w") as f:
    f.write(json.dumps(file_list, indent=4))


{'sha512': 'placeholder_hash', 'filename': 'tracks_NA_ACCESS-CM2_historical_r1i1p1f1_196401_201412.nc', 'url': 'https://web.corral.tacc.utexas.edu/setxuifl/tropical_cyclones/downscaled_cmip6_tracks/historical/ACCESS-CM2/tracks_NA_ACCESS-CM2_historical_r1i1p1f1_196401_201412.nc', 'length': 26633830}
{'sha512': 'placeholder_hash', 'filename': 'tracks_NA_ACCESS-ESM1-5_historical_r1i1p1f1_196401_201412.nc', 'url': 'https://web.corral.tacc.utexas.edu/setxuifl/tropical_cyclones/downscaled_cmip6_tracks/historical/ACCESS-ESM1-5/tracks_NA_ACCESS-ESM1-5_historical_r1i1p1f1_196401_201412.nc', 'length': 26633830}
{'sha512': 'placeholder_hash', 'filename': 'tracks_NA_CESM2_historical_r1i1p1f1_196401_201412.nc', 'url': 'https://web.corral.tacc.utexas.edu/setxuifl/tropical_cyclones/downscaled_cmip6_tracks/historical/CESM2/tracks_NA_CESM2_historical_r1i1p1f1_196401_201412.nc', 'length': 26633830}
{'sha512': 'placeholder_hash', 'filename': 'tracks_NA_CESM2-WACCM_historical_r1i1p1f1_196401_201412.nc', '

In [None]:
%%file metadata.toml
# {dset} will be replaced with the dataset name
title="Dataset - SETx-UIFL Datasets {dset}"
author="Will Mobley"
description="A series of datasets for the {dset} component"
date_created="2024-11-8"