In [1]:
import os
import sys
import json
import hashlib
import requests


In [None]:
# Root URL of the directory listing that get_files_from_url() crawls below.
base_url = "https://web.corral.tacc.utexas.edu/setxuifl/"
from bs4 import BeautifulSoup
import urllib.parse

# Byte multipliers for the unit suffixes that can appear in the size column.
_SIZE_MULTIPLIERS = {
    'K': 1024,
    'M': 1024 ** 2,
    'G': 1024 ** 3,
    'T': 1024 ** 4,
    'P': 1024 ** 5,
}


def _parse_listing_size(size_text):
    """Convert a human-readable size such as ``'42.9M'`` into a byte count.

    A bare number is interpreted as bytes. Text that cannot be parsed (for
    example a ``'-'`` placeholder or an empty cell) yields 0.
    """
    text = size_text.strip()
    if not text:
        return 0
    multiplier = _SIZE_MULTIPLIERS.get(text[-1].upper())
    number = text[:-1] if multiplier else text
    try:
        return int(float(number) * (multiplier or 1))
    except (ValueError, OverflowError):
        # Unknown format: keep the crawl going and record an unknown length.
        return 0


def get_files_from_url(base_url, create_submanifests=True, timeout=30,
                       manifest_dir='manifests/'):
    """Scrape an HTML directory listing into a list of manifest entries.

    Every ``<tr>`` with at least four ``<td>`` cells is treated as an entry:
    the first cell holds the link, the third the size and the fourth the type.
    The ``..`` parent link is skipped.

    Args:
        base_url: URL of the listing page; relative links are resolved
            against it.
        create_submanifests: When true, each entry of type ``'Directory'`` is
            crawled recursively and its entries are written to
            ``<manifest_dir>/<name>-manifest.json``. When false, directories
            are listed but not descended into.
        timeout: Seconds to wait for the HTTP request before giving up.
        manifest_dir: Directory that receives the sub-manifest files; it is
            created if it does not exist.

    Returns:
        A list of dicts with the keys ``sha512`` (always the placeholder
        string), ``filename``, ``url``, ``length`` (bytes, 0 for directories
        and unparsable sizes) and ``type``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx answer,
        and whatever ``requests.get`` raises on connection failure/timeout.
    """
    response = requests.get(base_url, timeout=timeout)
    # Without this an error page would be parsed and silently yield [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    files = []
    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 4:
            continue  # header rows or rows with an unexpected layout

        link = cols[0].find('a')
        if link is None or link.text in ['..']:
            continue  # no link, or the parent-directory entry

        file_type = cols[3].text.strip()
        full_url = urllib.parse.urljoin(base_url, link['href'])

        if file_type == 'Directory':
            if create_submanifests:
                subdir_files = get_files_from_url(
                    full_url, create_submanifests, timeout, manifest_dir
                )
                # The target directory may not exist on a fresh checkout.
                os.makedirs(manifest_dir, exist_ok=True)
                # Drop a trailing slash so the name cannot become a sub-path.
                manifest_name = f"{link.text.rstrip('/')}-manifest.json"
                with open(os.path.join(manifest_dir, manifest_name), "w") as f:
                    json.dump(subdir_files, f, indent=4)
            length = 0
        else:
            length = _parse_listing_size(cols[2].text)

        files.append({
            "sha512": "placeholder_hash",
            "filename": link.text,
            "url": full_url,
            "length": length,
            "type": file_type
        })
    return files


# Example usage
# Create the output directory before crawling: both the per-directory
# manifests written during the crawl and the top-level manifest below are
# stored in it, and open(..., "w") does not create missing directories.
os.makedirs("./manifests/", exist_ok=True)

file_list = get_files_from_url(base_url)
for file in file_list:
    print(file)
with open(os.path.join("./manifests/", "manifest.json"), "w") as f:
    json.dump(file_list, f, indent=4)


{'sha512': 'placeholder_hash', 'filename': 'tropical_cyclones', 'url': 'https://web.corral.tacc.utexas.edu/setxuifl/tropical_cyclones/', 'length': 0, 'type': 'Directory'}
{'sha512': 'placeholder_hash', 'filename': 'tceq_air_monitor_obs_20240528.zip', 'url': 'https://web.corral.tacc.utexas.edu/setxuifl/tceq_air_monitor_obs_20240528.zip', 'length': 44983910, 'type': 'application/zip'}
{'sha512': 'placeholder_hash', 'filename': 'test.txt', 'url': 'https://web.corral.tacc.utexas.edu/setxuifl/test.txt', 'length': 0, 'type': 'text/plain'}


In [None]:
%%file metadata.toml
# {dset} will be replaced with the dataset name
title="Dataset - SETx-UIFL Datasets {dset}"
author="Will Mobley"
description="A series of datasets for the {dset} component"
date_created="2024-11-08"