In [1]:
import pandas as pd
import requests
import os 
from bs4 import BeautifulSoup
from io import StringIO

In [2]:
# get table from html page & turn into dataframe
def htmltable_to_dataframe(url:str) -> pd.DataFrame:
    # Send request to the webpage
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract tbale as string & wrap in a StringIO object
    table_str = str(soup.find('table'))
    table_io = StringIO(table_str)

    # Read HTML table into DataFrame
    return pd.read_html(table_io)[0]


In [3]:
#get newly constructed urls from folder names
def get_urls(folder_names:list[str], base_url:str)-> list:
    return [base_url+name for name in folder_names if str(name).endswith('/')]

In [4]:
def process_directory(url: str, base_url: str) -> None:
    """Recursively processes directories to fetch and parse HTML tables into DataFrames."""
    print(f"Processing: {url}")

    # Initialize statistics for this directory
    stats = {
        'url': url,
        'num_subdirectories': 0,
        'num_files': 0,
        'subdirectory_stats': []
    }

    # Get DataFrame from the current HTML table
    df = htmltable_to_dataframe(url)
    
    # Count the number of files (entries not ending with '/')
    stats['num_files'] = df['Name'].apply(lambda x: not str(x).endswith('/')).sum()

    # Get URLs of subdirectories
    folder_urls = get_urls(list(df['Name']), base_url)
    stats['num_subdirectories'] = len(folder_urls)
    # Process each subdirectory recursively and collect their stats
    for folder_url in folder_urls:
        print(f"Found folder: {folder_url}")
        subdirectory_stats = process_directory(folder_url, folder_url)
        stats['subdirectory_stats'].append(subdirectory_stats)

    return stats


### Argo HTML file server

In [5]:
# analysed endpoint:
argo_url = "https://data-argo.ifremer.fr/"

In [6]:
# Read HTML table into DataFrame
argo_fileserver_df = htmltable_to_dataframe(argo_url)
argo_fileserver_df

Unnamed: 0.1,Unnamed: 0,Name,Last modified,Size,Description
0,,,,,
1,,ar_greylist.txt,20-Aug-2024 15:13,138K,
2,,ar_index_global_meta.txt,21-Aug-2024 02:23,962K,
3,,ar_index_global_meta.txt.gz,21-Aug-2024 02:23,153K,
4,,ar_index_global_prof.txt,21-Aug-2024 08:25,267M,
5,,ar_index_global_prof.txt.gz,21-Aug-2024 08:25,50M,
6,,ar_index_global_tech.txt,20-Aug-2024 18:29,862K,
7,,ar_index_global_tech.txt.gz,20-Aug-2024 18:29,167K,
8,,ar_index_global_traj.txt,20-Aug-2024 10:55,1.7M,
9,,ar_index_global_traj.txt.gz,20-Aug-2024 10:55,469K,


In [37]:
print(list(argo_fileserver_df['Name']))
folder_urls = get_urls(list(argo_fileserver_df['Name']), argo_url)

for url in folder_urls:
    print(url)

    df = htmltable_to_dataframe(url)

    new_urls = get_urls(list(df['Name']), url)
    print("urls: ", new_urls)

    for url2 in new_urls:
        df = htmltable_to_dataframe(url2)

        new_urls = get_urls(list(df['Name']), url2)
        print("new urls: ", new_urls)

[nan, 'ar_greylist.txt', 'ar_index_global_meta.txt', 'ar_index_global_meta.txt.gz', 'ar_index_global_prof.txt', 'ar_index_global_prof.txt.gz', 'ar_index_global_tech.txt', 'ar_index_global_tech.txt.gz', 'ar_index_global_traj.txt', 'ar_index_global_traj.txt.gz', 'ar_index_this_week_meta.txt', 'ar_index_this_week_prof.txt', 'argo_bio-profile_index.txt', 'argo_bio-profile_index.txt.gz', 'argo_bio-traj_index.txt', 'argo_bio-traj_index.txt.gz', 'argo_synthetic-profile_index.txt', 'argo_synthetic-profile_index.txt.gz', 'aux/', 'dac/', 'etc/', 'geo/', 'latest_data/', 'readme_before_using_the_data.txt', nan]
https://data-argo.ifremer.fr/aux/
urls:  ['https://data-argo.ifremer.fr/aux/aoml/', 'https://data-argo.ifremer.fr/aux/bodc/', 'https://data-argo.ifremer.fr/aux/coriolis/', 'https://data-argo.ifremer.fr/aux/csio/', 'https://data-argo.ifremer.fr/aux/incois/', 'https://data-argo.ifremer.fr/aux/meds/']
new urls:  ['https://data-argo.ifremer.fr/aux/aoml/1901608/', 'https://data-argo.ifremer.fr/a

In [7]:
# Example usage
argo_url = 'https://data-argo.ifremer.fr/'  # Base URL for the ARGO file server
start_url = argo_url   # Starting URL (replace 'directory_name' with the actual directory)

process_directory(start_url, argo_url)

Processing: https://data-argo.ifremer.fr/
Found folder: https://data-argo.ifremer.fr/aux/
Processing: https://data-argo.ifremer.fr/aux/
Found folder: https://data-argo.ifremer.fr/aux/aoml/
Processing: https://data-argo.ifremer.fr/aux/aoml/
Found folder: https://data-argo.ifremer.fr/aux/aoml/1901608/
Processing: https://data-argo.ifremer.fr/aux/aoml/1901608/
Found folder: https://data-argo.ifremer.fr/aux/aoml/1901609/
Processing: https://data-argo.ifremer.fr/aux/aoml/1901609/
Found folder: https://data-argo.ifremer.fr/aux/aoml/1901610/
Processing: https://data-argo.ifremer.fr/aux/aoml/1901610/
Found folder: https://data-argo.ifremer.fr/aux/aoml/1901611/
Processing: https://data-argo.ifremer.fr/aux/aoml/1901611/
Found folder: https://data-argo.ifremer.fr/aux/aoml/1901612/
Processing: https://data-argo.ifremer.fr/aux/aoml/1901612/
Found folder: https://data-argo.ifremer.fr/aux/aoml/1901613/
Processing: https://data-argo.ifremer.fr/aux/aoml/1901613/
Found folder: https://data-argo.ifremer.

SSLError: HTTPSConnectionPool(host='data-argo.ifremer.fr', port=443): Max retries exceeded with url: /dac/aoml/1900432/ (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))

In [3]:
# Function to get the HTML content of a webpage
def get_html(url):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    return response.text

# Function to find all links on a webpage
def find_links(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    links = soup.find_all('a')
    return [link.get('href') for link in links if link.get('href')]

# Function to download a file
def download_file(url, dest_folder):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    filename = os.path.join(dest_folder, url.split('/')[-1])
    with open(filename, 'wb') as file:
        file.write(response.content)



In [5]:
# Navigating HTTP server

base_url = "https://data-argo.ifremer.fr/"

# Example usage steps
# 1. Get the main page HTML content & Find all links on the main page
main_page_html = get_html(base_url)
links = find_links(main_page_html)

# 3. Filter links of interest (e.g., links to specific data files or directories)
# For demonstration, assume we are interested in links containing 'argo'
argo_links = [link for link in links if 'argo' in link]

# 4. Navigate further or download files from filtered links
for link in argo_links:
    full_url = base_url + link
    print(f"Processing: {full_url}")
    # Check if the link is a directory or a file and act accordingly
    if full_url.endswith('/'):
        # It's a directory; you might want to navigate further
        sub_page_html = get_html(full_url)
        sub_links = find_links(sub_page_html)
        # Process sub-links as needed
    else:
        # It's a file; download it
        download_file(full_url, 'ARGO_httpserver_files')

print("Done")

Processing: https://data-argo.ifremer.fr/argo_bio-profile_index.txt
Processing: https://data-argo.ifremer.fr/argo_bio-profile_index.txt.gz
Processing: https://data-argo.ifremer.fr/argo_bio-traj_index.txt
Processing: https://data-argo.ifremer.fr/argo_bio-traj_index.txt.gz
Processing: https://data-argo.ifremer.fr/argo_synthetic-profile_index.txt
Processing: https://data-argo.ifremer.fr/argo_synthetic-profile_index.txt.gz
Done


In [12]:
# would have to write code to process each url resulting 
# from concatenation base url with file name in 'file' column of those txt files

# manually navigate to single example url: https://data-argo.ifremer.fr/dac/aoml/13857/13857_*.nc

import os
from netCDF4 import Dataset
# Folder path where files are stored
folder_path = "ARGO_httpserver_files/example/"
# List all files in the directory
file_list = os.listdir(folder_path)

# Filter for .nc files
nc_files = [file for file in file_list if file.endswith('.nc')]

# Read and print the content of each .nc file
for file_name in nc_files:
    file_path = os.path.join(folder_path, file_name)
    print(f"Reading file: {file_name}")
    
    # Open the .nc file
    with Dataset(file_path, mode='r') as nc_file:
        # Print file information
        print(nc_file)

        # Print variable names
        print("\nVariables:")
        print([var for var in nc_file.variables])

        # Count dimensions, variables, and global attributes (note each variables can have specific attributes)
        num_dimensions = len(nc_file.dimensions)
        num_variables = len(nc_file.variables)
        num_global_attributes = len(nc_file.ncattrs())
        
        print(f"\nNumber of dimensions: {num_dimensions}")
        print(f"Number of variables: {num_variables}")
        print(f"Number of global attributes: {num_global_attributes}")



Reading file: 13857_tech.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    title: Argo float technical data file
    institution: AOML
    source: Argo float
    history: 2021-04-28T20:03:35Z creation
    references: http://www.argodatamgt.org/Documentation
    comment: free text
    user_manual_version: 3.1
    Conventions: Argo-3.1 CF-1.6
    dimensions(sizes): DATE_TIME(14), STRING128(128), STRING32(32), STRING8(8), STRING4(4), STRING2(2), N_TECH_PARAM(421)
    variables(dimensions): |S1 PLATFORM_NUMBER(STRING8), |S1 DATA_TYPE(STRING32), |S1 FORMAT_VERSION(STRING4), |S1 HANDBOOK_VERSION(STRING4), |S1 DATA_CENTRE(STRING2), |S1 DATE_CREATION(DATE_TIME), |S1 DATE_UPDATE(DATE_TIME), |S1 TECHNICAL_PARAMETER_NAME(N_TECH_PARAM, STRING128), |S1 TECHNICAL_PARAMETER_VALUE(N_TECH_PARAM, STRING128), int32 CYCLE_NUMBER(N_TECH_PARAM)
    groups: 

Variables:
['PLATFORM_NUMBER', 'DATA_TYPE', 'FORMAT_VERSION', 'HANDBOOK_VERSION', 'DATA_CENTRE', 

In [13]:
#explore 13857_prof.nc file in search of observation data points of a parameter 
file_path = os.path.join("ARGO_httpserver_files/example/", "13857_prof.nc")
print("Reading file: 13857_prof.nc")

# Open the .nc file
with Dataset(file_path, mode='r') as nc_file:
    # Print file information
    print(nc_file)

    # Print variable names
    print("\nVariables:")
    print([var for var in nc_file.variables])

    # Count dimensions, variables, and global attributes (note each variables can have specific attributes)
    num_dimensions = len(nc_file.dimensions)
    num_variables = len(nc_file.variables)
    num_global_attributes = len(nc_file.ncattrs())
    
    print(f"\nNumber of dimensions: {num_dimensions}")
    print(f"Number of variables: {num_variables}")
    print(f"Number of global attributes: {num_global_attributes}")


Reading file: 13857_prof.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    title: Argo float vertical profile
    institution: FR GDAC
    source: Argo float
    history: 2019-10-24T23:47:09Z creation
    references: http://www.argodatamgt.org/Documentation
    user_manual_version: 3.1
    Conventions: Argo-3.1 CF-1.6
    featureType: trajectoryProfile
    dimensions(sizes): DATE_TIME(14), STRING256(256), STRING64(64), STRING32(32), STRING16(16), STRING8(8), STRING4(4), STRING2(2), N_PROF(140), N_PARAM(2), N_LEVELS(113), N_CALIB(1), N_HISTORY(0)
    variables(dimensions): |S1 DATA_TYPE(STRING16), |S1 FORMAT_VERSION(STRING4), |S1 HANDBOOK_VERSION(STRING4), |S1 REFERENCE_DATE_TIME(DATE_TIME), |S1 DATE_CREATION(DATE_TIME), |S1 DATE_UPDATE(DATE_TIME), |S1 PLATFORM_NUMBER(N_PROF, STRING8), |S1 PROJECT_NAME(N_PROF, STRING64), |S1 PI_NAME(N_PROF, STRING64), |S1 STATION_PARAMETERS(N_PROF, N_PARAM, STRING16), int32 CYCLE_NUMBER(N_PROF), |S1 

### Analysis results:

- data is findable & accessible
    - endpoint is a file server, which is navigatable via html hrefs
    - more intended for human navigation rather then automated machine access (other services better)

- data granularity:
    - to data point level, describing measurements/observations
    - does require addiontal step of combining base-url and paths mentioned in the files


- semantics of metadata:
    - different levels of metadata: on the files, on the data itself 
    - metadata of data itself limited 
    - knowledge on the data model used to structure the data is required to be able to use the data
    - use of codes rather than externally defined standard terms, e.g. BODC term for temperature, 
        - makes semantics unambiguous 
        - more difficult to integrate with other data sources
        - options for standard terms: e.g. BODC term for temperature
    - in some cases the semantics are not clear, e.g. use of codes for ocean, parameters, ... 
    - information relating to semantics can be found at different location, namely in the documentation: https://argo.ucsd.edu/data/data-from-gdacs/