In [1]:
import pandas as pd

### Argo HTTP file server

In [1]:
# Root URL of the Argo data file server
argo_url = "https://data-argo.ifremer.fr/"

In [3]:
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Function to get the HTML content of a webpage
def get_html(url, timeout=30):
    """Fetch a web page and return its body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout ``requests`` waits indefinitely, which would hang
        the notebook on an unresponsive server.

    Returns
    -------
    str
        The decoded response body (``response.text``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    return response.text

# Function to collect the target of every link on a webpage
def find_links(html_content):
    """Return the ``href`` value of each anchor tag found in ``html_content``.

    Anchors without an ``href`` attribute, or with an empty one, are left out.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    hrefs = []
    for anchor in document.find_all('a'):
        target = anchor.get('href')
        if target:
            hrefs.append(target)
    return hrefs

# Function to download a file
def download_file(url, dest_folder, timeout=30, chunk_size=1 << 20):
    """Download ``url`` into ``dest_folder`` and return the path written.

    Parameters
    ----------
    url : str
        Address of the file to download.
    dest_folder : str
        Folder to store the file in; created (with parents) if missing.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed to ``requests.get`` so an
        unresponsive server cannot hang the notebook.
    chunk_size : int, optional
        Number of bytes read from the response per write.

    Returns
    -------
    str
        Path of the downloaded file inside ``dest_folder``.

    Raises
    ------
    ValueError
        If no file name can be derived from ``url`` (e.g. it ends with '/').
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    # Use only the last path segment so that a query string or fragment
    # does not end up in the local file name.
    name = os.path.basename(urlsplit(url).path)
    if not name:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(dest_folder, exist_ok=True)
    filename = os.path.join(dest_folder, name)
    partial = filename + '.part'

    try:
        # Stream the body to disk chunk by chunk instead of holding the whole
        # file in memory first.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Ensure we notice bad responses
            with open(partial, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
        # Only expose the final name once the download is complete
        os.replace(partial, filename)
    finally:
        # After a failure, do not leave a truncated file behind
        if os.path.exists(partial):
            os.remove(partial)
    return filename



In [5]:
# Navigating the HTTP server

base_url = "https://data-argo.ifremer.fr/"

# Example usage steps
# 1. Get the main page HTML content and find all links on it
main_page_html = get_html(base_url)
links = find_links(main_page_html)

# 2. Filter links of interest (e.g., links to specific data files or directories)
# For demonstration, assume we are interested in links containing 'argo'
argo_links = [link for link in links if 'argo' in link]

# 3. Navigate further or download files from the filtered links
for link in argo_links:
    # urljoin resolves the href against base_url: relative names are appended,
    # while absolute or root-relative hrefs are not mangled the way plain
    # string concatenation would mangle them.
    full_url = urljoin(base_url, link)
    print(f"Processing: {full_url}")
    # Check if the link is a directory or a file and act accordingly
    if full_url.endswith('/'):
        # It's a directory; you might want to navigate further
        sub_page_html = get_html(full_url)
        sub_links = find_links(sub_page_html)
        # Process sub-links as needed
    else:
        # It's a file; download it
        download_file(full_url, 'ARGO_httpserver_files')

print("Done")

Processing: https://data-argo.ifremer.fr/argo_bio-profile_index.txt
Processing: https://data-argo.ifremer.fr/argo_bio-profile_index.txt.gz
Processing: https://data-argo.ifremer.fr/argo_bio-traj_index.txt
Processing: https://data-argo.ifremer.fr/argo_bio-traj_index.txt.gz
Processing: https://data-argo.ifremer.fr/argo_synthetic-profile_index.txt
Processing: https://data-argo.ifremer.fr/argo_synthetic-profile_index.txt.gz
Done


In [12]:
# A complete crawl would need code that processes every URL obtained by
# joining the base URL with the file name in the 'file' column of those txt files

# manually navigate to a single example URL: https://data-argo.ifremer.fr/dac/aoml/13857/13857_*.nc

import os
from netCDF4 import Dataset
# Folder path where files are stored
folder_path = "ARGO_httpserver_files/example/"
# List all files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to report the files in the same order on every run.
file_list = sorted(os.listdir(folder_path))

# Filter for .nc files
nc_files = [file for file in file_list if file.endswith('.nc')]

# Read and print the content of each .nc file
for file_name in nc_files:
    file_path = os.path.join(folder_path, file_name)
    print(f"Reading file: {file_name}")

    # Open the .nc file read-only; the context manager closes it again
    with Dataset(file_path, mode='r') as nc_file:
        # Print file information
        print(nc_file)

        # Print variable names
        print("\nVariables:")
        print(list(nc_file.variables))

        # Count dimensions, variables, and global attributes
        # (note that each variable can have its own specific attributes)
        num_dimensions = len(nc_file.dimensions)
        num_variables = len(nc_file.variables)
        num_global_attributes = len(nc_file.ncattrs())

        print(f"\nNumber of dimensions: {num_dimensions}")
        print(f"Number of variables: {num_variables}")
        print(f"Number of global attributes: {num_global_attributes}")



Reading file: 13857_tech.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    title: Argo float technical data file
    institution: AOML
    source: Argo float
    history: 2021-04-28T20:03:35Z creation
    references: http://www.argodatamgt.org/Documentation
    comment: free text
    user_manual_version: 3.1
    Conventions: Argo-3.1 CF-1.6
    dimensions(sizes): DATE_TIME(14), STRING128(128), STRING32(32), STRING8(8), STRING4(4), STRING2(2), N_TECH_PARAM(421)
    variables(dimensions): |S1 PLATFORM_NUMBER(STRING8), |S1 DATA_TYPE(STRING32), |S1 FORMAT_VERSION(STRING4), |S1 HANDBOOK_VERSION(STRING4), |S1 DATA_CENTRE(STRING2), |S1 DATE_CREATION(DATE_TIME), |S1 DATE_UPDATE(DATE_TIME), |S1 TECHNICAL_PARAMETER_NAME(N_TECH_PARAM, STRING128), |S1 TECHNICAL_PARAMETER_VALUE(N_TECH_PARAM, STRING128), int32 CYCLE_NUMBER(N_TECH_PARAM)
    groups: 

Variables:
['PLATFORM_NUMBER', 'DATA_TYPE', 'FORMAT_VERSION', 'HANDBOOK_VERSION', 'DATA_CENTRE', 

In [13]:
# Look inside 13857_prof.nc for the observation data points of a parameter
file_path = os.path.join("ARGO_httpserver_files/example/", "13857_prof.nc")
print("Reading file: 13857_prof.nc")

with Dataset(file_path, mode='r') as nc_file:
    # Tally dimensions, variables and global attributes up front
    # (each variable can additionally carry attributes of its own)
    num_dimensions = len(nc_file.dimensions)
    num_variables = len(nc_file.variables)
    num_global_attributes = len(nc_file.ncattrs())

    # File information
    print(nc_file)

    # Variable names, on the line after their heading
    print("\nVariables:", list(nc_file.variables), sep="\n")

    # One summary line per count
    print(
        f"\nNumber of dimensions: {num_dimensions}",
        f"Number of variables: {num_variables}",
        f"Number of global attributes: {num_global_attributes}",
        sep="\n",
    )


Reading file: 13857_prof.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    title: Argo float vertical profile
    institution: FR GDAC
    source: Argo float
    history: 2019-10-24T23:47:09Z creation
    references: http://www.argodatamgt.org/Documentation
    user_manual_version: 3.1
    Conventions: Argo-3.1 CF-1.6
    featureType: trajectoryProfile
    dimensions(sizes): DATE_TIME(14), STRING256(256), STRING64(64), STRING32(32), STRING16(16), STRING8(8), STRING4(4), STRING2(2), N_PROF(140), N_PARAM(2), N_LEVELS(113), N_CALIB(1), N_HISTORY(0)
    variables(dimensions): |S1 DATA_TYPE(STRING16), |S1 FORMAT_VERSION(STRING4), |S1 HANDBOOK_VERSION(STRING4), |S1 REFERENCE_DATE_TIME(DATE_TIME), |S1 DATE_CREATION(DATE_TIME), |S1 DATE_UPDATE(DATE_TIME), |S1 PLATFORM_NUMBER(N_PROF, STRING8), |S1 PROJECT_NAME(N_PROF, STRING64), |S1 PI_NAME(N_PROF, STRING64), |S1 STATION_PARAMETERS(N_PROF, N_PARAM, STRING16), int32 CYCLE_NUMBER(N_PROF), |S1 

#### Notes/findings:
- file server, navigable via HTML hrefs
- looks like it is intended for human navigation
- requires knowledge of the data model
- file metadata is limited
  - doesn't describe the content of the files
  - semantics are not clear
    - frequent use of codes (e.g. ocean, parameters, ...)
    - limited use of/no reference to standard terms
- data granularity goes down to data point level (but requires combining the base URL with the paths mentioned in the files)


Some information on the semantics can be found in the documentation: https://argo.ucsd.edu/data/data-from-gdacs/