In [None]:
import polars as pl
import requests
from io import StringIO
from rdflib import Graph
import os

def read_rdf_from_url(url: str, timeout: float = 30.0) -> pl.DataFrame:
    """Fetch a JSON-LD document from a URL and return its triples as a polars DataFrame.

    Args:
        url: Address of the JSON-LD resource to download.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without it a stalled server
            would block the notebook indefinitely.

    Returns:
        A DataFrame with one row per RDF triple and the string columns
        ``subject``, ``predicate`` and ``object``. An empty graph yields an
        empty frame that still carries these three columns.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page

    # Only JSON-LD is attempted; there is no fallback to other RDF
    # serialisations, so a non-JSON-LD payload makes the parse call fail.
    g = Graph()
    g.parse(data=response.text, format="json-ld")

    # Flatten every triple into a plain-string record
    data = [{"subject": str(s), "predicate": str(p), "object": str(o)} for s, p, o in g]

    # The explicit schema keeps the column layout stable even when the graph
    # holds no triples (otherwise the result would have no columns at all).
    return pl.DataFrame(
        data,
        schema={"subject": pl.Utf8, "predicate": pl.Utf8, "object": pl.Utf8},
    )

def read_csv_from_url(url: str, timeout: float = 30.0) -> pl.DataFrame:
    """Download a CSV file from a URL and parse it into a polars DataFrame.

    Args:
        url: Address of the CSV resource to download.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without it a stalled server
            would block the notebook indefinitely.

    Returns:
        The parsed CSV content as a DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page

    # For text/* responses without a declared charset, requests falls back to
    # ISO-8859-1, which garbles non-ASCII characters in UTF-8 payloads.
    # Assume UTF-8 in that case; a charset declared by the server is respected.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = "utf-8"

    csv_data = StringIO(response.text)
    return pl.read_csv(csv_data)

# Make sure every working directory used by this notebook exists.
# Directories that are already present are left untouched.
folders = ["mappings", "model", "resources", "wmdr2", "temp"]
for folder in folders:
    os.makedirs(name=folder, exist_ok=True)

In [None]:
# Download the currently valid WMDR code tables from codes.wmo.int/wmdr and
# keep a CSV copy of each one under resources/.

# --- ObservedVariableAtmosphere ---
url = "https://codes.wmo.int/wmdr/ObservedVariableAtmosphere?_format=csv&status=valid"
df_wmdr_ObservedVariableAtmosphere = read_csv_from_url(url)
df_wmdr_ObservedVariableAtmosphere.write_csv("resources/wmdr_observed_variable_atmosphere.csv")

# --- Domain ---
url = "https://codes.wmo.int/wmdr/Domain?_format=csv&status=valid"
df_wmdr_Domain = read_csv_from_url(url)
df_wmdr_Domain.write_csv("resources/wmdr_domain.csv")

# --- Geometry ---
url = "https://codes.wmo.int/wmdr/Geometry?_format=csv&status=valid"
df_wmdr_Geometry = read_csv_from_url(url)
df_wmdr_Geometry.write_csv("resources/wmdr_geometry.csv")

# --- Matrix ---
url = "https://codes.wmo.int/wmdr/Matrix?_format=csv&status=valid"
df_wmdr_Matrix = read_csv_from_url(url)
df_wmdr_Matrix.write_csv("resources/wmdr_matrix.csv")

In [None]:
# Lookup tables describing the I-ADOPT model
# (cf. https://umltool.ogc.org/index.php?m=7&o=2F3BC921-1482-4aab-B393-5FFDB6186EA9)

# Classes: one numeric id per class name
df_iadopt_classes = pl.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "name": ["Variable", "Property", "Entity", "Constraint", "VariableSet"],
    }
)
df_iadopt_classes.write_csv("model/iadopt_class.csv")

# Entity roles: the 'unclear' role carries no value (null)
df_iadopt_roles = pl.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "name": ["unclear", "object_of_interest", "matrix", "context"],
        "value": [None, 1, 2, 4],
    }
)
df_iadopt_roles.write_csv("model/iadopt_role.csv")


In [None]:
# Disaggregate existing WMDR ObservedVariableAtmosphere types using the mappings as suggested by Morgan and Gao, March 2025
df_mapping = pl.read_csv("mappings/WMDR_ObservedVariableAtmosphere_IADOPT_mapping_v01.csv")
# Remove these columns from the mapping before it is combined with the WMDR table
df_mapping = df_mapping.drop(["SeqNum", "@id", "@notation", "@status", "dct:description", "rdf:type", "rdfs:label"])
# A bare expression in the middle of a cell is never rendered, so show the
# remaining mapping columns explicitly.
display(df_mapping.schema)

# how="align" lets polars determine the key columns the two frames share and
# align their rows on them.
df = pl.concat([df_wmdr_ObservedVariableAtmosphere, df_mapping], how="align")
display(df.head())

In [None]:
# Turn the text in 'I-ADOPT Type' into boolean role flags, then discard the
# columns that the following split does not need.
# NOTE(review): this cell overwrites `df` and drops 'I-ADOPT Type', so running
# it a second time without re-running the previous cell fails.
df = df.with_columns(
    has_ooi=pl.col("I-ADOPT Type").str.contains("OoI"),
    has_property=pl.col("I-ADOPT Type").str.contains("Property"),
)
df = df.drop(["@id", "rdf:type", "I-ADOPT Type", "OoI"])
df.describe()

In [None]:
# Extract information from the df dataframe, but leave additional columns untouched for now. Eventually, the notations will be removed and new ones coined.
# NOTE(review): rows whose has_ooi/has_property flags are null match none of
# the three filters below and therefore end up in no output file - confirm
# that this is intended.

# extract OoI: rows flagged as object of interest only
df_wmdr2_object_of_interest_types = df.filter((pl.col('has_ooi')==True) & (pl.col("has_property")==False))
# Fixed: the result was previously assigned to `df_wmdr_object_of_interest_types`
# (missing "2"), so the helper columns were never removed from the frame that
# is displayed and written below.
df_wmdr2_object_of_interest_types = df_wmdr2_object_of_interest_types.drop(["has_ooi", "has_property", "Property"])
display(df_wmdr2_object_of_interest_types.head())
df_wmdr2_object_of_interest_types.write_csv(file="temp/wmdr2_object_of_interest.csv")

# extract property: rows flagged as property only
df_wmdr2_property_types = df.filter((pl.col('has_ooi')==False) & (pl.col("has_property")==True))
df_wmdr2_property_types = df_wmdr2_property_types.drop(["has_ooi", "has_property"])
display(df_wmdr2_property_types.head())
df_wmdr2_property_types.write_csv(file="temp/wmdr2_property.csv")

# extract *mixed* types (both flags set, or neither). These will need to be
# addressed manually, so the flag columns are kept for inspection.
df_wmdr2_mixed_types =  df.filter(
    ((pl.col('has_ooi')==True) & (pl.col("has_property")==True))
    | ((pl.col('has_ooi')==False) & (pl.col("has_property")==False)))
display(df_wmdr2_mixed_types.head())
df_wmdr2_mixed_types.write_csv(file="temp/wmdr2_mixed.csv")

In [None]:
# Retrieve vocabularies from ACTRIS (JSON-LD via the Skosmos REST API) and
# store the triples of each vocabulary as CSV under resources/.

# property types
url = "https://vocabulary.actris.nilu.no/skosmos/rest/v1/actris_vocab/data?uri=https%3A%2F%2Fvocabulary.actris.nilu.no%2Factris_vocab%2Fvariablepropertyofinterest&format=application/ld%2Bjson"
df_actris_property_types = read_rdf_from_url(url)
df_actris_property_types.write_csv(file="resources/actris_property.csv")

# object_of_interest types
url = "https://vocabulary.actris.nilu.no/skosmos/rest/v1/actris_vocab/data?uri=https%3A%2F%2Fvocabulary.actris.nilu.no%2Factris_vocab%2Fobjectofinterest&format=application/ld%2Bjson"
df_actris_object_of_interest_types = read_rdf_from_url(url)
df_actris_object_of_interest_types.write_csv(file="resources/actris_object_of_interest.csv")

# (observation) geometry types
url = "https://vocabulary.actris.nilu.no/skosmos/rest/v1/actris_vocab/data?uri=https%3A%2F%2Fvocabulary.actris.nilu.no%2Factris_vocab%2Fvariablegeometry&format=application/ld%2Bjson"
df_actris_geometry_types = read_rdf_from_url(url)
df_actris_geometry_types.write_csv(file="resources/actris_geometry.csv")

# matrix types
url = "https://vocabulary.actris.nilu.no/skosmos/rest/v1/actris_vocab/data?uri=https%3A%2F%2Fvocabulary.actris.nilu.no%2Factris_vocab%2Fvariablematrix&format=application/ld%2Bjson"
df_actris_matrix_types = read_rdf_from_url(url)
df_actris_matrix_types.write_csv(file="resources/actris_matrix.csv")

# variable constraint types
url = "https://vocabulary.actris.nilu.no/skosmos/rest/v1/actris_vocab/data?uri=https%3A%2F%2Fvocabulary.actris.nilu.no%2Factris_vocab%2Fvariableconstraints&format=application/ld%2Bjson"
df_actris_constraint_types = read_rdf_from_url(url)
df_actris_constraint_types.write_csv(file="resources/actris_constraint.csv")

# variable group types
url = "https://vocabulary.actris.nilu.no/skosmos/rest/v1/actris_vocab/data?uri=https%3A%2F%2Fvocabulary.actris.nilu.no%2Factris_vocab%2Fvariablegroup&format=application/ld%2Bjson"
# Fixed: the argument was misspelled as `ur`, which raised a NameError
df_actris_variable_group_types = read_rdf_from_url(url)
df_actris_variable_group_types.write_csv(file="resources/actris_variable_group.csv")