In [1]:
import sys
!{sys.executable} -m pip install rdflib



In [45]:
import sys
!{sys.executable} -m pip install pandas numpy

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.2.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Downloading numpy-2.2.3-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------- ----------------------- 5.2/12.6 MB 26.6 MB/s eta 0:00:01
   ------------------------------------ --- 11.5/12.6 MB 27.8 MB/s eta 0:00:01
   ---------------------------------------- 12.6/12.6 MB 25.5 MB/s eta 0:00:00
Downloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully instal

In [317]:
#namespaces
# Base IRIs used across the notebook to recognise and label terms.
# maSMP_ns: tested against URLs in convert_to_link to label maSMP terms.
maSMP_ns = "https://discovery.biothings.io/view/maSMP/"
# maSMPPro_ns: used to keep only profile classes when building the index table.
maSMPPro_ns = "https://discovery.biothings.io/view/maSMPProfiles/"
# bioschemas / codemeta: URLs containing these get a "bioschemas:" / "codemeta:" label prefix.
bioschemas = "https://bioschemas.org/terms/"
codemeta = "https://w3id.org/codemeta/"
# schema_ns: base for the domainIncludes/rangeIncludes predicates and for fallback property links.
schema_ns = "http://schema.org/" #double check whether http or https is used in the source file

## Create an index page for Profiles

In [249]:
from rdflib import Graph
from rdflib import URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL

from pandas import DataFrame

In [170]:
#properties used for filtering triples
# rdfs_class: object value used to select the subjects that are declared as classes.
rdfs_class = URIRef("http://www.w3.org/2000/01/rdf-schema#Class")
# schema.org predicates built on top of schema_ns (which must already be defined).
schema_domain = URIRef(schema_ns + "domainIncludes")
schema_range = URIRef(schema_ns + "rangeIncludes")

# URL of the maSMP profiles JSON-LD file (v2.1.0); it is both parsed as a graph
# and read as plain JSON further down in the notebook.
origin = "https://raw.githubusercontent.com/zbmed-semtec/maSMPs/refs/heads/main/schema/maSMP_schema_v2/v2.1.0/profiles/maSMP_profiles_v2.jsonld"

In [171]:
#load graph
# Parse the profiles file referenced by `origin` into a fresh rdflib graph.
# No explicit format is passed to parse().
g = Graph()
# Last expression of the cell, so the notebook displays the value returned by parse().
g.parse(origin)

<Graph identifier=N6423be45d1214934bcfc4836985c87d4 (<class 'rdflib.graph.Graph'>)>

In [12]:
#Create a generic table listing all the profiles and their descriptions

# Rows are collected in a list and joined once at the end instead of growing a string.
table_rows = ["<table>\n" + "<tr><th>Profile</th><th>Description</th></tr>\n"]

#iterate over all classes in the maSMP profiles namespace to get type and description
# Sorting by the IRI text keeps the generated page identical between runs
# (graph iteration order is not something to rely on).
for s in sorted(g.subjects(object=rdfs_class, unique=True), key=str) :
    if maSMPPro_ns not in s :
        continue
    type_name = str(s).split('/')[-1]
    # g.value returns None when the class has no rdfs:comment; fall back to an
    # empty cell rather than failing on str + None.
    description = g.value(subject=s, predicate=RDFS.comment)
    description = "" if description is None else str(description)
    # Page names drop the trailing "Profile" (all profile names end with that suffix).
    page_name = type_name[:-len("Profile")]
    table_rows.append("<tr><td><a href='./" + page_name + "'>" + type_name + "</a></td><td>" + description + "</td></tr>\n\n")

table_rows.append("</table>\n")
table = "".join(table_rows)

In [13]:
# Create the Profiles page
# Introductory markdown/HTML placed above the generated table.
# NOTE(review): this page is written to docs/Profiles/index.md, so the relative
# link "./Types/index.md" resolves inside docs/Profiles/ -- confirm whether it
# should point to "../Types/index.md" instead.
intro_text = """
<h1>maSMP Profiles</h1>\n
maSMP profiles are recommendations of use corresponding to [maSMP](./Types/index.md) and [schema.org](https://schema.org) types and properties. 
Recommendations relate to the cardinality (one, many) and marginality (minimum, recommended, optional). 
The json.schema corresponding to the maSMP profiles can be found at the [maSMP DDE namespace](https://discovery.biothings.io/ns/maSMPProfiles){:target="_blank"}.
\n\n
To avoid conflict/confusion between types and profiles, all the profile names end with the suffix 'Profile'
\n\n
"""

# Write with an explicit UTF-8 encoding: without it open() uses the platform
# locale encoding (e.g. a Windows code page), which can fail or garble any
# non-ASCII character coming from the profile descriptions.
with open("../docs/Profiles/index.md", 'w', encoding="utf-8") as file:
    file.write(intro_text)
    file.write(table)

## Create individual pages, one per profile

In [124]:
import urllib.request, json
import re

In [318]:
def_value = maSMP_ns
def convert_to_link(url, maSMP_ns=def_value, label=None, md=False) :
    """Render a term reference as an HTML anchor, a markdown link or a plain label.

    Parameters:
        url: the reference to render; converted with str() first, so a URIRef works too.
        maSMP_ns: namespace text; a non-local URL containing it is rendered as a
            plain "maSMP:<label>" string without any link.
        label: text to display; when None it is the last '/'-separated segment of
            the URL, prefixed with "bioschemas:" or "codemeta:" for those namespaces.
        md: when True produce markdown link syntax, otherwise an HTML anchor.

    Returns:
        A string. References starting with "maSMP:" link to "../../Types/<name>"
        (same tab); other links open in a new tab.
    """
    target = str(url) #just in case it is not plain text but a URIRef
    is_local = target.startswith("maSMP:")
    target = target.replace("maSMP:", "../../Types/")

    if label is None:
        # default label: last path segment, namespace-prefixed where applicable
        label = target.split('/')[-1]
        if bioschemas in target :
            label = "bioschemas:" + label
        elif codemeta in target :
            label = "codemeta:" + label

    if is_local :
        label = "maSMP:" + label
        if md :
            return "[{}]({})".format(label, target)
        return "<a href='{}'>{}</a>".format(target, label)

    if maSMP_ns in target :
        # full maSMP IRIs are shown as a prefixed label only
        return "maSMP:" + label
    if md :
        return "[{}]({})".format(label, target) + "{:target='_blank'}"
    return "<a href='{}' target='_blank'>{}</a>".format(target, label)

print(convert_to_link("maSMP:SoftwareRunAction"))

<a href='../../Types/SoftwareRunAction'>maSMP:SoftwareRunAction</a>


In [178]:
def clean_text(text) :
    """Turn every [[Word]] marker in `text` into a schema.org anchor.

    Each marker becomes an HTML link to https://schema.org/Word that opens in
    a new tab; text without markers is returned unchanged.
    """
    def _as_anchor(match) :
        # group 2 is the word between the double brackets
        term = match.group(2)
        return "<a href='https://schema.org/" + term + "' target='_blank'>" + term + "</a>"

    return re.sub(r"(\[\[)(\w+)(\]\])", _as_anchor, text)

print(clean_text("this is a [[Thing]] in md"))

this is a <a href='https://schema.org/Thing' target='_blank'>Thing</a> in md


In [177]:
def column_to_table(value, pos) :
    """Wrap `value` in a <td> cell, opening or closing the row when needed.

    pos == "first" prepends <tr>, pos == "last" appends </tr>; any other
    position yields just the cell.
    """
    cell = "<td>" + value + "</td>"
    if pos == "first" :
        return "<tr>" + cell
    if pos == "last" :
        return cell + "</tr>"
    return cell

In [179]:
def df_to_table(df) :
    """Render a properties DataFrame as an HTML table string.

    The table starts with a fixed header row (Property, Expected Type,
    Description, Cardinality) followed by one <tr> per DataFrame row. Every cell
    goes through clean_text (schema.org [[Word]] markers become links) and
    column_to_table (cell/row tags), both defined elsewhere in this notebook.

    Differences from a plain DataFrame.to_string() dump:
      * an empty DataFrame yields a header-only table instead of embedding the
        "Empty DataFrame ..." text pandas prints for empty frames;
      * the caller's DataFrame is left untouched;
      * rows carry no alignment padding, one row per line;
      * non-string cell values are converted with str() before formatting.

    Parameters:
        df: DataFrame whose columns are emitted left to right as table cells.

    Returns:
        The HTML table as a string, ending with a newline.
    """
    table = "<table>\n" + "<tr><td>Property</td><td>Expected Type</td><td>Description</td><td>Cardinality</td></tr>\n"

    last_col = df.shape[1] - 1
    rows = []
    # name=None gives plain tuples, index=False leaves the index out of the cells
    for record in df.itertuples(index=False, name=None) :
        cells = []
        for col, value in enumerate(record) :
            if col == 0 :
                pos = "first"
            elif col == last_col :
                pos = "last"
            else :
                pos = "middle"
            cells.append(column_to_table(clean_text(str(value)), pos))
        rows.append("".join(cells) + "\n")

    table += "".join(rows)
    table += "</table>\n"

    return table

In [320]:
def_schemaorg = schema_ns
def find_expected_type(g_types, item, schema_ns=def_schemaorg) :
    """Look up the expected type(s) of a property by its rdfs:label.

    Parameters:
        g_types: rdflib graph holding the type/property definitions.
        item: property label to search for (matched as a plain Literal).
        schema_ns: namespace used to build the fallback link when the property
            is not defined in g_types.

    Returns:
        A (expected, link) tuple of strings. When a subject with that label
        exists, `expected` is its rangeIncludes values rendered by
        convert_to_link and joined with " or ", and `link` is the rendered
        subject. Otherwise both refer to schema_ns + item, with `expected`
        pointing the reader to schema.org for the range.
    """
    for s in g_types.subjects(RDFS.label, Literal(item), unique=True) :
        #labels are unique, only one subject will be found
        range_links = " or ".join(convert_to_link(o) for o in g_types.objects(s, schema_range))
        return range_links, convert_to_link(s)

    #in case the property was not explicitly defined -- schema.org prop
    schema_link = convert_to_link(schema_ns + item)
    return "See range for " + schema_link + " in schema.org", schema_link
#todo: load schema.org graph and look for the ranges for the missing ones        

In [321]:
def get_prop_list(g_types, elem, level) :
    """Build the table rows for the properties of one marginality level.

    Parameters:
        g_types: rdflib graph with the type/property definitions, forwarded to
            find_expected_type.
        elem: one profile node from the profiles JSON; its "$validation" entry
            holds the per-level property lists and a "properties" map.
        level: key of the list to read inside "$validation"
            (e.g. "required", "recommended", "optional").

    Returns:
        A list of dicts with the keys "Property", "Expected Type",
        "Description" and "Cardinality", one per listed property. A profile
        without "$validation" or without a list for `level` yields [] so that
        an empty section does not abort the whole generation.

    Raises:
        KeyError: when a listed property has no entry in "properties" or the
            entry lacks "description" / "owl:cardinality" (a malformed profile
            should surface rather than be silently rendered).
    """
    validation = elem.get("$validation") or {}
    lst = []
    for item in validation.get(level) or [] :
        expected, item_link = find_expected_type(g_types, item)
        spec = validation["properties"][item]
        lst.append({
            "Property": item_link,
            "Expected Type": expected,
            "Description": spec["description"],
            "Cardinality": spec["owl:cardinality"]
        })
    return lst

In [322]:
lst_min = []
lst_rec= []
lst_opt = []

# Read the profiles file as plain JSON (the "$validation" blocks are needed as-is)
with urllib.request.urlopen(origin) as prof_file:
    prof_data = json.load(prof_file)

#load types graph (rather than profiles)
g_types = Graph()
g_types.parse("https://raw.githubusercontent.com/zbmed-semtec/maSMPs/refs/heads/main/schema/maSMP_schema_v2/maSMP_v2.jsonld")

# Section headings, in the order they are written to each profile page
section_titles = ("Minimum", "Recommended", "Optional")

for elem in prof_data["@graph"] :
    # Nodes without "@type" are skipped instead of raising KeyError; a list-valued
    # "@type" that contains rdfs:Class is accepted as well.
    elem_type = elem.get("@type")
    is_class = elem_type == "rdfs:Class" or (isinstance(elem_type, list) and "rdfs:Class" in elem_type)
    if not is_class :
        continue

    label = elem["rdfs:label"]
    intro_text = "<h1>" + label + "</h1>\n\n"
    intro_text += label + " is profile for " + convert_to_link(elem["rdfs:subClassOf"]["@id"]) + "\n\n"
    intro_text += elem["rdfs:comment"] + "\n\n"

    lst_min = get_prop_list(g_types, elem, "required")
    lst_rec = get_prop_list(g_types, elem, "recommended")
    lst_opt = get_prop_list(g_types, elem, "optional")

    sections = [
        "## " + title + " properties\n\n" + df_to_table(DataFrame(rows))
        for title, rows in zip(section_titles, (lst_min, lst_rec, lst_opt))
    ]

    # The page name drops the trailing "Profile" (all profile names end with that suffix).
    # Explicit UTF-8: the platform default encoding (e.g. a Windows code page) can
    # fail or garble non-ASCII characters in labels and descriptions.
    with open("../docs/Profiles/" + label[:-len("Profile")] + ".md", 'w', encoding="utf-8") as file:
        file.write(intro_text)
        # sections are separated by one blank line
        file.write("\n".join(sections))
            
        