## Figures for "Assessing bioinformatics software annotations: bio.tools case-study"

### Figure 1: Direct and inferred EDAM topic annotations for Qiime2 and Vsearch, two metagenomics tools

In [17]:
# Python 3.13.1
# Libraries: pandas for the annotation tables, IPython for inline SVG rendering,
# collections for the occurrence counters, pygraphviz for building and drawing the graphs.
import pandas
import IPython
import collections
import pygraphviz as pgv
# Load the bzip2-compressed, tab-separated tables from the results/ directory.
# dfTool: all tools with their label (tool, toolLabel).
dfTool = pandas.read_csv("results/dfTool.tsv.bz2", sep="\t")
# dfToolTopicTransitive: transitive topic annotations (tool, topic, topicLabel).
dfToolTopicTransitive = pandas.read_csv("results/dfToolTopicTransitive.tsv.bz2", sep="\t")
# dfToolOperationTransitive: transitive operation annotations (tool, operation, operationLabel).
dfToolOperationTransitive = pandas.read_csv("results/dfToolOperationTransitive.tsv.bz2", sep="\t")
# SPARQL-style PREFIX declarations.
# NOTE(review): `prefixes` is not referenced by any cell shown in this notebook -- confirm it is still needed.
prefixes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

PREFIX bt: <https://bio.tools/>
PREFIX biotools: <https://bio.tools/ontology/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX bsct: <http://bioschemas.org/types/>
PREFIX edam: <http://edamontology.org/>
PREFIX sc: <http://schema.org/>
PREFIX schema: <https://schema.org/>

"""

# Base URIs: biotoolsURI is prepended to a tool identifier to obtain its URI,
# edamURI is stripped from EDAM concept URIs to obtain short node identifiers.
biotoolsURI = "https://bio.tools/"
biotoolsOntologyURI = "https://bio.tools/ontology/"
edamURI = "http://edamontology.org/"

In [6]:
# Preview of the tool -> operation annotation table (the notebook renders a truncated view).
# NOTE(review): the 'Unnamed: 0' column in the output suggests the TSV was written with
# its index; reading it with index_col=0 would drop that column -- confirm.
dfToolOperationTransitive

Unnamed: 0,tool,operation,operationLabel
0,https://bio.tools/pmirkb,http://edamontology.org/operation_2428,Validation
1,https://bio.tools/pmirkb,http://edamontology.org/operation_0004,Operation
2,https://bio.tools/pmirkb,http://edamontology.org/operation_0484,SNP detection
3,https://bio.tools/pmirkb,http://edamontology.org/operation_3227,Variant calling
4,https://bio.tools/pmirkb,http://edamontology.org/operation_2478,Nucleic acid sequence analysis
...,...,...,...
249653,https://bio.tools/gbc,http://edamontology.org/operation_2480,Structure analysis
249654,https://bio.tools/gbc,http://edamontology.org/operation_0244,Simulation analysis
249655,https://bio.tools/gbc,http://edamontology.org/operation_0250,Protein property calculation
249656,https://bio.tools/gbc,http://edamontology.org/operation_3438,Calculation


In [13]:
def get_tool_url(tool_name: str) -> str:
    """Normalise a tool reference into its full bio.tools URL.

    A value that already begins with the bio.tools base URL is handed back
    untouched; anything else is treated as a bare tool identifier and the
    base URL is prepended to it.
    """
    already_qualified = tool_name.startswith("https://bio.tools/")
    return tool_name if already_qualified else f"https://bio.tools/{tool_name}"


def getToolTopics(tool, transitive=True):
    """
    Returns a list of (topic, topicLabel) tuples for the given tool.

    Keyword arguments:
    tool -- the URI of the tool, as found in the 'tool' column of dfToolTopicTransitive
    transitive -- also consider the ancestors of the topics directly associated to the tool (default: True).
                  The parameter exists because getToolsCommonTopics passes it. Only the transitive
                  table is loaded in this notebook, so transitive=False raises NotImplementedError
                  rather than silently returning inferred topics as if they were direct ones.
    """
    if not transitive:
        raise NotImplementedError(
            "Direct (non-transitive) topic annotations are not loaded; "
            "only dfToolTopicTransitive is available."
        )
    filtered = dfToolTopicTransitive[dfToolTopicTransitive['tool'] == tool]
    return list(filtered[['topic', 'topicLabel']].itertuples(index=False, name=None))


def getToolOperations(tool, transitive=True):
    """
    Returns a list of (operation, operationLabel) tuples for the given tool.

    Keyword arguments:
    tool -- the URI of the tool, as found in the 'tool' column of dfToolOperationTransitive
    transitive -- also consider the ancestors of the operations directly associated to the tool (default: True).
                  The parameter exists because getToolsCommonOperations passes it. Only the transitive
                  table is loaded in this notebook, so transitive=False raises NotImplementedError
                  rather than silently returning inferred operations as if they were direct ones.
    """
    if not transitive:
        raise NotImplementedError(
            "Direct (non-transitive) operation annotations are not loaded; "
            "only dfToolOperationTransitive is available."
        )
    filtered = dfToolOperationTransitive[dfToolOperationTransitive['tool'] == tool]
    return list(filtered[['operation', 'operationLabel']].itertuples(index=False, name=None))


def getToolsCommonTopics(listToolURI, transitive=False):
    """Return the (URI, label) tuples of the topics shared by every tool of a list.

    Keyword arguments:
    listToolURI -- list of the URIs for the tools
    transitive -- forwarded to getToolTopics: also consider the ancestors of the topics directly associated to a tool (default: False)

    An empty list of tools yields an empty list.
    """
    if len(listToolURI) == 0:
        return []
    # Start from the topics of the first tool, then narrow down with each other tool.
    sharedTopics = set(getToolTopics(listToolURI[0], transitive=transitive))
    for otherToolURI in listToolURI[1:]:
        sharedTopics = sharedTopics & set(getToolTopics(otherToolURI, transitive=transitive))
    return list(sharedTopics)


def getToolsCommonOperations(listToolURI, transitive=False):
    """Return the (URI, label) tuples of the operations shared by every tool of a list.

    Keyword arguments:
    listToolURI -- list of the URIs for the tools
    transitive -- forwarded to getToolOperations: also consider the ancestors of the operations directly associated to a tool (default: False)

    An empty list of tools yields an empty list.
    """
    if len(listToolURI) == 0:
        return []
    # Start from the operations of the first tool, then narrow down with each other tool.
    sharedOperations = set(getToolOperations(listToolURI[0], transitive=transitive))
    for otherToolURI in listToolURI[1:]:
        sharedOperations = sharedOperations & set(getToolOperations(otherToolURI, transitive=transitive))
    return list(sharedOperations)


def colorGraphNodesAccordingToScore(graph, dictTopicScore, dictOperationScore, color="red"):
    """Set the fill color of the scored nodes of a graph, in place.

    Only nodes carrying a 'nodeType' attribute equal to "Topic" or "Operation" are
    considered, and only when their identifier is a key of the matching score
    dictionary. Every other node is left unchanged. Each score is scaled against
    the maximum of its own dictionary (topics and operations are scaled separately).

    Keyword arguments:
    graph -- the graph to be modified
    dictTopicScore -- a dictionary {nodeIdent -> score} for topic nodes
    dictOperationScore -- a dictionary {nodeIdent -> score} for operation nodes
    color -- name of the gradient color, passed to getScoreColorRGB, which handles
             "red", "green", "blue", "orange", "yellow", "pink" and "grey" (default: "red")
    """
    # One entry per handled node type: its score table and the maximum used for scaling.
    scoresByNodeType = {
        "Topic": (dictTopicScore, max(dictTopicScore.values(), default=0)),
        "Operation": (dictOperationScore, max(dictOperationScore.values(), default=0)),
    }
    for node in graph.nodes_iter():
        if "nodeType" not in node.attr.keys():
            continue
        nodeType = node.attr['nodeType']
        if nodeType not in scoresByNodeType:
            continue
        scores, maxScore = scoresByNodeType[nodeType]
        if node in scores.keys():
            graph.get_node(node).attr['fillcolor'] = getScoreColorRGB(scores[node], maxScore, color=color)


def addToolsAndAnnotationsToGraph(listToolURI, graph=None, showTopics=True, showOperations=True, highlightDirectAnnotations=False, highlightIntersection=False):
    """Add several tools and their EDAM annotations to a graph and return that graph.

    Keyword arguments:
    listToolURI -- list of the URIs for the tools
    graph -- the graph in which the tools and their annotations are added. A new directed, bottom-to-top graph is created if the value is None. (default: None)
    showTopics -- should the topics annotating the tools be considered (default: True)
    showOperations -- should the operations annotating the tools be considered (default: True)
    highlightDirectAnnotations -- should the topics or operations annotated directly be highlighted (default: False)
    highlightIntersection -- should the topics or operations common to all the tools be outlined in red (default: False)
    """
    # NOTE(review): addToolAndAnnotationsToGraph is not defined in the cells shown in this
    # notebook (the figure cell's recorded output is a NameError); it must be defined before calling.
    if graph is None:
        graph = pgv.AGraph(directed=True, rankdir="BT")
    for toolURI in listToolURI:
        addToolAndAnnotationsToGraph(toolURI, graph=graph, showTopics=showTopics, showOperations=showOperations, highlightDirectAnnotations=highlightDirectAnnotations)
    if not highlightIntersection:
        return graph
    # Collect the concepts shared by all the tools (ancestors included), per displayed kind.
    sharedConcepts = []
    if showTopics:
        sharedConcepts.extend(getToolsCommonTopics(listToolURI, transitive=True))
    if showOperations:
        sharedConcepts.extend(getToolsCommonOperations(listToolURI, transitive=True))
    for (conceptURI, conceptLabel) in sharedConcepts:
        # Node identifiers are the concept URIs without the EDAM prefix.
        conceptIdent = conceptURI.replace(edamURI, "")
        graph.get_node(conceptIdent).attr['color'] = "red"
    return graph


def getScoreColorRGB(scoreValue, scoreMaxValue, color="red"):
    """Return the RGB color (in hex, "#rrggbb") associated to a score, varying from white
    (0 score) to a target color at the maximum score.

    Keyword arguments:
    scoreValue -- the score to consider (expected to be between 0 and scoreMaxValue)
    scoreMaxValue -- the maximum value for the score (positive number, >= scoreValue)
    color -- name of the color for the gradient. Possible values are:
             "red", "green", "blue", "orange", "yellow", "pink", "grey".
             Any other name yields white ("#ffffff"). (default: "red")

    Scores outside [0, scoreMaxValue] are clamped to the nearest end of the gradient,
    so the result is always a well-formed "#rrggbb" string.
    """
    # Floor the denominator at 1 to avoid division by 0 (maxima below 1 are treated as 1).
    scoreMaxValue = max(1, scoreMaxValue)
    # Clamp: without it, an out-of-range score gives channel values outside 0..255
    # and therefore a malformed hex string.
    fraction = min(1.0, max(0.0, scoreValue / scoreMaxValue))

    # Target color reached at the maximum score.
    targetByColor = {
        "red": (255, 0, 0),
        "green": (0, 255, 0),
        "blue": (0, 0, 255),
        "orange": (255, 165, 0),
        "yellow": (255, 255, 0),
        "pink": (255, 192, 203),  # Light pink (alternative: hot pink (255,105,180))
        "grey": (128, 128, 128),
    }
    if color not in targetByColor:
        return "#ffffff"
    target = targetByColor[color]

    # Interpolate each channel from white (255) to the target value.
    r, g, b = (int(255 - fraction * (255 - channel)) for channel in target)

    return "#{:02x}{:02x}{:02x}".format(r, g, b)

In [19]:
# Tools compared in Figure 1 (bio.tools identifiers, without the URI prefix).
tools = ["qiime2","vsearch"]
nbTools = len(dfTool)
print("Nb tools: {}".format(nbTools))

# Format of the image files written to results/.
fileFormat = "png"

listToolURI = [biotoolsURI+x for x in tools]
# Topics only (showOperations=False); the topics common to all the tools are outlined.
# NOTE(review): addToolsAndAnnotationsToGraph calls addToolAndAnnotationsToGraph, which is not
# defined in the cells shown here -- this cell's recorded output is the resulting NameError.
commonGraph = addToolsAndAnnotationsToGraph(listToolURI, graph=None, showTopics=True, showOperations=False, highlightDirectAnnotations=False, highlightIntersection=True)
display(IPython.core.display.SVG(commonGraph.draw(prog='dot',format='svg')))
# NOTE(review): the file name says "_operations" although the graph shows topics only;
# left unchanged in case the file is referenced elsewhere -- confirm the intended name.
fileName = "-".join(sorted(tools)) + "_operations"
commonGraph.draw(path='results/{}.{}'.format(fileName, fileFormat), prog='dot',format=fileFormat)

Nb tools: 29993


NameError: name 'addToolAndAnnotationsToGraph' is not defined

In [18]:


# Color the concepts by the number of selected tools annotated with them:
# one occurrence is counted per (tool, concept) row of the transitive tables,
# and concepts are keyed by their URI without the EDAM prefix.
dictTopicNbOccurrences = collections.defaultdict(int)
dictOperationsNbOccurrences = collections.defaultdict(int)
for currentTool in tools:
    currentToolURI = biotoolsURI+currentTool
    currentToolTopics = dfToolTopicTransitive[dfToolTopicTransitive['tool'] == currentToolURI]['topic'].to_list()
    for currentTopic in currentToolTopics:
        dictTopicNbOccurrences[currentTopic.replace(edamURI, "")] += 1
    currentToolOperations = dfToolOperationTransitive[dfToolOperationTransitive['tool'] == currentToolURI]['operation'].to_list()
    for currentOperation in currentToolOperations:
        dictOperationsNbOccurrences[currentOperation.replace(edamURI, "")] += 1

# Recolor the graph built by the previous figure cell, display it and save it.
colorGraphNodesAccordingToScore(commonGraph, dictTopicNbOccurrences, dictOperationsNbOccurrences, color="orange")
display(IPython.core.display.SVG(commonGraph.draw(prog='dot',format='svg')))
# NOTE(review): there is no separator between the tool names and "Metagenomics" in the file name -- confirm it is intended.
fileName = "-".join(sorted(tools)) + "Metagenomics_topics_nbannotations_colors"
commonGraph.draw(path='results/{}.{}'.format(fileName, fileFormat), prog='dot',format=fileFormat)

NameError: name 'commonGraph' is not defined