# Imports

In [1]:
import pandas as pd
import seaborn as sns
from owlready2 import get_ontology, Thing
import matplotlib.pyplot as plt
import copy
import numpy as np
from adjustText import adjust_text
from googleapiclient.discovery import build
from datetime import timedelta
import isodate





# Getting Data from CoDex

In [2]:
tutorials_url = "../docs/supplementary/supplementary_table_4.tsv"
tools_url = "../docs/supplementary/supplementary_table_2.tsv"
workflows_url = "../docs/supplementary/supplementary_table_3.tsv"

tutorials_df = pd.read_csv(tutorials_url, sep="\t")
tutorials_df = tutorials_df.drop_duplicates(keep="last")

tools_df = pd.read_csv(tools_url, sep="\t")

workflows_df = pd.read_csv(workflows_url, sep="\t")


# Set up your API key

In [None]:
API_KEY = ''  # Replace with your  youtube API key
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Tutorials Statistics

## Create new rows, one row for each EDAM term per tutorial

#### EDAM Operations

In [4]:
inflated_tutorials_operation = []
for idx, row in tutorials_df.iterrows():
    terms = row["EDAM operation"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["EDAM operation single"] = val
            inflated_tutorials_operation.append(row_c)
    else:
        row_c = copy.copy(row_d)
        row_c["EDAM operation single"] = "Undefined"
        inflated_tutorials_operation.append(row_c)

df_inflated_tutorials_operation = pd.DataFrame(inflated_tutorials_operation)

df_inflated_tutorials_operation.head()

Unnamed: 0,Topic,Title,Link,EDAM topic,EDAM operation,Creation,Last modification,Version,Tutorial,Slides,...,Tools,Servers with precise tool versions,Servers with tool but different versions,Feedback number,Feedback mean note,Visitors,Page views,Visit duration,Video views,EDAM operation single
0,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Sequence visualisation
1,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Read mapping
2,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Genome visualisation
3,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Sequence alignment
4,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Mapping


#### EDAM topics

In [5]:
inflated_tutorials_topics = []
for idx, row in tutorials_df.iterrows():
    terms = row["EDAM topic"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["EDAM topic single"] = val
            inflated_tutorials_topics.append(row_c)
    else:
        row_c = copy.copy(row_d)
        row_c["EDAM topic single"] = "Undefined"
        inflated_tutorials_topics.append(row_c)

df_inflated_tutorials_topics = pd.DataFrame(inflated_tutorials_topics)

df_inflated_tutorials_topics.head()

Unnamed: 0,Topic,Title,Link,EDAM topic,EDAM operation,Creation,Last modification,Version,Tutorial,Slides,...,Tools,Servers with precise tool versions,Servers with tool but different versions,Feedback number,Feedback mean note,Visitors,Page views,Visit duration,Video views,EDAM topic single
0,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Sequence assembly
1,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Genomics
2,Assembly,Making sense of a newly assembled genome,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Sequence visualisation, Read mapping, Genome v...",2018-06-14,2024-06-24,29,True,False,...,"addValue, jbrowse, collapse_dataset, tp_grep_t...",,,0,,6561,12796,2.966667,0,Microbiology
3,Assembly,Unicycler Assembly,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Genome assembly, Aggregation, Sequencing quali...",2017-10-11,2025-05-15,24,True,True,...,"unicycler, quast, prokka, fastqc, multiqc","UseGalaxy.eu, UseGalaxy.org (Main), UseGalaxy....","GalaxyTrakr, UseGalaxy.be, UseGalaxy.cz, UseGa...",0,,2600473,7071486,4.65,0,Sequence assembly
4,Assembly,Unicycler Assembly,https://training.galaxyproject.org//topics/ass...,"Sequence assembly, Genomics, Microbiology","Genome assembly, Aggregation, Sequencing quali...",2017-10-11,2025-05-15,24,True,True,...,"unicycler, quast, prokka, fastqc, multiqc","UseGalaxy.eu, UseGalaxy.org (Main), UseGalaxy....","GalaxyTrakr, UseGalaxy.be, UseGalaxy.cz, UseGa...",0,,2600473,7071486,4.65,0,Genomics


## Statistics

In [7]:
unique_tutorials = tutorials_df["Title"].nunique()
df2 = tutorials_df.drop_duplicates(subset='Title', keep="last")
unique_operations = df_inflated_tutorials_operation["EDAM operation single"].nunique()
unique_topics = df_inflated_tutorials_topics["EDAM topic single"].nunique()

# Count videos and workflows per tutorial with additional checks for string type
df2["Video Count"] = df2["Video"].apply(lambda x: len(x.split(',')) if isinstance(x, str) and x.strip() else 0)
df2["Workflow Count"] = df2["Workflows"].apply(lambda x: len(x.split(',')) if isinstance(x, str) and x.strip() else 0)

# Calculate tutorials with associated videos and workflows (count of True values in each column)
tutorials_with_videos = df2["Video"].sum()
tutorials_with_workflows = df2["Workflows"].sum()

# Calculate the total number of videos and workflows
total_videos = df2[df2["Video"] == True].shape[0]  # Total number of True values in Video column
total_workflows = df2[df2["Workflows"] == True].shape[0]  # Total number of True values in Workflows column

# Calculate percentages
percentage_with_videos = (tutorials_with_videos / unique_tutorials) * 100
percentage_with_workflows = (tutorials_with_workflows / unique_tutorials) * 100

# Print statistics
print(f"Total number of unique tutorials: {unique_tutorials}")
print(f"Total number of unique EDAM operations: {unique_operations}")
print(f"Total number of unique EDAM topics: {unique_topics}")
print(f"Total number of unique tutorials: {unique_tutorials}")
print(f"Total number of tutorials with associated videos: {tutorials_with_videos}")
print(f"Percentage of tutorials with videos: {percentage_with_videos:.2f}%")
print(f"Total number of videos: {total_videos}")

print(f"Total number of tutorials with associated workflows: {tutorials_with_workflows}")
print(f"Percentage of tutorials with workflows: {percentage_with_workflows:.2f}%")
print(f"Total number of workflows: {total_workflows}")


Total number of unique tutorials: 39
Total number of unique EDAM operations: 86
Total number of unique EDAM topics: 26
Total number of unique tutorials: 39
Total number of tutorials with associated videos: 18
Percentage of tutorials with videos: 46.15%
Total number of videos: 18
Total number of tutorials with associated workflows: 33
Percentage of tutorials with workflows: 84.62%
Total number of workflows: 33


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["Video Count"] = df2["Video"].apply(lambda x: len(x.split(',')) if isinstance(x, str) and x.strip() else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["Workflow Count"] = df2["Workflows"].apply(lambda x: len(x.split(',')) if isinstance(x, str) and x.strip() else 0)


## Tutorials Videos on Youtube

In [8]:
# Function to get video duration using YouTube API
def get_video_duration_api(video_url):
    try:
        # Extract the video ID from the URL
        video_id = video_url.split('v=')[-1].split('&')[0] if 'youtube' in video_url else video_url.split('/')[-1]
        request = youtube.videos().list(part="contentDetails", id=video_id)
        response = request.execute()

        # Extract duration in ISO 8601 format
        items = response.get("items")
        if items:
            duration = items[0]["contentDetails"]["duration"]
            # Convert ISO 8601 to total seconds
            return parse_iso_duration(duration)
        else:
            print(f"No details found for video: {video_url}")
            return 0
    except Exception as e:
        print(f"Error processing video {video_url}: {e}")
        return 0

# Function to parse ISO 8601 duration to seconds

def parse_iso_duration(duration):
    duration_obj = isodate.parse_duration(duration)
    return int(duration_obj.total_seconds())

# Load the table containing YouTube links
file_path = 'tutorialsYoutube/tutorialsVideos.csv'
df = pd.read_csv(file_path)

# Column containing YouTube links
column_name = "Video_Link"

# Apply function to get video durations
print("Fetching video durations...\n")
df['Duration_seconds'] = df[column_name].apply(get_video_duration_api)

# Calculate total duration
total_duration_seconds = int(df['Duration_seconds'].sum())  # Convert to Python int
total_duration = timedelta(seconds=total_duration_seconds)

# Display the total duration
print("\nSummary of Video Durations:")
print(f"Total number of unique videos: {len(df)}")
print(f"Total duration (hh:mm:ss): {total_duration}")

# Save updated table
output_file = "tutorialsYoutube/table_with_api_video_durations.csv"
df.to_csv(output_file, index=False)
print(f"Updated table saved to {output_file}")

Fetching video durations...


Summary of Video Durations:
Total number of unique videos: 17
Total duration (hh:mm:ss): 15:42:09
Updated table saved to tutorialsYoutube/table_with_api_video_durations.csv


# Tools Statistics


## Create new rows, one row for each EDAM term per tool

### EDAM Operation

In [9]:
inflated_tools = []

for idx, row in tools_df.iterrows():
    terms = row["EDAM reduced operations"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["EDAM operation single"] = val
            inflated_tools.append(row_c)
    else:
            row_c = copy.copy(row_d)
            row_c["EDAM operation single"] = "Undefined"
            inflated_tools.append(row_c)

df_inflated_tools_operation = pd.DataFrame(inflated_tools)

### EDAM Topics

In [10]:
inflated_tools = []

for idx, row in tools_df.iterrows():
    terms = row["EDAM reduced topics"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["EDAM topic single"] = val
            inflated_tools.append(row_c)
    else:
            row_c = copy.copy(row_d)
            row_c["EDAM topic single"] = "Undefined"
            inflated_tools.append(row_c)

df_inflated_tools_topics = pd.DataFrame(inflated_tools)

### Tools not suites

In [11]:
inflated_tools = []

for idx, row in tools_df.iterrows():
    terms = row["Tool IDs"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["tools per suite"] = val
            inflated_tools.append(row_c)
    else:
            row_c = copy.copy(row_d)
            row_c["tools per suite"] = "Undefined"
            inflated_tools.append(row_c)

df_inflated_tools_per_suite = pd.DataFrame(inflated_tools)
df_inflated_tools_per_suite

Unnamed: 0,Suite ID,Tool IDs,Description,Suite first commit date,Homepage,Suite version,Suite conda package,Latest suite conda package version,Suite version status,ToolShed categories,...,Suite runs (usegalaxy.fr),Suite runs on main servers,Suite runs (last 5 years) on main servers,Suite users on main servers,Suite users (last 5 years) on main servers,Related Workflows,Related Tutorials,To keep,Deprecated,tools per suite
0,AMRFinderPlus,amrfinderplus,"""AMRFinderPlus is designed to find acquired an...",2023-05-12,https://github.com/ncbi/amr,3.12.8,ncbi-amrfinderplus,4.0.23,To update,Sequence Analysis,...,774,19178,19178,865,865,https://usegalaxy.eu/published/workflow?id=1ce...,,True,False,amrfinderplus
1,ISEScan,isescan,"""ISEScan is a pipeline to identify IS (Inserti...",2022-09-01,https://github.com/xiezhq/ISEScan,1.7.2.3,isescan,1.7.3,To update,Sequence Analysis,...,909,69744,69744,499,499,https://dev.workflowhub.eu/workflows/1136?vers...,genome-annotation/bacterial-genome-annotation,True,False,isescan
2,abacas,abacas,Order and Orientate Contigs,2019-11-20,https://github.com/phac-nml/abacas,1.1,mummer,3.23,To update,Assembly,...,0,0,0,0,0,,,True,False,abacas
3,abricate,"abricate, abricate_list, abricate_summary",Mass screening of contigs for antiobiotic resi...,2016-07-29,https://github.com/tseemann/abricate,1.0.1,abricate,1.0.1,Up-to-date,Sequence Analysis,...,11594,1566054,1534207,8442,8126,https://dev.workflowhub.eu/workflows/1091?vers...,microbiome/pathogen-detection-from-nanopore-fo...,True,False,abricate
4,abricate,"abricate, abricate_list, abricate_summary",Mass screening of contigs for antiobiotic resi...,2016-07-29,https://github.com/tseemann/abricate,1.0.1,abricate,1.0.1,Up-to-date,Sequence Analysis,...,11594,1566054,1534207,8442,8126,https://dev.workflowhub.eu/workflows/1091?vers...,microbiome/pathogen-detection-from-nanopore-fo...,True,False,abricate_list
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,vsearch,"vsearch_alignment, vsearch_chimera_detection, ...","VSEARCH including searching, clustering, chime...",2014-11-30,https://github.com/torognes/vsearch,2.8.3,vsearch,2.30.0,To update,Sequence Analysis,...,31,76088,30608,868,562,https://usegalaxy.eu/published/workflow?id=401...,,True,False,vsearch_masking
883,vsearch,"vsearch_alignment, vsearch_chimera_detection, ...","VSEARCH including searching, clustering, chime...",2014-11-30,https://github.com/torognes/vsearch,2.8.3,vsearch,2.30.0,To update,Sequence Analysis,...,31,76088,30608,868,562,https://usegalaxy.eu/published/workflow?id=401...,,True,False,vsearch_search
884,vsearch,"vsearch_alignment, vsearch_chimera_detection, ...","VSEARCH including searching, clustering, chime...",2014-11-30,https://github.com/torognes/vsearch,2.8.3,vsearch,2.30.0,To update,Sequence Analysis,...,31,76088,30608,868,562,https://usegalaxy.eu/published/workflow?id=401...,,True,False,vsearch_shuffling
885,vsearch,"vsearch_alignment, vsearch_chimera_detection, ...","VSEARCH including searching, clustering, chime...",2014-11-30,https://github.com/torognes/vsearch,2.8.3,vsearch,2.30.0,To update,Sequence Analysis,...,31,76088,30608,868,562,https://usegalaxy.eu/published/workflow?id=401...,,True,False,vsearch_sorting


## Statistics

In [12]:
# Get the total number of tools
total_tools = tools_df.shape[0]

# Calculate total tool usage over the last 5 years, over all main servers
total_tool_usage = tools_df['Suite runs (last 5 years) on main servers'].sum()

# Calculate total number of tool users over the last 5 years, over all main servers
total_tool_users = tools_df['Suite users (last 5 years) on main servers'].sum()

# Calculate the average number of tool users over the last 5 years, over all main servers
average_tool_users = tools_df['Suite users (last 5 years) on main servers'].mean()

# Find the most used tool over the last 5 years, over all main servers
most_used_tool = tools_df.loc[tools_df['Suite runs (last 5 years) on main servers'].idxmax()]

# Find the tool with the most used over the last 5 years, over all main servers
most_users_tool = tools_df.loc[tools_df['Suite users (last 5 years) on main servers'].idxmax()]

total_tools, total_tool_usage, total_tool_users, average_tool_users, most_used_tool, most_users_tool

(306,
 21450922,
 651273,
 2128.343137254902,
 Suite ID                                                                                  fastp
 Tool IDs                                                                                  fastp
 Description                                       Fast all-in-one preprocessing for FASTQ files
 Suite first commit date                                                              2018-03-07
 Homepage                                                      https://github.com/OpenGene/fastp
                                                                     ...                        
 Suite users (last 5 years) on main servers                                                23121
 Related Workflows                             https://dev.workflowhub.eu/workflows/1064?vers...
 Related Tutorials                             assembly/assembly-with-preprocessing, assembly...
 To keep                                                                         

In [13]:
unique_tools = tools_df["Suite ID"].nunique()
unique_tools_operations = df_inflated_tools_operation["EDAM operation single"].nunique()
unique_tools_topics = df_inflated_tools_topics["EDAM topic single"].nunique()
unique_tools_per_suite = df_inflated_tools_per_suite["tools per suite"].nunique()

# Print statistics
print(f"Total number of tools: {unique_tools}")
print(f"Total number of unique EDAM operations: {unique_tools_operations}")
print(f"Total number of unique EDAM topics: {unique_tools_topics}")
print(f"Total number of unique tools: {unique_tools_per_suite}")

Total number of tools: 304
Total number of unique EDAM operations: 155
Total number of unique EDAM topics: 97
Total number of unique tools: 884


# Workflows Statistics

## Create new rows, one row for each EDAM term per workflow

### EDAM Operation

In [14]:
inflated_workflows_operation = []
for idx, row in workflows_df.iterrows():
    terms = row["EDAM operations"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["EDAM operation single"] = val
            inflated_workflows_operation.append(row_c)
    else:
        row_c = copy.copy(row_d)
        row_c["EDAM operation single"] = "Undefined"
        inflated_workflows_operation.append(row_c)

df_inflated_workflows_operation = pd.DataFrame(inflated_workflows_operation)

df_inflated_workflows_operation.head()

Unnamed: 0,Name,Source,ID,Link,Creators,Tags,Creation time,Update time,Latest version,Versions,Number of steps,Tools,EDAM operations,EDAM topics,License,DOI,Projects,To keep,Deprecated,EDAM operation single
0,allele-based-pathogen-identification/main,WorkflowHub,1063,https://workflowhub.eu/workflows/1063?version=5,"Engy Nasr, Bérénice Batut, Paul Zierep",,2025-03-26,2025-03-26,5,5,23.0,"Cut1, samtools_depth, snpSift_extractFields, C...","Pairwise sequence alignment, Variant calling",,MIT,,Intergalactic Workflow Commission (IWC),True,False,Pairwise sequence alignment
1,allele-based-pathogen-identification/main,WorkflowHub,1063,https://workflowhub.eu/workflows/1063?version=5,"Engy Nasr, Bérénice Batut, Paul Zierep",,2025-03-26,2025-03-26,5,5,23.0,"Cut1, samtools_depth, snpSift_extractFields, C...","Pairwise sequence alignment, Variant calling",,MIT,,Intergalactic Workflow Commission (IWC),True,False,Variant calling
2,mgnify-amplicon-pipeline-v5-complete/main,WorkflowHub,1274,https://workflowhub.eu/workflows/1274?version=2,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,2,2,20.0,", fastq_dl, __MERGE_COLLECTION__, tp_awk_tool,...",,,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Undefined
3,mgnify-amplicon-pipeline-v5-its/main,WorkflowHub,1273,https://workflowhub.eu/workflows/1273?version=2,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,2,2,30.0,"collection_element_identifiers, , bedtools_mas...","Visualisation, Formatting, k-mer counting, Map...",,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Visualisation
4,mgnify-amplicon-pipeline-v5-its/main,WorkflowHub,1273,https://workflowhub.eu/workflows/1273?version=2,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,2,2,30.0,"collection_element_identifiers, , bedtools_mas...","Visualisation, Formatting, k-mer counting, Map...",,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Formatting


### EDAM topics

In [15]:
inflated_workflows_topics = []
for idx, row in workflows_df.iterrows():
    terms = row["EDAM topics"]
    row_d = row.to_dict()
    if isinstance(terms, str):
        vals = [t.strip() for t in terms.split(",")]
        for val in vals:
            row_c = copy.copy(row_d)
            row_c["EDAM topic single"] = val
            inflated_workflows_topics.append(row_c)
    else:
        row_c = copy.copy(row_d)
        row_c["EDAM topic single"] = "Undefined"
        inflated_workflows_topics.append(row_c)

df_inflated_workflows_topics = pd.DataFrame(inflated_workflows_topics)

df_inflated_workflows_topics.head()

Unnamed: 0,Name,Source,ID,Link,Creators,Tags,Creation time,Update time,Latest version,Versions,Number of steps,Tools,EDAM operations,EDAM topics,License,DOI,Projects,To keep,Deprecated,EDAM topic single
0,allele-based-pathogen-identification/main,WorkflowHub,1063,https://workflowhub.eu/workflows/1063?version=5,"Engy Nasr, Bérénice Batut, Paul Zierep",,2025-03-26,2025-03-26,5,5,23.0,"Cut1, samtools_depth, snpSift_extractFields, C...","Pairwise sequence alignment, Variant calling",,MIT,,Intergalactic Workflow Commission (IWC),True,False,Undefined
1,mgnify-amplicon-pipeline-v5-complete/main,WorkflowHub,1274,https://workflowhub.eu/workflows/1274?version=2,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,2,2,20.0,", fastq_dl, __MERGE_COLLECTION__, tp_awk_tool,...",,,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Undefined
2,mgnify-amplicon-pipeline-v5-its/main,WorkflowHub,1273,https://workflowhub.eu/workflows/1273?version=2,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,2,2,30.0,"collection_element_identifiers, , bedtools_mas...","Visualisation, Formatting, k-mer counting, Map...",,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Undefined
3,mgnify-amplicon-pipeline-v5-quality-control-pa...,WorkflowHub,1272,https://workflowhub.eu/workflows/1272?version=2,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,2,2,17.0,"tp_find_and_replace, fastp, cshl_fasta_formatt...","Read pre-processing, Sequence contamination fi...",,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Undefined
4,mgnify-amplicon-pipeline-v5-quality-control-si...,WorkflowHub,1271,https://workflowhub.eu/workflows/1271?version=3,"Rand Zoabi, Paul Zierep",,2025-03-26,2025-03-26,3,3,14.0,"tp_find_and_replace, cshl_fasta_formatter, fas...","Sequence composition calculation, Read pre-pro...",,Apache-2.0,,Intergalactic Workflow Commission (IWC),True,False,Undefined


## Statistics

In [16]:
unique_workflows = workflows_df["Name"].nunique()
unique_workflows_operations = df_inflated_workflows_operation["EDAM operation single"].nunique()
unique_workflows_topics = df_inflated_workflows_topics["EDAM topic single"].nunique()

# Print statistics
print(f"Total number of workflows: {unique_workflows}")
print(f"Total number of unique EDAM operations: {unique_workflows_operations}")
print(f"Total number of unique EDAM topics: {unique_workflows_topics}")

Total number of workflows: 105
Total number of unique EDAM operations: 109
Total number of unique EDAM topics: 9
