In [None]:
#DOWNLOAD ALL WORKFLOW DATA
import os
import requests
import time
import constants

# Description:
# This function is used to download all public workflows 
# from a website and each workflow will be saved in the data\\raw_galaxy_files folder 
# with the .ga extension. If the workflow has already been downloaded, 
# it will not be downloaded again.
def download(source):
    """Download all workflows listed by a Galaxy server that are not cached yet.

    The workflow list is fetched from ``https://{source}/api/workflows``. Every
    workflow whose id has no file in the output directory is then fetched from
    ``https://{source}/api/workflows/{id}/download`` and written as
    ``{id}.ga`` into a sub-folder of ``constants.RAW_GALAXY_FILES`` named after
    the source (dots replaced by underscores). Consecutive download requests
    start at least ``constants.GALAXY_API_WAIT`` seconds apart.

    Args:
        source: Host name of the Galaxy server, without scheme.

    Raises:
        requests.RequestException: If the workflow list cannot be retrieved
            (connection failure, timeout or HTTP error status).
    """
    # Without a timeout a stalled server would block the whole run forever.
    request_timeout = 60  # seconds

    out_dir = os.path.join(constants.RAW_GALAXY_FILES, source.replace(".", "_"))
    os.makedirs(out_dir, exist_ok=True)

    # Don't download workflows that are already cached (set => O(1) lookups)
    existing_ids = {os.path.splitext(name)[0] for name in os.listdir(out_dir)}

    # Get list of workflows from server
    workflow_list_url = f"https://{source}/api/workflows"
    print("Downloading workflow list from ", workflow_list_url)
    list_response = requests.get(workflow_list_url, timeout=request_timeout)
    list_response.raise_for_status()
    workflow_list = list_response.json()

    to_download = [
        workflow["id"]
        for workflow in workflow_list
        if workflow["id"] not in existing_ids
    ]
    total = len(to_download)
    print(f"Downloading {total} workflows from {source}")

    # Download with respect to the API wait time specified in constants
    prev_time = 0
    for num, workflow_id in enumerate(to_download, start=1):
        download_url = f"https://{source}/api/workflows/{workflow_id}/download"

        # Sleep once for the remaining wait time instead of polling.
        remaining = constants.GALAXY_API_WAIT - (time.time() - prev_time)
        if remaining > 0:
            time.sleep(remaining)

        prev_time = time.time()
        print(f"Downloading {num}/{total}: {workflow_id}")
        try:
            r = requests.get(download_url, timeout=request_timeout)
        except requests.RequestException as exc:
            # A single unreachable workflow must not abort the whole run.
            print(f"Error downloading {workflow_id}: {exc}")
            print("Skipping...")
            continue

        if r.status_code != 200:
            print(f"Error downloading {workflow_id}")
            print("Skipping...")
            continue

        with open(os.path.join(out_dir, f"{workflow_id}.ga"), "w") as f:
            f.write(r.text)

# Description:
#download all public workflow for each website
def run():
    """Download the workflows of every server listed in ``constants.WORKFLOW_SOURCES``.

    The sources file is read as one host name per line. Surrounding whitespace
    is stripped and blank lines are ignored, so they are never turned into a
    request against an empty or malformed host.
    """
    print("Downloading workflows from AllGalaxy sources")
    with open(constants.WORKFLOW_SOURCES) as f:
        sources = [line.strip() for line in f if line.strip()]

    for source in sources:
        download(source)

    print("Done downloading workflows")


run()

In [None]:
#PREPARE EACH WORKFLOW 
import os
import tqdm

import constants
import utils


#Description :
# This function is used to process nodes and edges for each workflow if step["type"] == "subworkflow"
def process_subworkflow(subworkflow, nodes, edges, id_map, next_id):
    """Flatten a subworkflow step into the parent node list and edge map.

    The inner steps are processed with their own node list, edge map and id
    map (numbered from 0), then merged into the parent structures by shifting
    every inner index by the parent's current ``next_id``.

    Args:
        subworkflow: Step dict with ``"subworkflow"`` (containing ``"steps"``),
            ``"id"`` and ``"input_connections"``.
        nodes: Parent node list; extended in place with the inner nodes.
        edges: Parent edge map (from-index -> list of to-indices); updated in place.
        id_map: Parent map of step id -> node index; updated in place.
        next_id: Next free node index in the parent.

    Returns:
        The tuple ``(nodes, edges, id_map, next_id)`` where ``next_id`` has
        advanced by the number of node indices consumed by the subworkflow.
    """
    offset = next_id
    inner_nodes, inner_edges, inner_ids, inner_count = [], {}, {}, 0

    inner_steps = subworkflow["subworkflow"]["steps"]
    for step_key in list(inner_steps):
        inner_nodes, inner_edges, inner_ids, inner_count = process_step(
            inner_steps[step_key], inner_nodes, inner_edges, inner_ids, inner_count
        )

    nodes.extend(inner_nodes)

    # Re-base the inner edges onto the parent's index space.
    for inner_from, inner_targets in inner_edges.items():
        for inner_to in inner_targets:
            edges.setdefault(inner_from + offset, []).append(inner_to + offset)

    # Steps that consume the subworkflow's output are wired to the last index
    # that was assigned inside the subworkflow.
    id_map[subworkflow["id"]] = offset + inner_count - 1

    # Connect the parent-level producers to the inner steps they feed.
    connections = subworkflow["input_connections"]
    for connection_key in list(connections):
        entries = connections[connection_key]
        if type(entries) is not list:
            entries = [entries]
        for entry in entries:
            producer = id_map[entry["id"]]
            consumer = entry["input_subworkflow_step_id"] + offset
            edges.setdefault(producer, []).append(consumer)

    return nodes, edges, id_map, offset + inner_count


#Description :
# This function is used to process nodes and edges for each workflow.
def process_step(step, nodes, edges, id_map, next_id):
    """Turn one workflow step into a node plus its incoming edges.

    Subworkflow steps are delegated to ``process_subworkflow``, which expands
    them recursively; no node is appended for the subworkflow step itself.

    Args:
        step: Step dict; must contain ``"type"``, ``"annotation"`` and ``"name"``,
            and for non-subworkflow steps ``"id"`` and ``"input_connections"``.
        nodes: Node list; appended to in place.
        edges: Edge map (from-index -> list of to-indices); updated in place.
        id_map: Map of step id -> node index; updated in place.
        next_id: Index the new node will receive.

    Returns:
        The tuple ``(nodes, edges, id_map, next_id)`` with ``next_id`` advanced.
    """
    step_type = step["type"]

    # Extract relevant information from step to build node
    node = {"type": step_type}
    if "tool_state" in step and step["tool_state"] is not None:
        node["tool_state"] = step["tool_state"]
    node["annotation"] = step["annotation"]
    if "label" in step and step["label"] is not None:
        node["label"] = step["label"]
    node["name"] = step["name"]

    # Subworkflows may themselves contain subworkflows, hence the recursion.
    if step_type == "subworkflow":
        return process_subworkflow(step, nodes, edges, id_map, next_id)

    node_index = next_id

    # Single step: register an edge from every producer to this node.
    connections = step["input_connections"]
    for connection_key in list(connections):
        entries = connections[connection_key]
        if type(entries) is not list:
            entries = [entries]
        for entry in entries:
            edges.setdefault(id_map[entry["id"]], []).append(node_index)

    # Input steps carry no tool information.
    if step_type not in ("data_input", "data_collection_input", "parameter_input"):
        node["tool_id"] = step["tool_id"]
        if "tool_shed_repository" in step:
            shed_repo = step["tool_shed_repository"]
            node["tool_name"] = shed_repo["name"]
            node["tool_shed"] = shed_repo["tool_shed"]

    nodes.append(node)
    id_map[step["id"]] = node_index

    return nodes, edges, id_map, next_id + 1

#Description :
# This function serves to create a dictionary containing name, 
# tags, license, node, server, and workflow ID information for each workflow.
def process_file(file_path):
    """Load one workflow file and convert it into a node/edge description.

    The workflow id is the file name up to its first dot and the server is the
    name of the directory containing the file.

    Processing is best-effort: if a step cannot be processed, an error is
    printed and the nodes/edges collected up to that point are returned.

    Args:
        file_path: Path of the workflow file, loaded with ``utils.load_json``.

    Returns:
        Dict with the keys ``name``, ``tags``, ``license`` (``None`` when the
        workflow has none), ``nodes``, ``edges``, ``server`` and ``workflow_id``.
    """
    workflow = utils.load_json(file_path)

    # Associate relevant information
    name = workflow["name"]
    tags = workflow["tags"]
    workflow_license = workflow.get("license")
    directory, file_name = os.path.split(file_path)
    workflow_id = file_name.split(".")[0]
    server = os.path.basename(directory)

    nodes = []
    edges = {}

    id_map = {}
    next_id = 0

    # Process each step to build list of nodes and edges.
    # ``current_step`` is tracked separately so the error report below can
    # never fail itself (e.g. when the failure happens before the first step).
    current_step = None
    try:
        steps = workflow["steps"]
        for step_key in list(steps):
            current_step = step_key
            step = steps[step_key]
            if isinstance(step, dict):
                current_step = step.get("id", step_key)
            nodes, edges, id_map, next_id = process_step(step, nodes, edges, id_map, next_id)
    except Exception as exc:
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        print(f"Error processing file: {file_path}  Step: {current_step}  ({type(exc).__name__}: {exc})")

    return {
        "name": name,
        "tags": tags,
        "license": workflow_license,
        "nodes": nodes,
        "edges": edges,
        "server": server,
        "workflow_id": workflow_id,
    }


#Description :
# This function is used to create json containing information 
# data for each workflow. Each workflow will have data consisting of name, 
# tags, license, node, server, and workflow ID. 
# The node contains step sequence data, 
# which consists of type, tool_state, annotation, 
# label, name, tool_id, tool_name, tool_shed information.
# edges contain the relationships between steps. A step can have one or more than one input.

def run():
    """Process every raw ``.ga`` file and write them to one ``all_galaxy.json``.

    Each file found under ``constants.RAW_GALAXY_FILES`` is converted with
    ``process_file``; the resulting list is dumped into
    ``constants.PROCESSED_WORKFLOWS_LOC`` and summary counts are printed.
    """
    print("Processing Galaxy files for AllGalaxy dataset...")
    galaxy_files = utils.get_files_of_ext(constants.RAW_GALAXY_FILES, "ga")

    workflows = []
    node_total = 0
    edge_total = 0
    for path in tqdm.tqdm(galaxy_files):
        print(path)
        processed = process_file(path)
        node_total += len(processed["nodes"])
        # An edge entry maps one source index to a list of target indices.
        edge_total += sum(len(targets) for targets in processed["edges"].values())
        workflows.append(processed)

    output_path = os.path.join(constants.PROCESSED_WORKFLOWS_LOC, "all_galaxy.json")
    utils.dump_json(workflows, output_path)

    print("Done processing.")
    print(f"Number of workflows: {len(workflows)}")
    print(f"Number of nodes: {node_total}")
    print(f"Number of edges: {edge_total}")


run()

In [None]:
#Downloading tool names and descriptions from the toolshed.g2.bx.psu.edu site
import requests
import os
import time
import tqdm
import constants
import utils


#Description :
#This function is used to download the tool list from the toolshed.g2.bx.psu.edu site.
#This function will produce information for each tool consisting of information processed, 
#name, description, and downloads. The results will be saved in json form.
def init_repositories():
    """Create or extend the repository index used by ``download_repositories``.

    For every source listed in ``constants.TOOL_SOURCES`` the repository list
    is fetched from ``https://{source}/api/repositories``. Each repository not
    yet present in ``constants.PROCESSED_REPOSITORIES`` is added under the key
    ``"{source} {repository id}"`` with ``processed=False``, its name, its
    description (only when the server provides one) and its download count.
    Existing entries are left untouched so an interrupted run can resume.

    Raises:
        requests.RequestException: If a repository list cannot be retrieved
            (connection failure, timeout or HTTP error status).
    """
    # Without a timeout a stalled server would block the run forever.
    request_timeout = 60  # seconds

    # First, we need to download the list of repositories from the Galaxy ToolShed
    if os.path.exists(constants.PROCESSED_REPOSITORIES):
        processed_repositories = utils.load_json(constants.PROCESSED_REPOSITORIES)
    else:
        processed_repositories = {}

    with open(constants.TOOL_SOURCES) as f:
        # Ignore blank lines so they do not become requests to an empty host.
        sources = [line.strip() for line in f if line.strip()]

    # Concatenate sources and prepare for download
    for source in sources:
        source_repositories_url = f"https://{source}/api/repositories"
        print("Downloading repository list from ", source_repositories_url)
        response = requests.get(source_repositories_url, timeout=request_timeout)
        response.raise_for_status()
        source_repositories = response.json()

        for repository in tqdm.tqdm(source_repositories, desc=f"Preparing {source}"):
            repo_key = f"{source} {repository['id']}"
            if repo_key in processed_repositories:
                continue

            entry = {"processed": False, "name": repository["name"]}
            if "description" in repository:
                entry["description"] = repository["description"]
            entry["downloads"] = repository["times_downloaded"]
            processed_repositories[repo_key] = entry

    utils.dump_json(processed_repositories, constants.PROCESSED_REPOSITORIES)



#Description :
# This function adds information for each tool from the json results
# produced by the init_repositories function, by adding information for tools 
# consisting of id, name, profile, and other information.
def download_repositories():
    """Fetch tool metadata for every repository not yet marked as processed.

    For each unprocessed entry of ``constants.PROCESSED_REPOSITORIES`` the
    metadata is requested from ``https://{source}/api/repositories/{id}/metadata``,
    leaving at least ``constants.GALAXY_API_WAIT`` seconds between the end of
    one request and the start of the next. The ``tools`` of the last revision
    in the response are stored on the entry (without their ``tests``), the
    entry is marked processed, and the index is written back after every
    repository so an interrupted run can resume.

    Repositories whose request fails or whose metadata is empty or an error
    message stay unprocessed and are retried on the next run.
    """
    # Without a timeout a stalled server would block the run forever.
    request_timeout = 60  # seconds

    prev_time = 0
    processed_repositories = utils.load_json(constants.PROCESSED_REPOSITORIES)
    total = len(processed_repositories)
    for position, (repo_key, repository) in enumerate(processed_repositories.items(), start=1):
        if repository["processed"]:
            continue

        print("Processing repository {}/{}: {}".format(position, total, repository["name"]))

        source, actual_id = repo_key.split(" ")
        metadata_url = f"https://{source}/api/repositories/{actual_id}/metadata"

        # Sleep once for the remaining wait time instead of polling.
        remaining = constants.GALAXY_API_WAIT - (time.time() - prev_time)
        if remaining > 0:
            time.sleep(remaining)

        try:
            metadata = requests.get(metadata_url, timeout=request_timeout).json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure or non-JSON body: skip this one, keep the job alive.
            print(f"Could not fetch metadata ({exc}), skipping...")
            continue
        finally:
            prev_time = time.time()

        if len(metadata) == 0 or "err_msg" in metadata:
            print("No metadata found, skipping...")
            continue

        # Get description(s) from latest version of repository.
        # NOTE(review): "latest" is the last value of the response mapping --
        # confirm the server returns revisions in chronological order.
        latest_revision = list(metadata.values())[-1]
        if "tools" in latest_revision:
            repository["tools"] = latest_revision["tools"]
            for tool in repository["tools"]:
                # Test definitions are not needed downstream; drop them.
                tool.pop("tests", None)
        repository["processed"] = True
        utils.dump_json(processed_repositories, constants.PROCESSED_REPOSITORIES)

def run():
    """Build the repository index, then download the tool metadata for it."""
    print("Downloading tool names and descriptions from Toolshed sources")
    for stage in (init_repositories, download_repositories):
        stage()
    print("Done downloading tool descriptions")

In [None]:
#CREATE TOOL LIST
import tqdm
import constants
import utils

#Description :
#This function will retrieve information from processed repositories for each tool consisting of tool descriptions and download information.
def collect_tools_from_repos():
    """Build a tool table from the processed repositories.

    Only repositories that are marked processed and have a ``tools`` list are
    used. A tool's description is its name followed by its own description;
    when the repository contains exactly one tool, the repository description
    is appended after ``". "``. Slashes in the tool id are replaced by
    underscores. When several repositories provide the same tool id, the
    entry from the repository with the most downloads wins (the first one
    seen wins ties).

    Returns:
        Dict mapping the processed tool id to
        ``{"description": str, "downloads": repository download count}``.
    """
    repositories = utils.load_json(constants.PROCESSED_REPOSITORIES)

    tools = {}

    for repository in tqdm.tqdm(repositories.values(), desc="Retrieving descriptions"):
        if not repository["processed"] or "tools" not in repository:
            continue

        repo_tools = repository["tools"]
        # The repository description is optional in the index (it is only
        # stored when the server supplied one), so it must not be indexed directly.
        repo_description = repository.get("description") or ""
        # If there is only one tool in the repository, the repository
        # description often describes that tool, as seen from data.
        single_tool = len(repo_tools) == 1

        # Obtain descriptions for all tools from processed repository
        for tool in repo_tools:
            tool_description = tool.get("description") or ""

            desc = tool["name"]
            if tool_description:
                desc += " " + tool_description
            if single_tool:
                desc += ". " + repo_description
            desc = desc.strip()

            # Some tools have a slash in their id, but it is not a guid at this point
            processed_id = tool["id"].replace("/", "_")

            # Some tools have same name, take description of one with most downloads
            known = tools.get(processed_id)
            if known is not None and known["downloads"] >= repository["downloads"]:
                continue

            tools[processed_id] = {"description": desc, "downloads": repository["downloads"]}

    return tools

#Description :
#This function will add information if information for a tool with a certain ID is not available in the toolshed repository.
def add_extra_tools(tools):
    """Add workflow nodes that are missing from the tool table.

    Not all tools in workflows are present in Toolshed repositories. Every
    node of every processed workflow file whose id (``utils.get_node_id``) is
    not yet a key of ``tools`` is added with the node name as description and
    a download count of 0.

    Args:
        tools: Tool table to extend; modified in place.

    Returns:
        The same ``tools`` dict.
    """
    for path in utils.get_files_of_ext(constants.PROCESSED_WORKFLOWS_LOC, "json"):
        for workflow in tqdm.tqdm(utils.load_json(path), desc="Adding extras"):
            for node in workflow["nodes"]:
                node_key = utils.get_node_id(node)
                if node_key not in tools:
                    tools[node_key] = { "description": node["name"], "downloads": 0 }

    return tools

#Description :
#this function will remove some punctuation and also reduce all words to lowercase

def preprocess_descriptions(tools):
    """Normalize every tool description in place.

    The characters ``( ) , : ; - _ / \\`` are replaced by spaces, runs of
    spaces are collapsed to a single space, and the text is lower-cased and
    stripped.

    Args:
        tools: Tool table whose ``"description"`` values are rewritten.

    Returns:
        The same ``tools`` dict.
    """
    # One translation pass instead of a chain of str.replace calls.
    punctuation_to_space = str.maketrans({char: " " for char in "(),:;-_/\\"})

    for tool_key in tqdm.tqdm(tools, desc="Preprocessing descriptions"):
        desc = tools[tool_key]["description"].translate(punctuation_to_space)
        # A single replace("  ", " ") leaves runs of 3+ spaces only partly
        # collapsed; dropping the empty pieces collapses runs of any length.
        desc = " ".join(piece for piece in desc.split(" ") if piece)
        tools[tool_key]["description"] = desc.lower().strip()
    return tools

def create_tool_list():
    """Build the tool table and save it to ``constants.TOOL_LIST``.

    The table is collected from the processed repositories, completed with
    tools that only occur in workflows, and its descriptions are normalized.
    """
    repo_tools = collect_tools_from_repos()
    all_tools = add_extra_tools(repo_tools)
    utils.dump_json(preprocess_descriptions(all_tools), constants.TOOL_LIST)


create_tool_list()

In [None]:
#EMBED TOOL DESCRIPTIONS
from sentence_transformers import SentenceTransformer
import constants
import utils


#Description:
#This function is used to encode the description of each tool using the PubMedBERT model and save the result

def run():
    """Embed every tool description and save the resulting array.

    Descriptions are taken from ``constants.TOOL_LIST`` in the order of its
    entries, so the saved embeddings follow that same order.
    """
    tools = utils.load_json(constants.TOOL_LIST)

    print("Loading PubMedBERT model...")
    encoder = SentenceTransformer('pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb')

    descriptions = [entry["description"] for entry in tools.values()]

    print("Embedding tool descriptions...")
    utils.dump_numpy(
        encoder.encode(descriptions, show_progress_bar=True),
        constants.DESCRIPTION_EMBEDDINGS,
    )


run()

In [None]:
# Output folder of the final model and of the hyperparameter search.
model_base_loc = os.path.join(constants.OUT_LOC, constants.MODEL_NAME)
optimize_base_loc = os.path.join(constants.OUT_LOC, f"{constants.MODEL_NAME}_optimize")

# Default configuration. The hyperparameter search overrides the tunable
# entries and writes its models under ``optimize_base_loc``.
base_config = {
    # Runtime / model selection
    "device": "cuda",
    "model_type": constants.MODEL_TYPE,
    # Tunable hyperparameters (defaults)
    "hidden_channels": 32,
    "learning_rate": 0.001,
    "l2_penalty": 0.00001,
    "step_size": 30,
    "weight_decay": 0.1,
    "emb_dropout": 0.0,
    "dropout": 0.0,
    "epochs": 100,
    "batch_size": 100,
    # Where the model is stored
    "model_path": optimize_base_loc,
    "model_name": "model.pt",
    # Evaluation cut-offs
    "top_k": constants.HITRATE_K,
    "mrr_k": constants.MRR_K,
}

In [None]:
#Prepare data for training, testing, and validation
import networkx as nx
import random
import os
import tqdm

import constants
import utils

#Description :
#perform filters on the workflow
def filter_workflows(workflows):
    """Drop workflows that are too short or too long.

    A workflow is kept when its number of nodes lies between
    ``constants.MIN_WORKFLOW_LENGTH`` and ``constants.MAX_WORKFLOW_LENGTH``,
    both bounds included.
    """
    kept = []
    for workflow in workflows:
        node_count = len(workflow["nodes"])
        if constants.MIN_WORKFLOW_LENGTH <= node_count <= constants.MAX_WORKFLOW_LENGTH:
            kept.append(workflow)
    return kept

#Description :
#This function is used to build a list of tools in the toolbox
def build_tool_list(workflows, toolbox):
    """Assign a consecutive integer id to every distinct node id in the workflows.

    Args:
        workflows: Workflow dicts with a ``"nodes"`` list.
        toolbox: Container of known tool ids; every node id must be in it.

    Returns:
        ``(tool_name_to_id, tool_id_to_type)``: node id -> integer id (in
        order of first appearance) and integer id -> node type.

    Raises:
        Exception: If a node id is not present in ``toolbox``.
    """
    tool_name_to_id = {}
    tool_id_to_type = {}

    for workflow in tqdm.tqdm(workflows, desc="Building tool list"):
        for node in workflow["nodes"]:
            node_key = utils.get_node_id(node)
            node_type = utils.get_node_type(node)

            if node_key not in toolbox:
                raise Exception(f"Tool {node_key} not found in toolbox. Please process the toolbox first.")

            if node_key in tool_name_to_id:
                continue

            # Ids are dense, so the next free id equals the number assigned so far.
            assigned_id = len(tool_name_to_id)
            tool_name_to_id[node_key] = assigned_id
            tool_id_to_type[assigned_id] = node_type

    return tool_name_to_id, tool_id_to_type

#Description :
#This function is used for Flattening tool list
def flatten_to_type(tool_name_to_id, tool_id_to_type, types=["tool"]):
    """Keep only the tools of the given types and renumber them from 0.

    Args:
        tool_name_to_id: Tool name -> integer id.
        tool_id_to_type: Integer id -> node type.
        types: Node types to keep (the default list is only read, never modified).

    Returns:
        ``(tool_name_to_id, tool_id_to_type)`` restricted to ``types``, with
        new consecutive ids assigned in the iteration order of the input.
    """
    flat_name_to_id = {}
    flat_id_to_type = {}

    for tool_name, old_id in tqdm.tqdm(tool_name_to_id.items(), desc="Flattening tool list"):
        node_type = tool_id_to_type[old_id]
        if node_type not in types:
            continue

        new_id = len(flat_name_to_id)
        flat_name_to_id[tool_name] = new_id
        flat_id_to_type[new_id] = node_type

    return flat_name_to_id, flat_id_to_type


#Description :
#This function creates a directed graph for the workflow, with nodes representing tools and edges representing connections between tools.
def build_workflow_dag(workflow, tool_name_to_id, toolbox, types=["tool"]):
    """Build a directed graph of the workflow restricted to the given node types.

    Graph nodes are keyed by their index in ``workflow["nodes"]`` and carry the
    attributes ``tool_id``, ``tool_type`` and ``embedding``. Nodes of other
    types are left out. An edge that ends in such a skipped node is replaced
    by "bypass" edges to the skipped node's direct successors of a kept type;
    only one skipped node is bridged, chains of skipped nodes are not followed.
    Edges that start at a skipped node are dropped.

    Args:
        workflow: Dict with ``"nodes"`` (list) and ``"edges"`` (mapping whose
            keys are node indices as strings and whose values are lists of
            integer node indices, as loaded from JSON).
        tool_name_to_id: Node id -> integer tool id, for the kept types.
        toolbox: Node id -> dict containing an ``"embedding"`` entry.
        types: Node types to keep (only read, never modified).

    Returns:
        The ``networkx.DiGraph`` for the workflow.
    """
    G = nx.DiGraph()
    nodes = workflow["nodes"]
    edges = workflow["edges"]

    # Map nodes with tool IDs and description embeddings
    for node_index, node in enumerate(nodes):
        tool_type = utils.get_node_type(node)
        if tool_type not in types:
            continue

        tool_name = utils.get_node_id(node)
        G.add_node(node_index, tool_id=tool_name_to_id[tool_name], tool_type=tool_type, embedding=toolbox[tool_name]["embedding"])

    # Map edges between nodes
    for from_node, to_nodes in edges.items():
        from_node_int = int(from_node) # JSON keys are strings
        # The source type does not depend on the target: check it once.
        if utils.get_node_type(nodes[from_node_int]) not in types:
            continue

        for to_node in to_nodes:
            if utils.get_node_type(nodes[to_node]) in types:
                G.add_edge(from_node_int, to_node)
                continue

            # Target is skipped: add bypass edges to its kept successors.
            # A skipped node without outgoing edges has no entry in the edge
            # map, hence .get() instead of indexing.
            for bypass_target in edges.get(str(to_node), []):
                if utils.get_node_type(nodes[bypass_target]) in types:
                    G.add_edge(from_node_int, bypass_target)

    return G


#Description :
#This function creates a directed graph for the workflow
def build_unique_workflow_dags(workflows, tool_name_to_id, toolbox, types=["tool"]):
    """Build one DAG per workflow and drop structural duplicates.

    Two workflows count as duplicates when their graphs have the same
    Weisfeiler-Lehman hash computed with the ``tool_id`` node attribute; the
    first occurrence is kept.

    Args:
        workflows: Workflow dicts as accepted by ``build_workflow_dag``.
        tool_name_to_id: Node id -> integer tool id.
        toolbox: Node id -> dict containing an ``"embedding"`` entry.
        types: Node types to keep in the graphs.

    Returns:
        List of dicts with the key ``"graph"`` plus, when present on the
        workflow, ``"name"``, ``"server"`` and ``"workflow_id"``.
    """
    # Use graph hash to filter out duplicates (set => O(1) membership test)
    seen_hashes = set()
    unique_graph_dicts = []

    for workflow in tqdm.tqdm(workflows, desc="Building workflow DAGs"):
        G = build_workflow_dag(workflow, tool_name_to_id, toolbox, types)
        # Isomorphic graphs hash identically; non-isomorphic graphs are very
        # likely, though not strictly guaranteed, to hash differently.
        graph_hash = nx.weisfeiler_lehman_graph_hash(G, node_attr="tool_id")

        # Only keep unique graphs
        if graph_hash in seen_hashes:
            continue
        seen_hashes.add(graph_hash)

        graph_dict = { "graph": G }
        # Other information that can help inspection of results, not present in EuGalaxy dataset
        for key in ("name", "server", "workflow_id"):
            if key in workflow:
                graph_dict[key] = workflow[key]
        unique_graph_dicts.append(graph_dict)
    return unique_graph_dicts


#Description :
#Collect every simple path that leads from a source node to a sink node of the graph.
def get_all_dag_paths(G):
    """Return all simple paths from every source to every sink of ``G``.

    Sources are the nodes without incoming edges, sinks the nodes without
    outgoing edges. Paths are listed source by source, then sink by sink.
    """
    start_nodes = [n for n in G.nodes if G.in_degree(n) == 0]
    end_nodes = [n for n in G.nodes if G.out_degree(n) == 0]

    return [
        path
        for start in start_nodes
        for end in end_nodes
        for path in nx.all_simple_paths(G, start, end)
    ]


#Description :
#Build the tool sequence of every source-to-sink path of a workflow graph, keeping only paths with at least 2 nodes.
def build_workflow_paths_sequence(workflow_dict):
    """Convert the source-to-sink paths of a workflow graph into tool sequences.

    Args:
        workflow_dict: Dict whose ``"graph"`` entry is the workflow DAG; its
            nodes carry the attributes ``tool_id`` and ``embedding``.

    Returns:
        One list per path with at least 2 nodes; each element is the tuple
        ``(tool_id, embedding)`` of the node at that position.
    """
    G = workflow_dict["graph"]
    # Paths shorter than 2 cannot provide both an input and a ground truth.
    return [
        [(G.nodes[node]["tool_id"], G.nodes[node]["embedding"]) for node in path]
        for path in get_all_dag_paths(G)
        if len(path) >= 2
    ]
#Description :
#to split workflow data into train data, validation data, and testing data
def split_data(workflow_dicts):
    """Randomly split the workflows into train, validation and test lists.

    The first ``constants.TRAIN_SPLIT`` share of the shuffled workflows goes
    to training, the next ``constants.VAL_SPLIT`` share to validation and the
    remainder to testing. The shuffle seed is drawn at random on every call.

    Returns:
        The tuple ``(train_data, val_data, test_data)``.
    """
    seed = random.randint(0, 1000000)
    order = list(range(len(workflow_dicts)))
    random.Random(seed).shuffle(order)

    total = len(order)
    train_end = int(total * constants.TRAIN_SPLIT)
    val_end = int(total * (constants.TRAIN_SPLIT + constants.VAL_SPLIT))

    def pick(indices):
        # Materialize the workflows for one slice of the shuffled order.
        return [workflow_dicts[index] for index in indices]

    train_data = pick(order[:train_end])
    val_data = pick(order[train_end:val_end])
    test_data = pick(order[val_end:])

    return train_data, val_data, test_data

def get_partial_graphs(G):
    """Build one training sample per node of ``G`` that has a predecessor.

    For such a node, the sample graph is the DFS tree obtained by walking the
    edges backwards from the node (re-oriented to the original direction),
    with the node attributes copied from ``G`` and the node itself removed.
    The removed node's ``tool_id`` is the label.

    Args:
        G: Workflow DAG whose nodes carry a ``tool_id`` attribute.

    Returns:
        List of dicts ``{"graph": partial graph, "y": tool_id}``.
    """
    # Reversing copies the whole graph, so do it once instead of once per node.
    reversed_graph = G.reverse()

    partial_graphs = []
    for node in G.nodes:
        # Nodes without predecessors have no context to predict from.
        if G.in_degree(node) == 0:
            continue

        # Use reverse DFS to get partial graphs
        reverse_dfs_tree = nx.dfs_tree(reversed_graph, node).reverse()
        # dfs_tree keeps only the structure; restore the node attributes.
        reverse_dfs_tree.add_nodes_from((n, G.nodes[n]) for n in reverse_dfs_tree.nodes)
        reverse_dfs_tree.remove_node(node) # This is the ground truth, which will become masked node at "recommendation position".

        partial_graphs.append({ "graph": reverse_dfs_tree, "y": G.nodes[node]["tool_id"] })

    return partial_graphs

def get_partial_paths(path_sequences):
    """Return every distinct prefix of length 2..n of the given paths.

    Two prefixes are the same when their sequences of tool ids (the first
    element of each path item) are equal; the first occurrence is kept.

    Args:
        path_sequences: Paths whose items are sequences with the tool id at
            index 0. Tool ids must be hashable.

    Returns:
        List of prefixes (slices of the input paths), in order of first appearance.
    """
    partial_paths = []
    # A set makes the duplicate check O(1) instead of scanning a list.
    seen_prefixes = set()
    for path in path_sequences:
        tool_ids = tuple(item[0] for item in path)
        for end in range(2, len(path) + 1):
            prefix_key = tool_ids[:end]
            if prefix_key in seen_prefixes:
                continue
            seen_prefixes.add(prefix_key)
            partial_paths.append(path[:end])

    return partial_paths

#Description :
#to prepare data for training, testing and validation.
def prepare(workflows_path, data_output_path):
    """Driver that turns processed workflows into train/val/test data.

    Steps: load workflows and toolbox, filter workflows by length, build the
    tool id table, build the unique workflow DAGs and their path sequences,
    split them, derive partial graphs and partial paths for every split, and
    save everything.

    Args:
        workflows_path: JSON file with the processed workflows.
        data_output_path: Folder receiving ``info.json`` and, in its ``raw``
            sub-folder, ``train_data.pickle``, ``val_data.pickle`` and
            ``test_data.pickle``.
    """
    print("Preparing data for models...")
    workflows = utils.load_json(workflows_path)
    toolbox, embedding_size = utils.load_toolbox()

    # Filter workflows and build tool list
    print(f"Length of workflows before filtering: {len(workflows)}")
    workflows = filter_workflows(workflows)
    print(f"Length of workflows after filtering: {len(workflows)}")
    tool_name_to_id, tool_id_to_type = build_tool_list(workflows, toolbox)
    tool_name_to_id, tool_id_to_type = flatten_to_type(tool_name_to_id, tool_id_to_type)
    num_tools = len(tool_name_to_id)

    # Unique DAGs, each with the tool sequences of its source-to-sink paths
    workflow_dicts = build_unique_workflow_dags(workflows, tool_name_to_id, toolbox)
    for workflow_dict in workflow_dicts:
        workflow_dict["path_sequences"] = build_workflow_paths_sequence(workflow_dict)

    # The split happens per workflow, before samples are derived from it.
    train_data, val_data, test_data = split_data(workflow_dicts)
    for split in (train_data, val_data, test_data):
        for entry in tqdm.tqdm(split, desc="Building partial graphs and paths"):
            entry["partial_graphs"] = get_partial_graphs(entry["graph"])
            entry["partial_paths"] = get_partial_paths(entry["path_sequences"])

    # Save data
    raw_output_path = os.path.join(data_output_path, "raw")
    print("Saving data...")
    outputs = (
        (train_data, "train_data.pickle"),
        (val_data, "val_data.pickle"),
        (test_data, "test_data.pickle"),
    )
    for split, file_name in outputs:
        utils.dump_pickle(split, os.path.join(raw_output_path, file_name))

    utils.dump_json(
        { "num_tools": num_tools, "tool_name_to_id": tool_name_to_id, "embedding_size": embedding_size },
        os.path.join(data_output_path, "info.json"),
    )

    print("Done processing.")

    
def prepare_data_splits(model_base_loc, optimize_base_loc):
    """Prepare the data twice: once for the final model, once for the search.

    Both runs read ``{constants.MODEL_DATA}.json`` from
    ``constants.PROCESSED_WORKFLOWS_LOC``. Each call to ``prepare`` draws its
    own random split, so the two folders do not contain the same split.

    Args:
        model_base_loc: Output folder for the final model's data.
        optimize_base_loc: Output folder for the hyperparameter search data.
    """
    ### PREPARE DATA AND SPLITS
    workflows_path = os.path.join(constants.PROCESSED_WORKFLOWS_LOC, "{}.json".format(constants.MODEL_DATA))
    # ``prepare`` is defined in this file; no ``prepare_data`` module is
    # imported here, so it must be called directly.
    for output_loc in (model_base_loc, optimize_base_loc):
        prepare(workflows_path, output_loc)


model_base_loc = os.path.join(constants.OUT_LOC, constants.MODEL_NAME)
optimize_base_loc = os.path.join(constants.OUT_LOC, "{}_optimize".format(constants.MODEL_NAME))

prepare_data_splits(model_base_loc, optimize_base_loc)

In [None]:
#OPTIMIZATION FOR HYPERPARAMETER
import hyperopt
import numpy as np
import os
import utils
import data_loader
import constants
import model

i = 0  # Number of objective evaluations so far; incremented by optimize()'s objective
#Description :
#Search for the set of hyperparameters that minimizes the loss, using hyperopt's TPE algorithm.
def optimize(config, ranges, num_evals, output_name="best_hyperparameters.json"):
    """Run a hyperopt search and save the best hyperparameters as JSON.

    Args:
        config: Base configuration; copied, its tunable entries are replaced
            by hyperopt search expressions. ``config["model_path"]`` is the
            folder the result file is written to.
        ranges: Maps each tunable name to a ``[low, high]`` pair.
        num_evals: Maximum number of objective evaluations.
        output_name: File name of the saved result.

    Returns:
        The dict returned by ``hyperopt.fmin``.
    """
    params = config.copy()

    # Define hyperparameters and ranges: (label, distribution). Quantized
    # entries use a step of 1; log-uniform entries are sampled in log space.
    search_space = (
        ("hidden_channels", "quniform"),
        ("learning_rate", "loguniform"),
        ("l2_penalty", "loguniform"),
        ("step_size", "quniform"),
        ("emb_dropout", "uniform"),
        ("dropout", "uniform"),
        ("batch_size", "quniform"),
        ("epochs", "quniform"),
    )
    for label, distribution in search_space:
        low = ranges[label][0]
        high = ranges[label][1]
        if distribution == "quniform":
            params[label] = hyperopt.hp.quniform(label, low, high, 1)
        elif distribution == "loguniform":
            params[label] = hyperopt.hp.loguniform(label, np.log(low), np.log(high))
        else:
            params[label] = hyperopt.hp.uniform(label, low, high)

    def objective(params):
        global i
        i += 1
        model_config = params.copy()

        # Convert some parameters back to int
        for int_key in ("hidden_channels", "step_size", "batch_size", "epochs"):
            model_config[int_key] = int(model_config[int_key])

        print("Hyperparameter Optimize Iteration: {}".format(i))
        print("Hyperparameters: {}".format(model_config))

        model_config = data_loader.add_data_config(model_config)
        _, _, _, _, losses = model.train_model(model_config, use_tqdm=False)
        # The score of a trial is the lowest loss it reached.
        return min(losses)

    trials = hyperopt.Trials()
    best = hyperopt.fmin(
        objective,
        params,
        algo=hyperopt.tpe.suggest,
        max_evals=num_evals,
        trials=trials,
    )
    utils.dump_json(best, os.path.join(config["model_path"], output_name))

    return best
def optimize_hyperparameters(base_config):
    """Run the hyperparameter search with the fixed search ranges below.

    The number of evaluations is ``constants.NUM_OPTIMIZE_ITERATIONS`` and the
    result is saved as ``best_hyperparameters.json`` in
    ``base_config["model_path"]``.

    Args:
        base_config: Base configuration passed on to ``optimize``.
    """
    ## OPTIMIZE HYPERPARAMETERS OVER constants.NUM_OPTIMIZE_ITERATIONS ITERATIONS
    ranges = {
        "hidden_channels": [16, 64],
        "learning_rate": [0.0001, 0.01],
        "l2_penalty": [0.00001, 0.01],
        "step_size": [10, 30],
        "emb_dropout": [0.0, 0.5],
        "dropout": [0.0, 0.5],
        "batch_size": [32, 128],
        "epochs": [50, 100],
    }

    num_evals = constants.NUM_OPTIMIZE_ITERATIONS
    # ``optimize`` is defined in this file; no ``optimize_hyper`` module is
    # imported here, so it must be called directly.
    optimize(base_config, ranges, num_evals, "best_hyperparameters.json")
optimize_hyperparameters(base_config)

In [None]:
#MODEL TRAINING

#Description :
#This function is used to train the model using the best hyperparameters obtained from the previous stage.
def train_model(base_config, model_base_loc, optimize_base_loc):
    """Train one model with the best hyperparameters and save it.

    Args:
        base_config: Base configuration; copied, not modified.
        model_base_loc: Folder of the final model (becomes ``config["model_path"]``).
        optimize_base_loc: Folder of the hyperparameter search, which holds
            ``best_hyperparameters.json``.
    """
    ## TRAIN ONE MODEL
    config = base_config.copy()
    # The search writes its result into its own model path, so read it from
    # there rather than from the current working directory.
    best_params = utils.load_json(os.path.join(optimize_base_loc, "best_hyperparameters.json"))

    for key, value in best_params.items():
        config[key] = value

    # These values may be stored as floats; the model needs integers.
    config["hidden_channels"] = int(config["hidden_channels"])
    config["step_size"] = int(config["step_size"])
    config["batch_size"] = int(config["batch_size"])
    config["epochs"] = int(config["epochs"])
    config["model_path"] = model_base_loc
    config = data_loader.add_data_config(config)

    best_model, best_epoch, best_acc, val_accs, losses = model.train_model(config)
    print("Best Epoch: {}, Best Val Acc: {}".format(best_epoch, best_acc))
    model.save_model(best_model, config)
train_model(base_config, model_base_loc, optimize_base_loc)

In [None]:
##MODEL Testing

#Description :
#This function is used to test the model; it computes and prints the metrics reported as HR1, HR3, and MRR.
def test_model(base_config, model_base_loc, optimize_base_loc):
    """Evaluate the saved model on the ``test_data`` dataset and print metrics.

    Args:
        base_config: Default configuration dict. It is copied, not mutated.
        model_base_loc: Directory used as ``config["model_path"]``; the test
            dataset and the saved model are loaded relative to it.
        optimize_base_loc: Directory that holds ``best_hyperparameters.json``.
    """
    config = base_config.copy()
    tuned = utils.load_json(os.path.join(optimize_base_loc, "best_hyperparameters.json"))
    config.update(tuned)

    # Size-like settings must be integers before they reach the loaders.
    for int_key in ("hidden_channels", "step_size", "batch_size", "epochs"):
        config[int_key] = int(config[int_key])

    config["model_path"] = model_base_loc
    config = data_loader.add_data_config(config)

    # Graph models read graph-shaped samples; every other model type reads paths.
    dataset_cls = (
        data_loader.GraphDataset
        if config["model_type"] == "graph"
        else data_loader.PathDataset
    )
    test_dataset = dataset_cls(root=config["model_path"], name="test_data")

    test_loader = torch_geometric.loader.DataLoader(
        test_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
    )
    best_model = model.load_model(config)
    acc, top_k, mrr_k, by_length = model.evaluate_model(best_model, test_loader, config)

    # Metrics are printed as percentages.
    print("HR1: {}, HR3: {}, MRR: {}".format(acc*100, top_k*100, mrr_k*100))
# Evaluate the saved model using the shared configuration and output locations.
test_model(base_config, model_base_loc, optimize_base_loc)

In [None]:
#CREATE RECOMMENDATION
import os
import pickle
import pandas as pd
import networkx as nx
import numpy as np
from fuzzywuzzy import fuzz

import torch
import torch_geometric

import constants
import model
import utils
import data_loader


#Description :
#This function converts a path of (tool id, embedding) pairs into a torch_geometric batch, associating each embedding with its node.
def path_to_data(path):
    """Convert a tool path into a single-graph ``torch_geometric`` batch.

    Args:
        path: Sequence of nodes where ``node[0]`` is the tool id and
            ``node[1]`` is an iterable embedding for that tool.

    Returns:
        A ``torch_geometric.data.Batch`` built from one ``Data`` object. Its
        ``x`` holds one row ``[tool_id, *embedding]`` per distinct tool id and
        its ``edge_index`` links every step of the path to the following one.
    """
    steps = [node[0] for node in path]
    id_to_embedding = {node[0]: node[1] for node in path}

    # factorize assigns each distinct tool id one index, so a tool that
    # appears several times in the path maps onto a single graph node.
    indices, unique_ids = pd.factorize(steps)
    senders, receivers = indices[:-1], indices[1:]

    # Associate embedding with tool instance in sequence
    node_rows = [[tool_id, *id_to_embedding[tool_id]] for tool_id in unique_ids]
    full_x = torch.tensor(node_rows, dtype=torch.float)

    # Stack into one ndarray first: building a tensor straight from a list of
    # ndarrays takes PyTorch's slow path and emits a warning.
    edge_index = torch.tensor(np.stack([senders, receivers]), dtype=torch.long)

    graph = torch_geometric.data.Data(x=full_x, edge_index=edge_index)
    return torch_geometric.data.Batch.from_data_list([graph])

#Description :
#This function is used to evaluate model on some input data
def predict(model, data):
    """Return ``model(data)`` computed in eval mode without gradient tracking."""
    model.eval()
    with torch.no_grad():
        output = model(data)
    return output

#Description :
#Check whether each element in the sequence entered by the user is a known tool name.
#If it is not recognized, show the constants.CANDIDATES_TO_SHOW most similar tool names (by fuzzy match) as suggestions.
def validate_sequence(input_sequence, all_tool_names):
    """Validate user-supplied tool names against the known tool names.

    Every name is stripped of surrounding whitespace. A name that is not known
    triggers a list of the closest known names (by ``fuzz.ratio``). When
    ``constants.MATCH_INTERACTIVELY`` is set the user picks a replacement from
    that list; otherwise validation is abandoned.

    Args:
        input_sequence: Iterable of tool-name strings entered by the user.
        all_tool_names: Collection of known tool names.

    Returns:
        The validated list of tool names in input order, or ``[]`` when the
        input is empty, the user cancels with ``0``, or an unknown name is
        found in non-interactive mode.
    """
    sequence = [tool.strip() for tool in input_sequence]

    if not sequence:
        print("No sequence provided")
        return []

    # Build the lookup once so each membership test is O(1).
    known_names = set(all_tool_names)

    true_sequence = []
    for tool in sequence:
        if tool in known_names:
            true_sequence.append(tool)
            continue

        # Rank every known name by similarity to the unknown one, best first.
        scores = {name: fuzz.ratio(name, tool) for name in all_tool_names}
        ranked = sorted(scores, key=scores.get, reverse=True)
        # Slicing keeps this safe when fewer names exist than the limit.
        shown = ranked[:constants.CANDIDATES_TO_SHOW]

        print(f"Tool {tool} not found in workflow, showing {len(shown)} potential matches:")
        for position, name in enumerate(shown, start=1):
            print(f"{position}. {name}")

        if not constants.MATCH_INTERACTIVELY:
            return []

        print("0 to exit")
        print("Please select a tool to use:")

        while True:
            # Only a failed integer parse counts as bad input, so that
            # interrupts and end-of-input are not swallowed.
            try:
                selection = int(input())
            except ValueError:
                print("Invalid input, please try again")
                continue

            if selection == 0:
                return []
            if 1 <= selection <= len(shown):
                true_sequence.append(shown[selection - 1])
                break
            print("Invalid input, please try again")

    return true_sequence

#Description:
#Convert the tool-name sequence entered by the user into (tool id, embedding) pairs.
def convert_sequence(sequence, tool_name_to_id, toolbox):
    """Map each tool name to a ``(tool id, embedding)`` tuple.

    The embedding is read from ``toolbox[name]["embedding"]`` and the id from
    ``tool_name_to_id[name]``; the output keeps the order of ``sequence``.
    """
    def _to_pair(tool_name):
        # Look up the embedding first, then the id, and pair them id-first.
        vector = toolbox[tool_name]["embedding"]
        return (tool_name_to_id[tool_name], vector)

    return [_to_pair(tool_name) for tool_name in sequence]

def get_recommendation(base_config, model_base_loc, optimize_base_loc, input_sequence):
    """Recommend follow-up tools for a sequence of tool names.

    Args:
        base_config: Default configuration dict. It is copied, not mutated.
        model_base_loc: Directory that holds ``info.json`` and is used as
            ``config["model_path"]`` when loading the model.
        optimize_base_loc: Directory that holds ``best_hyperparameters.json``.
        input_sequence: Tool names entered by the user.

    Returns:
        A list of ``constants.TOOLS_TO_RECOMMEND`` tool names taken from the
        highest-scoring model outputs, or ``[]`` when the input sequence could
        not be validated.
    """
    info = utils.load_json(os.path.join(model_base_loc, "info.json"))
    tool_name_to_id = info["tool_name_to_id"]
    sequence = validate_sequence(input_sequence, list(tool_name_to_id.keys()))

    # A falsy result (empty or missing) means there is nothing to predict from.
    if not sequence:
        return []

    config = base_config.copy()
    config.update(utils.load_json(os.path.join(optimize_base_loc, "best_hyperparameters.json")))

    # Size-like settings must be integers before they reach the model code.
    for key in ("hidden_channels", "step_size", "batch_size", "epochs"):
        config[key] = int(config[key])

    # Honour the configured device and fall back to the previous hard-coded
    # "cuda" when the configuration does not name one.
    device = config.get("device", "cuda")

    config["model_path"] = model_base_loc
    config = data_loader.add_data_config(config)

    toolbox, _ = utils.load_toolbox()

    id_sequence = convert_sequence(sequence, tool_name_to_id, toolbox)
    data = path_to_data(id_sequence)
    loaded_model = model.load_model(config)

    logits = predict(loaded_model.to(device), data.to(device))
    # topk(...)[1] holds the indices of the best scores; they are used as tool ids.
    recommendations = torch.topk(logits, constants.TOOLS_TO_RECOMMEND)[1].view(-1).cpu().detach().numpy()

    tool_id_to_name = {tool_id: name for name, tool_id in tool_name_to_id.items()}
    return [tool_id_to_name[tool_id] for tool_id in recommendations]

# Output directories: one for the trained model, one for the hyperparameter search.
model_base_loc = os.path.join(constants.OUT_LOC, constants.MODEL_NAME)
optimize_base_loc = os.path.join(
    constants.OUT_LOC,
    "{}_optimize".format(constants.MODEL_NAME),
)

# Default settings shared by the optimisation, training, testing and
# recommendation steps; tuned values loaded later override these entries.
base_config = dict(
    device="cuda",
    model_type=constants.MODEL_TYPE,
    hidden_channels=32,
    learning_rate=0.001,
    l2_penalty=0.00001,
    step_size=30,
    weight_decay=0.1,
    emb_dropout=0.0,
    dropout=0.0,
    epochs=100,
    batch_size=100,
    model_path=optimize_base_loc,
    model_name="model.pt",
    top_k=constants.HITRATE_K,
    mrr_k=constants.MRR_K,
)


# Example run: ask for recommendations that follow a three-tool sequence.
recommended_tools = get_recommendation(
    base_config,
    model_base_loc,
    optimize_base_loc,
    ["umi_tools_extract", "rna_star", "bamFilter"],
)