In [1]:
import logging
import glob
import os
import getpass
from pathlib import Path

from Pegasus.api import *

In [2]:
# Emit INFO-level log records from here on.
logging.basicConfig(level=logging.INFO)

# --- Working Directory Setup --------------------------------------------------
# Workflow runs and scratch data are kept under ~/workflows.
WORK_DIR = Path.home() / "workflows"
WORK_DIR.mkdir(exist_ok=True)

# Absolute path of the directory this notebook is executed from.
TOP_DIR = Path().resolve()

# --- Properties ---------------------------------------------------------------
# (key, value) pairs, applied to the Properties object in the order listed.
_PROPERTY_SETTINGS = (
    ("pegasus.data.configuration", "condorio"),
    # Provide a full kickstart record, including the environment, even for
    # successful jobs.
    ("pegasus.gridstart.arguments", "-f"),
    # Limit the number of idle jobs for large workflows.
    ("dagman.maxidle", "1600"),
    # Help Pegasus developers by sharing performance data (optional).
    ("pegasus.monitord.encoding", "json"),
    (
        "pegasus.catalog.workflow.amqp.url",
        "amqp://friend:donatedata@msgs.pegasus.isi.edu:5672/prod/workflows",
    ),
)

props = Properties()
for _key, _value in _PROPERTY_SETTINGS:
    props[_key] = _value

# write properties file to ./pegasus.properties
props.write()


In [3]:
# --- Sites --------------------------------------------------------------------
sc = SiteCatalog()

# Directories on the submit machine that the "local" site exposes.
scratch_path = WORK_DIR / "scratch"
output_path = TOP_DIR / "outputs"

local_shared_scratch = Directory(
    directory_type=Directory.SHARED_SCRATCH,
    path=scratch_path,
).add_file_servers(
    FileServer(url=f"file://{scratch_path}", operation_type=Operation.ALL)
)

local_storage = Directory(
    directory_type=Directory.LOCAL_STORAGE,
    path=output_path,
).add_file_servers(
    FileServer(url=f"file://{output_path}", operation_type=Operation.ALL)
)

# local site (submit machine): scratch + storage directories, and the PATH of
# the current process exported as an env profile.
local_site = (
    Site(name="local", arch=Arch.X86_64)
    .add_directories(local_shared_scratch)
    .add_directories(local_storage)
    .add_env(PATH=os.environ["PATH"])
)
sc.add_sites(local_site)

# condorpool (execution site): condor-style site with per-job resource requests.
condorpool_site = (
    Site(name="condorpool", arch=Arch.X86_64, os_type=OS.LINUX)
    .add_pegasus_profile(style="condor")
    .add_condor_profile(
        universe="vanilla",
        request_cpus=3,
        request_memory="3 GB",
        request_disk="10000000",
    )
)
sc.add_sites(condorpool_site)

# write SiteCatalog to ./sites.yml
sc.write()

In [4]:
# --- Transformations ----------------------------------------------------------
# The proteinfold.sh wrapper is registered at the "local" site and marked as
# stageable. `is_stageable` must be a real boolean: the string "True" only
# worked because any non-empty string is truthy (so would "False").
# clusters_size=10 sets the Pegasus clustering profile; presumably it bounds
# how many of these jobs are grouped when clustering is requested at plan
# time -- TODO confirm against the Pegasus profile docs.
proteinfold = Transformation(
    name="proteinfold",
    site="local",
    pfn=TOP_DIR / "bin/proteinfold.sh",
    is_stageable=True,
    arch=Arch.X86_64,
).add_pegasus_profile(clusters_size=10)

tc = TransformationCatalog()
tc.add_transformations(proteinfold)

# write TransformationCatalog to ./transformations.yml
tc.write()

In [5]:
# --- Replicas -----------------------------------------------------------------
def _sorted_entry_names(directory, prefix=""):
    """Return the names of the entries in `directory`, sorted alphabetically.

    Only entries whose name starts with `prefix` are kept (every entry when
    `prefix` is empty). Path.iterdir() yields entries in arbitrary order, so
    the names are sorted to keep list positions (e.g. element [0]) and the
    resulting job order stable from one run to the next.
    """
    return sorted(
        entry.name for entry in directory.iterdir() if entry.name.startswith(prefix)
    )


# Logical files, named after the entries found in each source directory.
exec_file = [File(name) for name in _sorted_entry_names(TOP_DIR / "bin", "AbinitioRelax")]

input_files = [File(name) for name in _sorted_entry_names(TOP_DIR / "inputs")]

db_files = [File(name) for name in _sorted_entry_names(TOP_DIR / "database")]

rc = ReplicaCatalog()

# Map every logical file name to its physical location on the submit machine.
for f in input_files:
    rc.add_replica(site="local", lfn=f, pfn=TOP_DIR / "inputs" / f.lfn)

for f in exec_file:
    rc.add_replica(site="local", lfn=f, pfn=TOP_DIR / "bin" / f.lfn)

for f in db_files:
    rc.add_replica(site="local", lfn=f, pfn=TOP_DIR / "database" / f.lfn)

# write ReplicaCatalog to replicas.yml
rc.write()

In [6]:
# --- Workflow -----------------------------------------------------------------
# One proteinfold job is created per input archive.
wf = Workflow(name="protein-folding-workflow")

# Suffix stripped from an input's logical name to obtain the base name used
# for the per-job argument and output file names.
ARCHIVE_SUFFIX = ".tar.gz"

# Every job stages exec_file[0] and db_files[0]; fail with a clear message
# instead of a bare IndexError from inside the loop when either list is empty.
if input_files and not (exec_file and db_files):
    raise FileNotFoundError(
        "Cannot build proteinfold jobs: expected at least one 'AbinitioRelax*' "
        "file in bin/ and at least one file in database/."
    )

for f in input_files:
    # Strip only a trailing ".tar.gz". str.replace() would also remove the
    # text from the middle of a name.
    lfn = f.lfn
    filename = lfn[: -len(ARCHIVE_SUFFIX)] if lfn.endswith(ARCHIVE_SUFFIX) else lfn
    out_file = File(filename + "_silent.out")

    proteinfold_job = (
        Job(proteinfold)
        .add_args(
            filename,
            "-database ./database",
            "-in:file:fasta", f"./{filename}.fasta",
            "-in:file:frag3", f"./{filename}-03_05.200_v1_3",
            "-in:file:frag9", f"./{filename}-09_05.200_v1_3",
            "-in:file:native", f"./{filename}.pdb",
            "-abinitio:relax",
            "-nstruct", "1",
            "-out:file:silent", out_file,
            "-use_filters", "true",
            "-psipred_ss2", f"./{filename}.psipred_ss2",
            "-abinitio::increase_cycles", "10",
            # This flag used to be passed twice with the same value; once is enough.
            "-abinitio::rg_reweight", "0.5",
            "-abinitio::rsd_wt_helix", "0.5",
            "-abinitio::rsd_wt_loop", "0.5",
            "-relax::fast",
        )
        # NOTE(review): only the first executable and the first database file
        # are attached to each job -- confirm that is intended when either
        # directory holds more than one file.
        .add_inputs(exec_file[0], db_files[0], f)
        .add_outputs(out_file)
    )
    wf.add_jobs(proteinfold_job)

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000001, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000002, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000003, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000004, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000005, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000006, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000007, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000008, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000009, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000080, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000081, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000082, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000083, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000084, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000085, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000086, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000087, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000088, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000159, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000160, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000161, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000162, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000163, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000164, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000165, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000166, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000167, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000238, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000239, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000240, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000241, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000242, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000243, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000244, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000245, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000246, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000317, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000318, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000319, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000320, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000321, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000322, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000323, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000324, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000325, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000396, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000397, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000398, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000399, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000400, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000401, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000402, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000403, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000404, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000475, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000476, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000477, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000478, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000479, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000480, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000481, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000482, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000483, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000554, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000555, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000556, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000557, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000558, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000559, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000560, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000561, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000562, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000633, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000634, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000635, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000636, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000637, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000638, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000639, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000640, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000641, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000712, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000713, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000714, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000715, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000716, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000717, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000718, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000719, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000720, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000791, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000792, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000793, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000794, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000795, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000796, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000797, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000798, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000799, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000870, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000871, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000872, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000873, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000874, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000875, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000876, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000877, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000878, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000949, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000950, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000951, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000952, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000953, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000954, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000955, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000956, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id=ID0000957, transformation=proteinfold)
INFO:Pegasus.api.workflow:protein-folding-workflow added Job(_id

In [7]:
# Planner options, gathered in one place so they are easy to scan and adjust.
plan_options = {
    "dir": WORK_DIR / "runs",
    "sites": ["condorpool"],
    "staging_sites": {"condorpool": "local"},
    "output_sites": ["local"],
    "cluster": ["horizontal"],
    "submit": True,
}

# plan and run the workflow
wf.plan(**plan_options)


INFO:Pegasus.api.workflow:inferring protein-folding-workflow dependencies
INFO:Pegasus.api.workflow:workflow protein-folding-workflow with 1000 jobs generated and written to workflow.yml

################
# pegasus-plan #
################
2023.05.05 08:50:39.473 UTC:
2023.05.05 08:50:39.479 UTC:   -----------------------------------------------------------------------
2023.05.05 08:50:39.484 UTC:   File for submitting this DAG to HTCondor           : protein-folding-workflow-0.dag.condor.sub
2023.05.05 08:50:39.489 UTC:   Log of DAGMan debugging messages                 : protein-folding-workflow-0.dag.dagman.out
2023.05.05 08:50:39.494 UTC:   Log of HTCondor library output                     : protein-folding-workflow-0.dag.lib.out
2023.05.05 08:50:39.500 UTC:   Log of HTCondor library error messages             : protein-folding-workflow-0.dag.lib.err
2023.05.05 08:50:39.505 UTC:   Log of the life of condor_dagman itself          : protein-folding-workflow-0.dag.dagman.log
2023.05.0

<Pegasus.api.workflow.Workflow at 0x7f9251ea6eb8>