# Addons to monitor cluster

In [1]:
from pprint import pprint
import os
import sys

# Name of the cluster that the cells below create or connect to.
your_cluster_name = "mustafa8"

# Path to the private key file used to access the cluster.
private_key = "/home/mustafa/keys/interns_oregon_key.pem"

# Whether the cfncluster should be deleted after the job is done.
delete_cfncluster = False

print("variables set")

variables set


In [2]:
# Make the cirrus_ngs sources importable relative to this notebook's directory.
sys.path.append("../../src/cirrus_ngs")
from cfnCluster import CFNClusterManager, ConnectionManager
from util import AddonsManager

# Create the cluster named in the configuration cell and keep the master
# node's IP address returned by the call.
master_ip_address = CFNClusterManager.create_cfn_cluster(cluster_name=your_cluster_name)

# Open the SSH connection to the master node; `ssh_client` is passed to the
# AddonsManager calls in the rest of the notebook.
ssh_client = ConnectionManager.connect_master(
    hostname=master_ip_address,
    username="ec2-user",
    private_key_file=private_key,
)

cluster mustafa8 does exist.
Status: CREATE_COMPLETE
Status: CREATE_COMPLETE
MasterServer: RUNNING
MasterServer: RUNNING
Output:"MasterPublicIP"="34.218.52.146"
Output:"MasterPrivateIP"="172.31.47.153"
Output:"GangliaPublicURL"="http://34.218.52.146/ganglia/"
Output:"GangliaPrivateURL"="http://172.31.47.153/ganglia/"

connecting
connected


### Check Scripts on Cluster

This section contains cells that can be used to check which scripts are currently on the cluster being used.

#### Get all supported pipelines

In [3]:
# Run this cell before any other cell in this section: they all read `scripts`.

scripts = AddonsManager.get_scripts_dict(ssh_client)
print()
pipeline_names = AddonsManager.get_all_pipeline_names(scripts)
print("Supported Pipelines:", pipeline_names)


Supported Pipelines: ['RNASeq', 'DNASeq', 'ChiPSeq', 'SmallRNASeq']


#### Check which workflows are in a given pipeline

In [4]:
# Pipeline to inspect: a supported pipeline name, or "all".
target_pipeline = "all"

# Print the header without a newline so the pretty-printed result follows it.
header = "Supported Workflows in {} Pipeline(s): ".format(target_pipeline)
print(header, end="")
workflows = AddonsManager.get_workflows_in_pipeline(scripts, target_pipeline)
pprint(workflows, indent=2)

Supported Workflows in all Pipeline(s): 
{ 'ChiPSeq': ['homer'],
  'DNASeq': ['bwa_gatk', 'bwa_mutect'],
  'RNASeq': ['star_rsem', 'star_htseq', 'kallisto', 'star_gatk'],
  'SmallRNASeq': ['bowtie2']}


#### Check which shell scripts are in a given pipeline or workflow

In [6]:
# Pipeline to list scripts for: a supported pipeline name, or "all".
target_pipeline = "all"

# Workflow to list scripts for: a supported workflow name, or "all".
# This value is ignored when target_pipeline == "all".
target_workflow = "all"

scripts_listing = AddonsManager.get_scripts(scripts, target_pipeline, target_workflow)
pprint(scripts_listing)

{'All Pipelines': ['fastqc.sh', 'multiqc.sh', 'trim.sh', 'run.sh'],
 'ChiPSeq': {'All Workflows': [],
             'homer': ['pos2bed.sh',
                       'find_motifs_genome.sh',
                       'annotate_peaks.sh',
                       'make_tag_directory.sh',
                       'bowtie.sh',
                       'findpeaks.sh',
                       'make_UCSC_file.sh']},
 'DNASeq': {'All Workflows': ['merge.sh',
                              'split.sh',
                              'sort.sh',
                              'bwa.sh',
                              'post.sh',
                              'dedup.sh'],
            'bwa_gatk': ['haplo.sh', 'group_vcf.sh', 'filter.sh'],
            'bwa_mutect': ['mutect.sh', 'merge_vcf_pairwise.sh']},
 'RNASeq': {'All Workflows': ['RNA_merge_counts.sh', 'make_group.sh'],
            'kallisto': ['k_align.sh', 'k_count.sh'],
            'star_gatk': ['gatk_align.sh', 'gatk_vc.sh'],
            'star_htseq': ['ht_ali

#### Print out a script

In [12]:
# Every target below must name a valid pipeline/workflow/script.
# Use the cells above to look up the valid options.

target_pipeline = "RNASeq"
target_workflow = "star_gatk"
target_script = "gatk_align.sh"

loc, file_cat = AddonsManager.cat_script(
    ssh_client, scripts, target_pipeline, target_workflow, target_script
)

# Frame `loc` between two rows of '#' that are as wide as `loc` itself.
banner = "#" * len(loc)
print("{0}\n{1}\n{0}".format(banner, loc))
AddonsManager.show_script(file_cat)

Executing cat /shared/workspace/Pipelines/scripts/RNASeq/star_gatk/gatk_align.sh
#################
Workflow Specific
#################


#### Check which step calls a script

In [4]:
# Name of a shell script on the cluster, including the "sh" extension.
target_script = "fastqc.sh"

calling_steps = AddonsManager.get_steps_calling_script(ssh_client, scripts, target_script)
print(calling_steps)

fastqc.sh called from:
fastqc in all Pipelines



#### Check configuration entries for a step

In [15]:
# Step whose configuration entries should be shown.
step_name = "star_gatk_align"

# Fetch both configuration dicts first, then pass them to get_step_config,
# whose result is printed.
step_tool_config, step_specific_config = AddonsManager.get_step_config_dicts(
    ssh_client, scripts, step_name
)
step_config_report = AddonsManager.get_step_config(
    ssh_client, scripts, step_name, step_tool_config, step_specific_config
)
print(step_config_report)

Executing cat /shared/workspace/Pipelines/config/tools.yaml
Executing python /shared/workspace/Pipelines/util/GetAllSpecificConfs.py
Executing cat /shared/workspace/Pipelines/scripts/RNASeq/star_gatk/gatk_align.sh

tools.yaml configuration entry for star_gatk_align step:
can_be_zipped: false
download_suffix: .trim{}
input_is_output: true
script_path: RNASeq/star_gatk/gatk_align
uses_chromosomes: false


RNASeq_star_gatk.yaml configuration entry for star_gatk_align step:
star_gatk_align:
- 4
Argument 1 is num_threads




### Edit Configuration Files and Scripts

#### Edit tools.yaml configuration entry for a step

In [7]:
# Step whose tools.yaml configuration entry is being edited.
step_name = "fastqc"

# New tools.yaml entry for that step.
new_step_tools_conf = {
    "can_be_zipped": False,
    "download_suffix": None,
    "input_is_output": False,
    "script_path": "fastqc",
    "uses_chromosomes": False,
    # At most one of the following three flags should be True.
    "all_samples": False,
    "by_pair": False,
    "by_group": False,
}

AddonsManager.edit_step_tools_config(ssh_client, new_step_tools_conf, step_name)

Executing cat /shared/workspace/Pipelines/config/tools.yaml
Executing mv -n /shared/workspace/Pipelines/config/tools.yaml /shared/workspace/Pipelines/config/tools.yaml.BACKUP
/home/mustafa/ccbb/cirrus-ngs/notebooks/cirrus-ngs/tools.yaml
/shared/workspace/Pipelines/config/tools.yaml


#### Edit pipeline/workflow specific configuration entry for a step

In [14]:
# Step, pipeline and workflow identifying the configuration entry to edit.
step_name = "fastqc"
target_pipeline = "DNASeq"
target_workflow = "bwa_mutect"

# New extra arguments for the step's entry in the pipeline/workflow-specific
# configuration file. Presumably these are passed on to the step's shell
# script; confirm against AddonsManager.
new_extra_bash_arguments = [1, 36]

AddonsManager.edit_step_specific_config(
    ssh_client,
    target_pipeline,
    target_workflow,
    new_extra_bash_arguments,
    step_name,
)

Executing cat /shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml
Executing mv -n /shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml /shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml.BACKUP
/home/mustafa/ccbb/cirrus-ngs/notebooks/cirrus-ngs/DNASeq_bwa_mutect.yaml
/shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml


#### Edit script for a step

Run the first two cells, then edit the script in the third cell.
After editing is finished, run the third and fourth cells to save and upload the new script.

In [9]:
# Run this cell and the next one right after one another.
# A new cell containing the target script will be created; edit the script in
# that cell and run it once editing is finished.
#
# Every target below must name a valid pipeline/workflow/script.
# Use the cells above to look up the valid options.

target_pipeline = "DNASeq"
target_workflow = "bwa_gatk"
target_script = "fastqc.sh"

# Keep this call as the cell's last bare expression so that its return value
# is displayed as the cell output.
AddonsManager.edit_script(
    ssh_client,
    scripts,
    target_pipeline,
    target_workflow,
    target_script,
)

Executing cat /shared/workspace/Pipelines/scripts/fastqc.sh


'%%writefile fastqc.sh\n#!/bin/bash\n\nproject_name=$1\nworkflow=$2\nfile_suffix=$3  #extension of input file, does not include .gz if present in input\nroot_dir=$4\nfastq_end1=$5\nfastq_end2=$6\ninput_address=$7    #this is an s3 address e.g. s3://path/to/input/directory\noutput_address=$8   #this is an s3 address e.g. s3://path/to/output/directory\nlog_dir=$9\nis_zipped=${10}    #either "True" or "False", indicates whether input is gzipped\n\n#logging\nlog_dir=$log_dir/$fastq_end1\nmkdir -p $log_dir\nlog_file=$log_dir/\'fastqc.log\'\nexec 1>>$log_file\nexec 2>>$log_file\n\nstatus_file=$log_dir/\'status.log\'\ntouch $status_file\n\n#prepare output directories\nworkspace=$root_dir/$project_name/$workflow/$fastq_end1\nmkdir -p $workspace\n\necho "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"\ndate\necho "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"\n\ncheck_step_already_done $JOB_NAME $status_file\n\n##DOWNLOAD##\nif [ ! -f $workspace/$fastq_end1$file_suffix ]\nthen\n    #this is the suffix of the inp

In [10]:
# Per the instructions in the previous cell, running this creates a new cell
# holding the target script; edit the script in that new cell.
# NOTE(review): presumably relies on the previous cell's output still being
# the most recent result, so run it immediately after that cell; confirm.
%recall

In [12]:
# Set either target to "all" when the script applies to all pipelines, or to
# all workflows within a pipeline.
target_pipeline = "all"
target_workflow = ""

# File name of the edited script to upload.
script_name = "fastqc.sh"

AddonsManager.upload_script(
    ssh_client,
    target_pipeline,
    target_workflow,
    script_name,
)

Executing mv -n /shared/workspace/Pipelines/scripts/fastqc.sh /shared/workspace/Pipelines/scripts/fastqc.sh.BACKUP
/home/mustafa/ccbb/cirrus-ngs/notebooks/cirrus-ngs/fastqc.sh
/shared/workspace/Pipelines/scripts/fastqc.sh


In [4]:
# Display the shell script template exposed by AddonsManager. The recorded
# output shows the placeholders {EXTRA ARGUMENTS HERE} and
# {LOG FILE NAME HERE} that need to be filled in.
# NOTE(review): the recorded template contains `log_dir=$log_dir/fastq_end1`
# (no `$` before fastq_end1), unlike the fastqc.sh shown earlier; verify.
AddonsManager.shell_script_template

'#!/bin/bash\n\nproject_name=$1\nworkflow=$2\nfile_suffix=$3\nroot_dir=$4\nfastq_end1=$5\nfastq_end2=$6\ninput_address=$7\noutput_address=$8\nlog_dir=$9\nis_zipped=${10}\n{EXTRA ARGUMENTS HERE}\n\n#logging\nlog_dir=$log_dir/fastq_end1\nmkdir -p $log_dir\nlog_file=$log_dir/{LOG FILE NAME HERE}\nexec 1>>$log_file\nexec 2>>$log_file\n\nstatus_file=$log_dir/\'status.log\'\ntouch $status_file\n\n#prepare output directories\nworkspace=$root_dir/$project_name/$workflow/$fastq_end1\nmkdir -p $workspace\n\necho "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"\ndate\necho "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"\n\ncheck_step_already_done $JOB_NAME $status_file\n\n##DOWNLOAD##\nif [ ! -f $workspace/$fastq_end1$file_suffix ]\nthen\n    download_suffix=$file_suffix\n\n    if [ "$is_zipped" == "True" ]\n    then\n        download_suffix=$file_suffix".gz"\n    fi\n\n    check_exit_status "aws s3 cp $input_address/$fastq_end1$download_suffix $workspace/" $JOB_NAME $status_file\n    gunzip -q $workspace/$fastq_en

In [5]:
# Presumably places the template string displayed by the previous cell into a
# new input cell for editing; confirm against the IPython %recall docs.
%recall

These cells add configuration entries for the new shell script.

In [38]:
# Step, pipeline and workflow that the new configuration entries belong to.
step_name = "trim"

target_pipeline = "DNASeq"
target_workflow = "bwa_mutect"

# tools.yaml entry for the step.
new_step_tools_conf = {
    "can_be_zipped": False,
    "download_suffix": None,
    "input_is_output": False,
    "script_path": "trim",
    "uses_chromosomes": False,
    # Further flags, left commented out in this example; at most one of them
    # should be True:
    #   "all_samples": False,
    #   "by_pair": False,
    #   "by_group": False,
}

# Extra arguments for the step's pipeline/workflow-specific entry.
new_extra_bash_arguments = [1, 1]

# Edit the pipeline/workflow-specific entry first, then the tools.yaml entry.
AddonsManager.edit_step_specific_config(
    ssh_client,
    target_pipeline,
    target_workflow,
    new_extra_bash_arguments,
    step_name,
)
AddonsManager.edit_step_tools_config(ssh_client, new_step_tools_conf, step_name)

Executing cat /shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml
Executing mv -n /shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml /shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml.BACKUP
/home/mustafa/ccbb/cirrus-ngs/notebooks/cirrus-ngs/DNASeq_bwa_mutect.yaml
/shared/workspace/Pipelines/config/DNASeq/DNASeq_bwa_mutect.yaml
Executing cat /shared/workspace/Pipelines/config/tools.yaml
Executing mv -n /shared/workspace/Pipelines/config/tools.yaml /shared/workspace/Pipelines/config/tools.yaml.BACKUP
/home/mustafa/ccbb/cirrus-ngs/notebooks/cirrus-ngs/tools.yaml
/shared/workspace/Pipelines/config/tools.yaml


In [16]:
# Restore backups on the cluster. The recorded output shows that this runs
# /shared/workspace/Pipelines/util/RestoreBackups.py.
# NOTE(review): presumably this reverts files to the *.BACKUP copies created
# by the edit/upload cells above; confirm before relying on it.
AddonsManager.restore_backups(ssh_client)

Executing python /shared/workspace/Pipelines/util/RestoreBackups.py


### Check Software on Cluster

This section is used to check what software is installed on the cluster.

#### Get all software installed on cluster

In [16]:
# Fetch the software dict from the cluster and pretty-print it.
# `software` is also read by the tool-version check cell further down.
software = AddonsManager.get_software_dict(ssh_client)
print("All Installed Software: ")
pprint(software)

Executing python /shared/workspace/Pipelines/util/GetSoftware.py
All Installed Software: 
{'FastQC': ['0.11.3'],
 'HTSeq': ['0.9.1 (installed with pip)'],
 'MultiQC': ['1.3 (installed with conda)'],
 'RSEM': ['1.3.0'],
 'STAR': ['2.3.0e', '2.5.1a', '2.5.3a'],
 'Trimmomatic': ['0.36'],
 'bedtools2': ['2.19.1'],
 'blat': ['36x1'],
 'bowtie': ['1.0.1'],
 'bowtie2': ['2.3.3-linux', '2.3.2-legacy'],
 'bwa': ['0.7.12-r1039'],
 'cutadapt': ['1.14 (installed with conda)'],
 'gatk': ['3.8-0'],
 'ghostscript': ['9.19'],
 'homer': ['4.8.3'],
 'java': ['jre1.8.0_144'],
 'kallisto': ['0.43.1'],
 'picard': ['1.96'],
 'sambamba': ['0.4.7'],
 'samblaster': ['0.1.21'],
 'samtools': ['1.1'],
 'tabix': ['0.2.6'],
 'vcftools': ['0.1.12b'],
 'weblogo': ['2.8']}


#### Check the version of a specific tool

In [27]:
# Tool name to look up; the lookup ignores case.
target_tool = "tools"

install_report = AddonsManager.check_tool_is_installed(software, target_tool)
print(install_report)

Did you mean samtools or vcftools or bedtools2?
