# miRNA Pipeline (with MultiQC analysis)

Notice: Before running this notebook, please open the notebook at /notebooks/awsCluster/BasicCFNClusterSetup.ipynb and use it to install the CFNCluster package on your Jupyter notebook server.

# 1. Configure AWS key pair, data location on S3 and the project information

In [None]:
import os
import sys

# --- S3 locations ---------------------------------------------------------
# Input and output addresses on S3.
# Important: do NOT end either address with a forward slash.
s3_input_files_address = "s3://path/to/s3_input_files_address"
s3_output_files_address = "s3://path/to/s3_output_files_address"

# --- Local paths ----------------------------------------------------------
# Directory of this notebook; the MultiQC report is downloaded here and
# displayed from here.
notebook_dir = "path/to/notebook"

# Location of the design file.
design_file = "/path/to/design_file.txt"

# --- Cluster access -------------------------------------------------------
# Name of the CFNCluster.
your_cluster_name = "cluster_name"

# Private key (.pem) used to access the cluster.
private_key = "/path/to/aws_private_key.pem"

# --- Project settings -----------------------------------------------------
# Recommended format: year, month, date, user name and pipeline name,
# written without any spaces.
project_name = "your_project_name"

# Workflow to run; "bowtie2" is the only one available at the moment.
workflow = "bowtie2"

# Genome to use; currently available genomes: human, mouse.
genome = ""

# Analysis steps, given as a set of step names. Listed steps:
# "fastqc", "trim", "cut_adapt", "align_and_count", "merge_counts", "multiqc"
analysis_steps = {""}

# Whether to delete the CFNCluster after the job is done.
delete_cfncluster = False

print("Variables set.")

# 2. Create CFNCluster

Notice: The CFNCluster package can only be installed on a Linux machine that supports pip installation.

In [None]:
# Make the cirrus_ngs sources importable; the path is relative to the
# current working directory.
sys.path.append("../../src/cirrus_ngs")
from cfnCluster import CFNClusterManager
from cfnCluster import ConnectionManager

# Create a new cluster; the returned value is used below as the master's
# hostname.
master_ip_address = CFNClusterManager.create_cfn_cluster(cluster_name=your_cluster_name)

# Connect to the master node as "ec2-user" with the configured private key.
ssh_client = ConnectionManager.connect_master(
    hostname=master_ip_address,
    username="ec2-user",
    private_key_file=private_key,
)

# 3. Run the miRNA sequencing pipeline

In [None]:
from util import PipelineManager
from util import DesignFileLoader

## DO NOT edit below
# The pipeline receives the genome name prefixed with "hairpin_". Add the
# prefix only when it is missing: notebook cells are often re-executed, and an
# unconditional prefix would turn "hairpin_human" into "hairpin_hairpin_human"
# on the second run.
if not genome.startswith("hairpin_"):
    genome = "hairpin_{}".format(genome)
print(genome)

# Read the sample, group and pair lists from the design file.
sample_list, group_list, pair_list = DesignFileLoader.load_design_file(design_file)

# Launch the "SmallRNASeq" pipeline over the SSH connection to the cluster.
# NOTE(review): "NA" fills a positional argument whose meaning is not visible
# in this notebook -- confirm against PipelineManager.execute.
PipelineManager.execute("SmallRNASeq", ssh_client, project_name, workflow, analysis_steps, s3_input_files_address,
                       sample_list, group_list, s3_output_files_address, genome, "NA", pair_list)

# 4. Display MultiQC report

### Note: Run the cells below after all jobs are done on the cluster.

In [None]:
# Copy multiqc_report.html from <s3_output_files_address>/<project_name>/<workflow>/
# on S3 into notebook_dir, where the next cell looks for it.
!aws s3 cp $s3_output_files_address/$project_name/$workflow/multiqc_report.html $notebook_dir

In [None]:
from IPython.display import IFrame

# Express the downloaded report's location relative to the current working
# directory (os.curdir is what os.path.relpath uses by default).
path_to_report = os.path.relpath(notebook_dir + "/multiqc_report.html", start=os.curdir)

# Keep the IFrame as the last expression of the cell so that it is rendered
# as the cell's output.
IFrame(
    path_to_report,
    width="100%",
    height=1000,
)