In [None]:
%%writefile main.nf
#!/usr/bin/env nextflow

/*
 * Pipeline input parameters -- update these to point at your data directory.
 * `projectDir` (the directory containing this script) replaces the deprecated
 * `baseDir` implicit variable; both resolve to the same location.
 */

// Glob selecting the compressed VCF files to analyse.
bdcat_vcfs =  "$projectDir/gdc_vcf/*.vcf.gz"
// Reference file handed to the analysis script via --chip-truth-variants.
reference_file =  "$projectDir/gdc_vcf/aacr.chip.project.tsv"

// Not referenced by the process or workflow visible in this script; kept unchanged.
project_dir = projectDir

/*
 * Runs /app/analyze_vcf_plain.py on the supplied VCF input.
 *
 * Inputs:
 *   vcf_files      - VCF file(s) passed to the script as --input-vcf
 *   chips_ref_file - reference file passed as --chip-truth-variants
 *
 * Outputs:
 *   gdc_api_log - standard output of the task
 *   csv_files   - every file matching *.csv in the task directory
 *                 (the script is told to write 'analysis_results.csv')
 */
process vcf_extraction {

    // Matched by `withLabel: vcf_analysis` in nextflow.config.
    label 'vcf_analysis'

    input:
    path vcf_files
    // `path` replaces the legacy `file` qualifier so both inputs are declared the same way.
    path chips_ref_file

    output:
    stdout emit: gdc_api_log
    path('*.csv'), emit: csv_files

    script:
    """
    python3 /app/analyze_vcf_plain.py --input-vcf $vcf_files --output-csv 'analysis_results.csv' --chip-truth-variants $chips_ref_file
    """

}

// Entry workflow: the unnamed workflow is the one Nextflow runs for this script.
workflow {

    // Files matching the VCF glob are emitted one by one into the first input;
    // the reference file is a single value supplied alongside each of them.
    vcf_extraction(
        Channel.fromPath( bdcat_vcfs ),
        file(reference_file)
    )
}

In [None]:
%%writefile nextflow.config


// Settings applied to every process carrying the 'vcf_analysis' label (see main.nf).
process {
    withLabel: vcf_analysis {
        executor  = 'awsbatch'
        queue     = 'placeholder'    // replace with your own value
        container = 'placeholder'    // replace with your own value
    }
}

// AWS settings used by the awsbatch executor.
aws.region = 'us-east-1'
aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws'
aws.batch.jobRole = 'placeholder'    // replace with your own value

// Pipeline work directory -- replace with your own value.
// NOTE(review): the notebook later fetches task outputs with `aws s3 cp`,
// so this is presumably an S3 location -- confirm.
workDir = 'placeholder'

docker {
    enabled = true
}

In [None]:
# Run the pipeline written to main.nf by the first cell. The cells below read
# the .nextflow.log file from the current directory to locate the task outputs.
!nextflow run main.nf -dsl2

In [None]:
!pip install -q awscli
import os
import subprocess

import pandas as pd

In [None]:
# Collect the AWS location recorded for each completed batch task.
# Every line of .nextflow.log containing "COMPLETED" contributes its last
# space-separated token with the final two characters removed (presumably a
# closing bracket and the newline -- confirm against your log). The list
# therefore holds one entry per matching log line.
end_points = []
with open(".nextflow.log", 'r') as log_file:
    end_points.extend(
        entry.split(' ')[-1][:-2]
        for entry in log_file
        if "COMPLETED" in entry
    )

In [None]:
# Download the files of each batch task into the local 'results' folder.
# NOTE(review): every task copies into the same folder and main.nf makes each
# task write 'analysis_results.csv', so when several VCFs are processed later
# downloads overwrite earlier ones -- confirm whether per-task folders are needed.
for end_point in end_points:
    # Pass an argument list (no shell) so nothing in the location string is
    # interpreted by a shell, and inspect the exit status so a failed copy
    # does not go unnoticed.
    completed = subprocess.run(
        [
            "aws", "s3", "cp", f"{end_point}/", "./results/",
            "--recursive", "--exclude", "*", "--include", "*", "--quiet",
        ],
        check=False,
    )
    if completed.returncode != 0:
        print(f"WARNING: download from {end_point} failed (exit code {completed.returncode})")

In [None]:
# Load the downloaded analysis CSV and show it as the cell output.
results_csv = 'results/analysis_results.csv'
results_df = pd.read_csv(results_csv)
results_df