# 0. Run RNA-seq Alignment Pipeline

This notebook executes a complete RNA-seq alignment pipeline by:
1. Fetching sample metadata using nf-core/fetchngs
2. Downloading and optionally subsampling FASTQ files
3. Running the nf-core/rnaseq pipeline for alignment and quantification

### Required User Input

1. Define the following in the configuration cell:
   - `EXPERIMENT_ID`: Unique identifier for the experiment
   - `EXTRA_EXPERIMENT_IDS`: Extra IDs to process with the primary experiment_id
   - `STAR_REFERENCE_DIRECTORY`: Path to reference genome files
   - `AWS`: Boolean flag for AWS execution

2. Ensure reference files are available:
   - STAR index
   - RSEM index
   - Salmon index
   - Reference FASTA
   - GTF annotation
   
3. Copy each Nextflow command printed by the notebook and run it in a terminal

#### Expected Input Files
1. Configuration files in `configs/` directory:
   - `fetchngs.config`
   - `rnaseq.config`
2. STAR index described above

#### Output Files
1. `fetchngs_output/`
   - Sample metadata
   - Downloaded FASTQ files
2. `rnaseq_output/`
   - Aligned BAM files
   - Gene/transcript quantification
   - MultiQC report
   - Pipeline execution logs

In [9]:
from pathlib import Path
import shlex
import shutil
import os 
import glob

import pandas as pd
import csv
import s3fs

from src.utils import check_gzipped_fastq_integrity

### 0.1 Configure Notebook

#### Required Variable Definitions:

Processing Parameters
- *SAMPLING_DEPTH*: Target read depth for subsampling (default: 50M reads)
- *SAMPLING_CUTOFF*: Threshold above which samples will be subsampled (default: 75M reads)
- *AWS*: Boolean flag to determine if running on AWS infrastructure

File Paths
- *BASE_DIR*: Root directory path for the project (default: current working directory)
- *CONFIG_DIR*: Directory containing Nextflow configuration files
- *EXPERIMENT_ID*: Unique identifier for the experiment
- *EXTRA_EXPERIMENT_IDS*: List of additional experiment IDs to process
- *OUTPUT_DIR*: Output directory for pipeline results (default: the parent directory of BASE_DIR)
- *STAR_REFERENCE_DIRECTORY*: Path to directory containing reference genome files
- *STAR_S3_DIRECTORY*: S3 location the reference files are downloaded from when they are not present in STAR_REFERENCE_DIRECTORY

In [25]:
# Notebook configuration.
# CONFIG_DIR must hold the Nextflow config files and STAR_REFERENCE_DIRECTORY
# must point at the reference genome files; adjust both before running.

# --- Processing parameters ---
SAMPLING_DEPTH = 50_000_000   # target read depth when subsampling
SAMPLING_CUTOFF = 75_000_000  # samples with more reads than this are subsampled
AWS = False                   # True selects the 'aws' profile for the FASTQ download

# --- Project layout (derived from the directory the notebook runs in) ---
BASE_DIR = Path.cwd()
OUTPUT_DIR = BASE_DIR.parent
CONFIG_DIR = BASE_DIR.joinpath('configs')

# The experiment ID is the second-to-last component of BASE_DIR.
EXPERIMENT_ID = BASE_DIR.parts[-2]
EXTRA_EXPERIMENT_IDS = []

# --- Reference genome locations ---
STAR_REFERENCE_DIRECTORY = Path('/data/expression_atlas/genome/GRCh38.p14')
STAR_S3_DIRECTORY = 's3://velia-data-dev/VDC_004_annotation/genomes/GRCh38.p14'

#### 0.2 Download the reference if it does not exist locally

In [None]:
# Download the reference bundle from S3 unless every index directory used by
# the alignment command (star, rsem, salmon) is already present locally.

s3 = s3fs.S3FileSystem()

REQUIRED_REFERENCE_SUBDIRS = ('star', 'rsem', 'salmon')

STAR_REFERENCE_DIRECTORY.mkdir(exist_ok=True, parents=True)
missing_reference_subdirs = [
    subdir
    for subdir in REQUIRED_REFERENCE_SUBDIRS
    if not (STAR_REFERENCE_DIRECTORY / subdir).exists()
]
if missing_reference_subdirs:
    # NOTE(review): the local target already exists at this point; confirm that
    # the installed s3fs/fsspec copies the *contents* of STAR_S3_DIRECTORY into
    # it rather than nesting a second GRCh38.p14 directory inside it.
    # The local target is passed as a plain string rather than a Path object.
    s3.get(STAR_S3_DIRECTORY, str(STAR_REFERENCE_DIRECTORY), recursive=True)

#### 0.3 Create fetchngs project file 

In [None]:
# fetchngs input: one accession per line, primary experiment first,
# followed by any extra experiment IDs.
project_ids = [EXPERIMENT_ID, *EXTRA_EXPERIMENT_IDS]
(OUTPUT_DIR / 'projects.txt').write_text(
    ''.join(f'{project_id}\n' for project_id in project_ids)
)

#### 0.4 Run nf-core/fetchngs to retrieve experiment metadata required for running nf-core/rnaseq

In [None]:
# Assemble the nf-core/fetchngs command that retrieves metadata only
# (FASTQ download is skipped) and print it for copy/paste into a terminal.
# Every argument is shell-quoted so the command stays valid when a path
# contains spaces or shell metacharacters; arguments made only of safe
# characters are printed unchanged.
nxf_cmd_metadata_args = [
    'nextflow', 'run', f'{BASE_DIR}/fetchngs',
    '--input', f'{OUTPUT_DIR}/projects.txt',
    '--outdir', f'{OUTPUT_DIR}/fetchngs_output',
    '--nf_core_pipeline', 'rnaseq',
    '--skip_fastq_download',
    '-resume',
    '-profile', 'docker',
    '-c', f'{CONFIG_DIR}/fetchngs.config',
]
nxf_cmd_metadata = ' '.join(shlex.quote(arg) for arg in nxf_cmd_metadata_args)
print(nxf_cmd_metadata)

#### 0.5 Read samplesheet to determine if subsampling/filtering necessary

In [None]:
# Load the run-level metadata written by fetchngs, drop unwanted samples, decide
# whether subsampling is needed, and write the edited sheet used by the download
# step.

metadata_dir = OUTPUT_DIR / 'fetchngs_output' / 'metadata'

# Read the runinfo of the primary experiment AND of every extra experiment ID;
# reading only EXPERIMENT_ID would silently leave the extra IDs out of the
# download. Assumes fetchngs writes one `<id>.runinfo_ftp.tsv` per ID listed in
# projects.txt, matching the naming used for EXPERIMENT_ID -- TODO confirm.
runinfo_raw = pd.concat(
    [
        pd.read_csv(metadata_dir / f'{project_id}.runinfo_ftp.tsv', sep='\t')
        for project_id in [EXPERIMENT_ID, *EXTRA_EXPERIMENT_IDS]
    ],
    ignore_index=True,
)

# Add experiment accessions here to exclude samples/conditions from the run.
samples_to_remove = []
samplesheet = runinfo_raw.loc[
    ~runinfo_raw['experiment_accession'].isin(samples_to_remove)
]

# Decide on subsampling from the samples that are actually kept, so a removed
# deep sample does not switch subsampling on for everything else.
subsample_on = bool((samplesheet['read_count'] > SAMPLING_CUTOFF).any())

# NOTE(review): the DataFrame index is written as an unnamed first column, as
# before; confirm whether the download workflow needs or merely tolerates it.
samplesheet.to_csv(
    metadata_dir / f'{EXPERIMENT_ID}.runinfo_ftp_edit.tsv',
    sep='\t',
)

#### 0.6 Run custom download from sra bucket

In [None]:
# Assemble the custom FASTQ download command from the edited metadata sheet and
# print it for copy/paste into a terminal.
# Every argument is shell-quoted so the command stays valid when a path
# contains spaces or shell metacharacters; arguments made only of safe
# characters are printed unchanged.
nxf_cmd_fastq_args = [
    'nextflow', 'run', f'{BASE_DIR}/nf_download',
    '--samplesheet', f'{OUTPUT_DIR}/fetchngs_output/metadata/{EXPERIMENT_ID}.runinfo_ftp_edit.tsv',
    '--output_directory', f'{OUTPUT_DIR}/fetchngs_output/fastq',
    '--subsample', 'true' if subsample_on else 'false',
    '--sampling_depth', str(SAMPLING_DEPTH),
    '--sampling_cutoff', str(SAMPLING_CUTOFF),
    '-w', f'{OUTPUT_DIR}/work',
    # The AWS flag from the config cell picks the execution profile.
    '-profile', 'aws' if AWS else 'docker',
]
nxf_cmd_fastq = ' '.join(shlex.quote(arg) for arg in nxf_cmd_fastq_args)
print(nxf_cmd_fastq)

#### 0.7 Rearrange the samplesheet produced by fetchngs to make it compatible with nf-core/rnaseq

In [None]:
# Build the nf-core/rnaseq samplesheet from the one written by fetchngs:
#   1. point fastq_1 / fastq_2 at the locally downloaded files,
#   2. keep only samples whose FASTQ files exist and pass the gzip integrity check,
#   3. write the result next to the original as samplesheet_edit.csv.

samplesheet_raw = pd.read_csv(
    OUTPUT_DIR / 'fetchngs_output' / 'samplesheet' / 'samplesheet.csv'
)
fastq_dir = OUTPUT_DIR / 'fetchngs_output' / 'fastq'


def _expected_fastqs(row):
    """Return the (fastq_1, fastq_2) paths expected in ``fastq_dir`` for one row.

    A ``SINGLE`` library layout has a single file and ``fastq_2`` is None; every
    other layout gets the ``_1`` / ``_2`` pair.
    """
    stem = f'{row["experiment_accession"]}_{row["run_accession"]}'
    if row["library_layout"] == "SINGLE":
        return fastq_dir / f'{stem}.fastq.gz', None
    return fastq_dir / f'{stem}_1.fastq.gz', fastq_dir / f'{stem}_2.fastq.gz'


def _fastqs_usable(fastqs):
    """Return True when every expected FASTQ exists and passes the integrity check.

    Assumes ``check_gzipped_fastq_integrity`` returns a truthy value for an
    intact file -- TODO confirm against src.utils.
    """
    return all(
        fastq.exists() and check_gzipped_fastq_integrity(fastq)
        for fastq in fastqs
        if fastq is not None
    )


expected_fastqs = [_expected_fastqs(row) for _, row in samplesheet_raw.iterrows()]

# Store paths as plain strings; single-end samples keep an empty fastq_2.
samplesheet_with_paths = samplesheet_raw.assign(
    fastq_1=[str(fastq_1) for fastq_1, _ in expected_fastqs],
    fastq_2=[None if fastq_2 is None else str(fastq_2) for _, fastq_2 in expected_fastqs],
)

# Keep only the samples whose files are present and readable.
samplesheet = samplesheet_with_paths.loc[
    [_fastqs_usable(fastqs) for fastqs in expected_fastqs]
]

samplesheet.to_csv(
    OUTPUT_DIR / 'fetchngs_output' / 'samplesheet' / 'samplesheet_edit.csv',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)
samplesheet

#### 0.8 Run nf-core/rnaseq pipeline

In [None]:
# Assemble the nf-core/rnaseq command that aligns and quantifies the samples in
# samplesheet_edit.csv against the local reference, and print it for copy/paste
# into a terminal.
# Every argument is shell-quoted so the command stays valid when a path
# contains spaces or shell metacharacters; arguments made only of safe
# characters are printed unchanged.
nxf_cmd_align_args = [
    'nextflow', 'run', f'{BASE_DIR}/rnaseq',
    '--input', f'{OUTPUT_DIR}/fetchngs_output/samplesheet/samplesheet_edit.csv',
    '--fasta', f'{STAR_REFERENCE_DIRECTORY}/rsem/GRCh38.p14.genome.fa',
    '--gtf', f'{STAR_REFERENCE_DIRECTORY}/veliadb_v0c.gtf',
    '--star_index', f'{STAR_REFERENCE_DIRECTORY}/star',
    '--transcript_fasta', f'{STAR_REFERENCE_DIRECTORY}/rsem/genome.transcripts.fa',
    '--rsem_index', f'{STAR_REFERENCE_DIRECTORY}/rsem',
    '--salmon_index', f'{STAR_REFERENCE_DIRECTORY}/salmon',
    '--outdir', f'{OUTPUT_DIR}/rnaseq_output',
    '-w', f'{OUTPUT_DIR}/work',
    '-profile', 'docker',
    '-c', f'{CONFIG_DIR}/rnaseq.config',
    '-resume',
]
nxf_cmd_align = ' '.join(shlex.quote(arg) for arg in nxf_cmd_align_args)
print(nxf_cmd_align)

#### 0.9 Clean up nextflow directories and remove fastqs 

In [None]:
# Remove the Nextflow work directory and the downloaded FASTQ files.
# Safe to re-run: a work directory that is already gone is skipped instead of
# raising before the FASTQ files are cleaned up.

work_dir = OUTPUT_DIR / 'work'
if work_dir.exists():
    shutil.rmtree(work_dir)

for fq in (OUTPUT_DIR / 'fetchngs_output' / 'fastq').glob('*.gz'):
    fq.unlink()