# Run WES-QC pipeline

This Jupyter notebook runs all steps of the WES-QC pipeline.

It supports running both from a local machine and from a remote cluster.

### Set up jupyter notebook to run on local or on the cluster
By default, the notebook is assumed to run on a local machine, or on any machine that has SSH access to the cluster.

For a local run, you need an SSH host alias named `wes` that specifies the username and IP address
of your cluster. You can define it in your SSH config like this:

```
Host wes
    HostName 172.27.1.1
    User ubuntu
    IdentityFile ~/.ssh/id_rsa
``` 

Set `jupyter_notebook_on_cluster` to `True` if you want to run this Jupyter notebook directly on the cluster.

In [1]:
# Execution mode switch read by `run_cmd`: when False, commands are sent to the
# cluster through `ssh wes`; when True, they are executed directly on this machine.
jupyter_notebook_on_cluster = False

### Specify the path to the wes-qc directory

The whole WES-QC repository must be located on the cluster.
Make sure `config/inputs.yaml` is a symlink to the correct dataset config.

In [2]:
# Location of the wes-qc checkout; every command below starts with `cd {path_to_wes}`.
# Alternative checkout kept for reference:
# path_to_wes = "/home/ubuntu/temprun/wes_qc"
path_to_wes = "/lustre/scratch126/teams/hgi/eh19/wes-qc"

### How to add steps to stages

The notebook runs all steps of a stage using a step dictionary that maps
each script name (with all command-line parameters, if they are needed)
to the corresponding log file name.

To add or modify a step, change the dictionary of the corresponding stage.

In [3]:
# Configure logging for the whole notebook: INFO and above, with every
# record prefixed by its timestamp and level name.
import logging

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [4]:
# The small function to run command either via SSH (for local run)
# or directly (when the notebook is on cluster
def run_cmd(cmd):
    if jupyter_notebook_on_cluster:
        !{cmd}
    else:
        !ssh wes "{cmd}"

### Check Hail is working on the cluster

In [5]:
# Smoke test: start Hail with the cluster's Python interpreter from the wes-qc directory.
python = "/home/ubuntu/venv/bin/python"
if jupyter_notebook_on_cluster:
    # Executed directly by the shell: plain double quotes around the snippet.
    cmd = f'cd {path_to_wes}; {python} -c "import hail as hl; hl.init()" '
else:
    # run_cmd wraps the command in double quotes for ssh, so the inner
    # quotes have to be backslash-escaped.
    cmd = f'cd {path_to_wes}; {python} -c \\"import hail as hl; hl.init()\\" '
run_cmd(cmd)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Running on Apache Spark version 3.5.3
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.133-4c60fddb171a
LOGGING: writing to /home/ubuntu/temprun/wes_qc/hail-20250106-1104-0.2.133-4c60fddb171a.log


### Step 0 - Resource Preparation

In [None]:
# Stage 0: each key is a script invocation (with its CLI arguments) and each
# value is the prefix handed to hlrun_local through --prefix.
step0 = {"1-import_1kg.py --all": "0-1-import-1kg"}
for script, prefix in step0.items():
    # Visual separator between steps in the cell output.
    print("=" * 120, end="\n\n")
    logger.info("Running %s", script)
    cmd = (
        f"cd {path_to_wes} && "
        f"./scripts/hlrun_local --prefix={prefix} 0-resource_preparation/{script}"
    )
    run_cmd(cmd)

2025-01-06 11:04:58,797 - INFO - Running 1-import_1kg.py --all



Running the job with spark-submit
spark-submit 0-resource_preparation/1-import_1kg.py --all
info: script_dir  /home/ubuntu/temprun/wes_qc/0-resource_preparation
Loading config '/home/ubuntu/temprun/wes_qc/0-resource_preparation/../config/inputs.yaml', default
=== Cleaning up temporary folder /lustre/scratch126/teams/hgi/gz3/public-dataset/tmp
=== Initializing Hail ===
Running on Apache Spark version 3.5.3
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.133-4c60fddb171a
LOGGING: writing to /home/ubuntu/temprun/wes_qc/hail-20250106-1105-0.2.133-4c60fddb171a.log
Loading VCFs from file:///lustre/scratch126/teams/hgi/gz3/public-dataset/resources/mini_1000G
2025-01-06 11:05:17.482 Hail: INFO: Reading table without type imputation1) / 1]
  Loading field 'Sample name' as type str (not specified)
  Loading field 'Sex' as type str (not specified)
  Loading field 'Biosample ID' as type str (no

### Step 1 - Import Data

In [None]:
# Stage 1: each key is a script invocation and each value is the prefix
# handed to hlrun_local through --prefix.
step1 = {
    "1-import_gatk_vcfs_to_hail.py": "1-1-import_gatk_vcfs_to_hail",
    "2-import_annotations.py": "1-2-import_annotations",
}
for script, prefix in step1.items():
    # Visual separator between steps in the cell output.
    print("=" * 120, end="\n\n")
    logger.info("Running %s", script)
    cmd = (
        f"cd {path_to_wes} && "
        f"./scripts/hlrun_local --prefix={prefix} 1-import_data/{script}"
    )
    run_cmd(cmd)

### Step 2 - Sample QC

In [None]:
# Stage 2: each key is a script invocation (with its CLI arguments) and each
# value is the prefix handed to hlrun_local through --prefix.
# NOTE(review): steps 1 and 2 are currently disabled; re-enable them for a
# complete run from scratch — confirm whether skipping them is intentional.
step2 = {
    # "1-hard_filters_sex_annotation.py": "2-1-hard-filters-sex-annotation",
    # "2-prune_related_samples.py": "2-2-prune-related-samples",
    "3-population_pca_prediction.py --all": "2-3-population-pca-prediction",
    "4-find_population_outliers.py": "2-4-find-population-outliers",
    "5-filter_fail_sample_qc.py": "2-5-filter-fail-sample-qc",
}
for script, prefix in step2.items():
    # Visual separator between steps in the cell output.
    print("=" * 120 + "\n")
    logger.info(f"Running {script}")
    # `&&` (as in stages 0 and 1) instead of `;`: if the `cd` fails, the script
    # must not be launched from whatever directory the shell happens to be in.
    cmd = f"cd {path_to_wes} && ./scripts/hlrun_local --prefix={prefix} 2-sample_qc/{script}"
    run_cmd(cmd)

### Step 3 - Variant QC 
Run with the given model id

In [None]:
# Identifier passed to the training script and afterwards written into
# config/inputs.yaml as `rf_model_id`.
model_id = "test-run-model"

# First part of stage 3, up to and including model training. Keys are script
# invocations, values are the prefix handed to hlrun_local through --prefix.
step3 = {
    "1-generate_truth_sets.py --all": "3-1-generate-truth-sets",
    "2-create_rf_ht.py": "3-2-create-rf-ht",
    # This key must be an f-string: as a plain string the literal text
    # "{model_id}" was passed to the script, so the trained model id did not
    # match the `rf_model_id` written to the config below.
    f"3-train_rf.py --manual-model-id {model_id}": "3-3-train-rf",
}
for script, prefix in step3.items():
    # Visual separator between steps in the cell output.
    print("=" * 120 + "\n")
    logger.info(f"Running {script}")
    # `&&` (as in stages 0 and 1): do not launch the script if the `cd` fails.
    cmd = f"cd {path_to_wes} && ./scripts/hlrun_local --prefix={prefix} 3-variant_qc/{script}"
    run_cmd(cmd)

# Record the model id in the dataset config (following the symlink), presumably
# so that the remaining steps pick up the model trained above — verify against the scripts.
# NOTE: model_id is inserted verbatim into the sed expression, so it must not contain `/`.
yaml_file = "config/inputs.yaml"
cmd = f"sed --follow-symlinks -i 's/rf_model_id:.*/rf_model_id: {model_id}/' {path_to_wes}/{yaml_file}"
run_cmd(cmd)

# Second part of stage 3: steps that run after the config has been updated.
step3 = {
    "4-apply_rf.py": "3-4-apply-rf",
    "5-annotate_ht_after_rf.py": "3-5-annotate-ht-after-rf",
    "6-rank_and_bin.py": "3-6-rank-and-bin",
    "7-plot_rf_output.py": "3-7-plot-rf-output",
    "8-select_thresholds.py --snv 80 --indel 60": "3-8-select-thresholds",
    "9-filter_mt_after_variant_qc.py --snv 80 --indel 60": "3-9-filter-mt-after-variant-qc",
}
for script, prefix in step3.items():
    # Visual separator between steps in the cell output.
    print("=" * 120 + "\n")
    logger.info(f"Running {script}")
    # `&&` (as in stages 0 and 1): do not launch the script if the `cd` fails.
    cmd = f"cd {path_to_wes} && ./scripts/hlrun_local --prefix={prefix} 3-variant_qc/{script}"
    run_cmd(cmd)

### Step 4 - Genotype QC

In [None]:
# Stage 4: each key is a script invocation and each value is the prefix
# handed to hlrun_local through --prefix.
step4 = {
    "1-compare_hard_filter_combinations.py": "4-1-compare-hard-filter-combinations",
    "2-apply_range_of_hard_filters.py": "4-2-apply-range-of-hard-filters",
    "3a-export_vcfs_range_of_hard_filters.py": "4-3a-export-vcfs-range-of-hard-filters",
    # NOTE(review): "stingent" is kept as written; presumably it matches the
    # script's file name on disk — confirm before renaming.
    "3b-export_vcfs_stingent_filters.py": "4-3b-export-vcfs-stingent-filters",
}
for script, prefix in step4.items():
    # Visual separator between steps in the cell output.
    print("=" * 120 + "\n")
    logger.info(f"Running {script}")
    # `&&` (as in stages 0 and 1) instead of `;`: if the `cd` fails, the script
    # must not be launched from whatever directory the shell happens to be in.
    cmd = f"cd {path_to_wes} && ./scripts/hlrun_local --prefix={prefix} 4-genotype_qc/{script}"
    run_cmd(cmd)