From e3247146bfd8182997ca2a8a604b96d9573e3160 Mon Sep 17 00:00:00 2001
From: Tarun Mamidi <tmamidi@uab.edu>
Date: Thu, 1 Feb 2024 17:19:40 -0600
Subject: [PATCH] PR requests part-2

---
 .test_data/README                       |  2 +-
 .test_data/{file_list => file_list.txt} |  0
 CHANGELOG.md                            |  8 ++++++++
 README.md                               | 21 +++++++++++++--------
 model.job                               |  2 +-
 pipeline.nf                             |  2 +-
 src/analysis/filter.sh                  |  2 +-
 7 files changed, 25 insertions(+), 12 deletions(-)
 rename .test_data/{file_list => file_list.txt} (100%)

diff --git a/.test_data/README b/.test_data/README
index 74d8dba..c0fa03a 100644
--- a/.test_data/README
+++ b/.test_data/README
@@ -4,4 +4,4 @@ This directory has 3 files -
 
 `testing_variants_hg38.vcf.gz` - We custom made a test VCF file with few variants from every chromosome (1-22,X,Y)
 
-`file_list` - contains list of above 2 test vcf files with relative path. This file is used to test nextflow pipeline
+`file_list.txt` - contains a list of the above 2 test VCF files with relative paths. This file is used to test the Nextflow pipeline.
diff --git a/.test_data/file_list b/.test_data/file_list.txt
similarity index 100%
rename from .test_data/file_list
rename to .test_data/file_list.txt
diff --git a/CHANGELOG.md b/CHANGELOG.md
index df89b36..1b1911b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,3 +10,11 @@ YYYY-MM-DD  John Doe
 ```
 
 ---
+
+```txt
+2024-02-01  Tarun Mamidi
+
+* Uses OpenCRAVAT for annotations
+* Uses Neural Networks from keras instead of traditional scikit-learn models
+* Nextflow pipeline to annotate, parse and generate DITTO predictions
+```
diff --git a/README.md b/README.md
index 41b87f2..aab4140 100644
--- a/README.md
+++ b/README.md
@@ -14,12 +14,16 @@ Markdown](https://github.com/uab-cgds-worthey/DITTO/actions/workflows/linting.ym
 DITTO is an explainable neural network that can be helpful for accurate and rapid interpretation of small
 genetic variants for pathogenicity using patient’s genotype (VCF) information.
 
-## Usage
+## Using DITTO
+
+DITTO scores for variants can be obtained in the 3 ways below. The webapp and API are for single-variant analysis,
+and the local setup is for batch/bulk variant predictions.
 
 ### Webapp
 <!-- markdown-link-check-disable -->
 DITTO is available for public use at this [website](https://cgds-ditto.streamlit.app/).
 <!-- markdown-link-check-enable -->
+
 ### API
 
 DITTO is not hosted as a public API but one can serve up locally to query DITTO scores. Please follow the instructions
@@ -72,13 +76,13 @@ Please follow the steps mentioned in [install_openCravat.md](docs/install_openCr
 
 #### Run DITTO pipeline
 
-Create an environment via conda or pip. Below is an example to install `nextflow`.
+Create an environment via conda. Below is an example to install `nextflow`.
 
 - [Anaconda virtual environment](https://docs.anaconda.com/free/anaconda/install/index.html)
 
 ```sh
 # create environment. Needed only the first time. Please use the above link if you're not using Mac.
-conda create --name envi ditto-env
+conda create --name ditto-env
 
 conda activate ditto-env
 
@@ -86,11 +90,12 @@ conda activate ditto-env
 conda install bioconda::nextflow
 ```
 
-Please make a samplesheet with VCF files (incl. path). Please make sure to edit the directory paths as needed.
+Please make a samplesheet with VCF files (incl. path). Please make sure to edit the directory paths as needed and run
+the pipeline as shown below.
 
 ```sh
 nextflow run pipeline.nf \
-  --outdir /data/ \
+  --outdir ./data/ \
   -work-dir ./wor_dir \
   --build hg38 -with-report \
   --oc_modules /data/opencravat/modules \
@@ -114,6 +119,6 @@ For queries, please open a GitHub issue.
 For urgent queries, send an email with clear description to
 
 |Name | Email |
-------|--------|
-Tarun Mamidi | <tmamidi@uab.edu>
-Liz Worthey | <lworthey@uab.edu>
+|------|--------|
+|Tarun Mamidi | <tmamidi@uab.edu>|
+|Liz Worthey | <lworthey@uab.edu>|
diff --git a/model.job b/model.job
index 6860325..b25aefa 100644
--- a/model.job
+++ b/model.job
@@ -28,6 +28,6 @@ module load Anaconda3
   --outdir /data/results \
   -work-dir .work_dir/ \
   --build hg38 -c cheaha.config -with-report \
-  --sample_sheet .test_data/file_list -resume
+  --sample_sheet .test_data/file_list.txt -resume
 
 #https://training.nextflow.io/basic_training/cache_and_resume/#how-to-organize-in-silico-experiments
diff --git a/pipeline.nf b/pipeline.nf
index 7585314..76c6023 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -2,7 +2,7 @@
 nextflow.enable.dsl=2
 
 // Define the command-line options to specify the path to VCF files
-params.sample_sheet = '.test_data/file_list'
+params.sample_sheet = '.test_data/file_list.txt'
 params.build = "hg38"
 params.oc_modules = "/data/project/worthey_lab/projects/experimental_pipelines/tarun/opencravat/modules"
 // Define the Scratch directory
diff --git a/src/analysis/filter.sh b/src/analysis/filter.sh
index 262acbc..077e797 100644
--- a/src/analysis/filter.sh
+++ b/src/analysis/filter.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-
+set -euo pipefail
 # Filter the DITTO scores and other annotations after running the pipeline. Example tested on CAGI project
 
 # Specify the input folder containing the CSV files