# This is a brief tutorial on installing angsd-wrapper and running ngsF
    This uses jahner's bighorn data as an example
    The lines below should all be run on the command line on pronghorn
    This is set up to be run on pronghorn with slurm, but could be modified for other systems

# Create new conda environment

`conda create -n angsdWrap`  
`source activate angsdWrap`  
`conda install -c anaconda gcc`  
`conda install -c anaconda libgcc`  
`conda install -c bioconda samtools`  

# Install angsd-wrapper 
    https://github.com/ANGSD-wrapper/angsd-wrapper
    
    This should be done in a src directory in your preferred working directory
    for example: /data/gpfs/assoc/parchmanlab/tfaske/src

`cd /PATH/src`

`source activate angsdWrap` 
`git clone https://github.com/mojaveazure/angsd-wrapper.git`  
`cd angsd-wrapper/`  
`./angsd-wrapper setup dependencies` 
`source ~/.bash_profile`

*if you get a gsl error....*

try to install with conda  
`conda install -c conda-forge gsl`

if this does not work, point your ~/.bashrc at the gsl build from source in the parchmanlab association directory by adding the lines below

add gsl  
`export PATH="/data/gpfs/assoc/parchmanlab/src/gsl/bin:$PATH"`

add gsl LD lib path  
`export LD_LIBRARY_PATH="/data/gpfs/assoc/parchmanlab/src/gsl/lib:$LD_LIBRARY_PATH"  
export CPATH=/data/gpfs/assoc/parchmanlab/src/gsl/include/  
export LIBRARY_PATH=/data/gpfs/assoc/parchmanlab/src/gsl/lib/`

After gsl is added to conda or ~/.bashrc, log out, log back in, and repeat the installation steps

# Get necessary example files and additional shell scripts

`cd PATH/src/angsd-wrapper/  
source activate angsdWrap  
./angsd-wrapper setup data`

Get regions extractor shell script, explained later or on website

`cd PATH/src/angsd-wrapper/  
wget https://gist.githubusercontent.com/mojaveazure/d115bb25eeff3b2df9f9/raw/87d84bcd1a8e9f705d4b3b37639a50f0fd3e8e46/regionsExtracter.sh  
chmod 755 regionsExtracter.sh`

# Running ngsF
    tutorial through angsd-wrapper: https://github.com/mojaveazure/angsd-wrapper/wiki/Inbreeding-Coefficients
    OG: https://github.com/fgvieira/ngsF
    
    The code below can be run using jupyter notebooks
    
    Main directory should have 2 directories: 
    bam_files -- containing individual bamfiles
    assembly -- contains assembly
    
    Only a few things need to be changed in the scripts below:
    root_dir path, angsdWrap_dir, assembly files name, cpus, and various file names 
    

In [1]:
import sys
import ipyparallel as ipp
import os, time
import pandas as pd

In [2]:
# project root; the assembly/, angsd/ and SNPcall/ paths used below are all built from it -- change this for your own data
root_dir = '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO'

In [3]:
cd $root_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO


In [4]:
!mkdir angsd

In [5]:
# directories that sit directly under the project root
assembly_dir, angsd_dir = (os.path.join(root_dir, sub) for sub in ('assembly', 'angsd'))
# the ngsF working directory and the BAM copies both live inside angsd/
ngsF_dir, bam_dir = (os.path.join(angsd_dir, sub) for sub in ('ngsF', 'bam_files'))

In [6]:
# reference assembly FASTA; indexed with samtools faidx further down, and the same path appears as ANC_SEQ/REF_SEQ in the config file
assembly = os.path.join(assembly_dir,'reference.fasta')
assembly

'/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta'

In [7]:
cd $angsd_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd


In [8]:
!mkdir ngsF

In [9]:
!mkdir bam_files

## Create bam_list file from indv file and move to bam_files dir

In [10]:
# Individual IDs, one per line, read from the .012.indv file. The path is built
# from root_dir (like good_bams_dir) so the cell works from any working directory
# instead of relying on the kernel currently sitting in the angsd directory.
indv_file = os.path.join(root_dir,'SNPcall/filtering/good_snps.recode.vcf.gz.012.indv')
indv = pd.read_csv(indv_file,header=None,names=['All'])
indv.head()

Unnamed: 0,All
0,EN_AH_10
1,EN_AH_11
2,EN_AH_12
3,EN_AH_13
4,EN_AH_14


In [11]:
# directory the <ID>_sorted.bam files are copied from; the trailing slash matters because file names are appended to it by plain string concatenation
good_bams_dir = os.path.join(root_dir,'SNPcall/good_bams/')

In [12]:
for a in indv['All']:
    gb = good_bams_dir + a + '_sorted.bam*' 
    !cp $gb $bam_dir

#### make bam_files list and reindex

In [13]:
!find $bam_dir -name '*.bam' > ngsF/bam_list.txt

In [14]:
# full path of the BAM list file; SAMPLE_LIST in the config file points at this same path
bam_list = os.path.join(ngsF_dir,'bam_list.txt')

In [15]:
# make same length
print(len(indv['All']))
!wc -l $bam_list

586
586 /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/ngsF/bam_list.txt


# Both assembly and bam files need to be indexed

#### if needed:  
reindex assembly with samtools (creates .fai file )  
reindex bam files with samtools as well (creates .bai file )  

In [16]:
!samtools faidx $assembly

In [17]:
bam_files = !find $bam_dir -name '*sorted.bam'
len(bam_files),bam_files[0]

(586,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/bam_files/EN_CV_5_sorted.bam')

In [18]:
for bam in bam_files:
    !samtools index $bam

In [19]:
#checks and makes sure all were done
bam_index = !find $bam_dir -name '*.bam'
assert bam_files == bam_index

# Designates contigs / chromosomes / scaffolds of interest for estimation
    
    If you have a genome, maybe restrict analyses to a set number of largest scaffolds or only chromosomes
    Example file should look like
    scaffold_1:
    scaffold_2:
    scaffold_3: 
    etc……
    
    For denovo, pick an arbitrary number and run regionsExtracter.sh or run on all contigs. 
    Trimming down will decrease computation time and should not change estimates 

   # Example of how to run regionsExtracter.sh
   
   # DID NOT DO!!! if *de novo*. Only if reference genome. 
   
##./regionsExtracter.sh sample_info num_regions [out_directory] [out_name] [ref_gen]

In [14]:
# angsd-wrapper checkout that regionsExtracter.sh was downloaded into.
# NOTE(review): this differs from the angsd-wrapper path assigned to `angsdWrap`
# in the slurm section of this notebook -- confirm which install is current.
angsdWrap_dir = '/data/gpfs/home/tfaske/g/src/angsd-wrapper'
regionsExt = os.path.join(angsdWrap_dir,'./regionsExtracter.sh')

In [15]:
cd $ngsF_dir

/data/gpfs/assoc/denovo/tfaske/milkweed/angsd/ngsF


In [None]:
contigs = 10000 ## number of randomly sampled contigs
!$regionsExt $bam_list $contigs $ngsF_dir extractedRegions.txt $assembly

# Example extractedRegions.txt with selected chromosomes

In [40]:
# regions file in the ngsF directory (one "name:" entry per line, as shown by the head output below)
extRegions_file = os.path.join(ngsF_dir,'extractedRegions.txt')

In [43]:
!head $extRegions_file

scaffold_1:
scaffold_2:
scaffold_3:
scaffold_4:
scaffold_5:
scaffold_6:
scaffold_7:
scaffold_8:
scaffold_9:
scaffold_10:


In [None]:
!wc -l $extRegions_file

# Copy config file to ngsF dir and change settings
    confer with https://github.com/mojaveazure/angsd-wrapper/wiki/Inbreeding-Coefficients
    
    MAKE SURE N_CORES IS THE SAME AS REQUESTED IN SLURM

In [None]:
#!cp '/data/gpfs/assoc/denovo/src/angsd/Configuration_Files/Inbreeding_Coefficients_Config' $ngsF_dir

In [20]:
# example: path of the config file inside the ngsF directory (shown with cat in the next cell)
config_file = os.path.join(ngsF_dir,'Inbreeding_Coefficients_Config')

In [21]:
!cat $config_file

#!/bin/bash

set -e
set -u
set -o pipefail

#   A simple script to hold variables for the NGS_F
#   Are you using the Common_Config file?
#       If so, where is it?
COMMON=/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/ngsF

##############################################################################################
#   If we aren't using the Common_Config file, specify these variables
#   If Common_Config is specified, leave these blank
#   Define a list of samples
SAMPLE_LIST=/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/ngsF/bam_list.txt

#   Ancestral and Reference sequences
ANC_SEQ=/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta
REF_SEQ=/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta

#   Name the project
PROJECT=output

#   Where do we put the outfiles?
    #   Note, the final outdirectory will be
    #   ${SCRATCH}/${PROJECT}/ngsF
SCRATCH=/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO

# Run ngsF with slurm!

    the code below will make a slurm script for you
    
    I recommend running it first through the command line and then kill it just to check if it works! 
    
    cd PATH/ngsF
    source activate angsdWrap
    angsd-wrapper Inbreeding Inbreeding_Coefficients_Config

Things you will likely need to change:
root_dir, angsdWrap, cpus, email, account, part 

In [22]:
cd $ngsF_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/ngsF


In [23]:
# Settings that write_ngsF_sh interpolates into the slurm script.
angsdWrap = '/data/gpfs/assoc/parchmanlab/tfaske/src/angsd-wrapper/./angsd-wrapper' # MAKE SURE YOU USE THE FULL PATH (no symbolic links)
# --ntasks; NOTE(review): the generated script launches one command without srun -- confirm more than one task is needed
ntasks = 2
# --cpus-per-task; keep in sync with N_CORES in the config file
cpus = 32
# --time; NOTE(review): this name rebinds `time`, hiding the module imported in the first code cell
time = '14-00:00:00'
# --mem-per-cpu
mem_cpu = 4000
# --mail-user
email = 'tfaske@nevada.unr.edu'
# --account
account = 'cpu-s5-denovo-0'
# --partition
part = 'cpu-core-0'

In [24]:
def write_ngsF_sh(account,part,ntasks,cpus,time,mem_cpu,email,ngsF_dir,angsdWrap,config_file):
    """Write the slurm batch script ``run_ngsF.sh`` that runs angsd-wrapper Inbreeding.

    The script is written inside ``ngsF_dir`` rather than the kernel's current
    directory, so the result does not depend on a preceding ``cd`` cell, and
    its last line ends with a newline.

    Parameters
    ----------
    account, part : str
        Values for ``--account`` and ``--partition``.
    ntasks, cpus, mem_cpu : int
        Values for ``--ntasks``, ``--cpus-per-task`` and ``--mem-per-cpu``.
    time : str
        Value for ``--time``.
    email : str
        Value for ``--mail-user``.
    ngsF_dir : str
        Directory the script is written to and that the job changes into.
    angsdWrap : str
        Path of the angsd-wrapper executable.
    config_file : str
        Config file passed to ``angsd-wrapper Inbreeding``.
    """
    sbatch_header = """#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks %d
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name ngsF
#SBATCH --output output_ngsF.txt
#SBATCH --mail-type=END
#SBATCH --mail-user=%s

## change into the ngsF directory
cd %s \n\n""" % (account,part,time,ntasks,cpus,mem_cpu,email,ngsF_dir)

    # angsd-wrapper Inbreeding Inbreeding_Coefficients_Config
    run_command = "%s Inbreeding %s\n" % (angsdWrap,config_file)

    # anchor the output to ngsF_dir: that is where `sbatch run_ngsF.sh` is run from
    script_path = os.path.join(ngsF_dir, "run_ngsF.sh")
    with open(script_path, "w") as o:
        o.write(sbatch_header)
        o.write(run_command)

In [25]:
# generate run_ngsF.sh from the settings defined above
write_ngsF_sh(
    account=account,
    part=part,
    ntasks=ntasks,
    cpus=cpus,
    time=time,
    mem_cpu=mem_cpu,
    email=email,
    ngsF_dir=ngsF_dir,
    angsdWrap=angsdWrap,
    config_file=config_file,
)

# run in command line 
    cd PATH/ngsF
    source activate angsdWrap
    sbatch run_ngsF.sh