# Installing GATK + notes
    This page will be a reference for installation, different tutorials and how to call polyploids

  #### Get GATK jar and .yml files here:
  https://github.com/broadinstitute/gatk/releases
  
  wget .zip file to a src directory, unzip, run conda install below in that direcoty
  
  #### directions for conda install here:
  https://gatk.broadinstitute.org/hc/en-us/articles/360035889851--How-to-Install-and-use-Conda-for-GATK4
  
  #### vivaswat notes
  https://microcollaborative.atlassian.net/wiki/spaces/BL/pages/73957377/Bioinformatics+for+polyploids
  
  #### monnahan et al 2019   
  
  https://github.com/pmonnahan/ArenosaPloidy

  
  
  #### IDEA: 
  
  - call snps with GATK haplotype caller. 
  - run one per sample with correct ploidy into gvcf
  - then use GenotypeGVCF to call snps across all gvcfs, no need to tell ploidy.  
  

In [257]:
import sys

#sys.path.append('/home/faske/g/anaconda3/envs/py34/lib/python3.4/site-packages')
sys.path.append('/data/gpfs/assoc/parchmanlab/tfaske/anaconda3/envs/py36/lib/python3.6/site-packages')
sys.path.append("/data/gpfs/assoc/parchmanlab/tfaske/ipynb/include_utils")

import ipyparallel as ipp
import os, time
from os import environ
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
#import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE, call, check_output
import seaborn as sns
from IPython.display import FileLink
import urllib.request as urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
#import pysam
from collections import OrderedDict, namedtuple, Counter
import operator
import multiprocessing as mp
import shutil
import tempfile
#from ipyparallel import Client
import scandir
import glob
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import pickle
import re
from itertools import chain
import socket
import random
#import Levenshtein as lv

In [2]:
root = "/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo"

In [3]:
snp_dir = os.path.join(root,'SNPcall')

In [4]:
bam_dir = os.path.join(snp_dir,'good_bams')

In [5]:
cd $bam_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams


In [6]:
bam_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    bam_files.append(x)
bam_files = sorted(bam_files)

In [7]:
len(bam_files), bam_files[0]

(695,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_10_sorted.bam')

In [26]:
assembly = "/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/assembly/reference.fasta"

#assembly indexed with dDocent

In [75]:
!samtools index {assembly}

samtools index: "/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/assembly/reference.fasta" is in a format that cannot be usefully indexed


## SNP CALL with freebayes TEST

call from dDocent: freebayes -b split.1.bam -t mapped.1.bed -v raw.1.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10 2> fb.1.error.log

**settings to consider**:
  - -L --bam-list FILE A file containing a list of BAM files to be analyzed.
  - -A --cnv-map FILE
                   Read a copy number map from the BED file FILE, which has
                   the format:
                      reference sequence, start, end, sample name, copy number
        ex: 
        chrX -1 -1 NA12878 2  # means NA12878 has copy number 2 in all of chrX
        chrX -1 -1 NA12879 1  # ... copy number 1
        chrX -1 -1 NA13469 2
        chrX -1 -1 NA21328 2
        chrX -1 -1 NA12970 2
        chrY -1 -1 NA12877 1
        chrX -1 -1 NA12877 1
        chr20 20000 30000 NA12877 1   # NA12877 has one copy of chr20:20000..30000
        chr20 20000 30000 NA12879 3   # NA12879 has four copies of chr20:20000..30000

Do a test: move ~15 files to a test_dir


In [9]:
test_dir = os.path.join(snp_dir,'test')
testbwa_dir = os.path.join(test_dir,'good_bams')

In [10]:
cd $testbwa_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams


In [11]:
bam_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    bam_files.append(x)
bam_files = sorted(bam_files)

In [12]:
len(bam_files), bam_files[0]

(15,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_EW_36_sorted.bam')

In [13]:
testfb_dir = os.path.join(test_dir,'fb')

In [38]:
cd $test_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test


**reindex all bamfiles**

In [50]:
for b in bam_files:
    environ['b'] = b
    !samtools index $b

**make a bam list file**

In [53]:
def make_bam_list(bam_files):
    with open("bam_files.txt", "w") as o:
        for b in bam_files:
            #bam = b.split('/')[11] 
            #o.write("good_bams/%s \n" % (bam)) 
            o.write("%s \n" % (b)) 

In [54]:
make_bam_list(bam_files)

**make ploidy_list.txt**  
sample_name copy_number

In [84]:
## loop through *.fai write out contig -1 -1 ID ploidy
fai = pd.read_csv(assembly + '.fai',header=None,sep='\t')

In [93]:
contigs = fai.iloc[:,0].tolist()
len(contigs),contigs[:5]

(61970,
 ['dDocent_Contig_6',
  'dDocent_Contig_15',
  'dDocent_Contig_18',
  'dDocent_Contig_20',
  'dDocent_Contig_23'])

In [96]:
## loop through *.fai write out contig -1 -1 ID ploidy
def make_ploidy_list(bam_files):
    with open("ploidy_list.txt", "w") as o:
        for c in contigs:
            for b in bam_files:
                bam = b.split('/')[11]
                ID = bam.split('_sorted.bam')[0] ### This too
                ploidy = ID.split('_')[0]
                ploidy = re.sub(r'\D','',ploidy)
                #print(b)
                #print(ploidy)
                #o.write("good_bams/%s\t%s\n" % (bam,ploidy)) 
                o.write("%s\t-1\t-1\t%s\t%s\n" % (c,ID,ploidy))

In [97]:
make_ploidy_list(bam_files)

In [98]:
#ploidy list should be contigs*bam
len(contigs)*len(bam_files)

929550

In [99]:
!wc -l ploidy_list.txt

929550 ploidy_list.txt


**run freebayes_sh** 

In [57]:
### select options for slurm submission
#account = 'cpu-s5-denovo-0'
#partition = 'cpu-core-0'
account = 'cpu-s1-bionres-0'
partition = 'cpu-s1-bionres-0'
jobname = 'fb_test'
time = '1-00:00:00' #time limit 1 day
cpus = 1
mem_cpu = 80000
email = 'tfaske@nevada.unr.edu'

In [100]:
def write_fb_sh(account,partition,time,jobname,cpus,mem_cpu,email,assembly):
    with open("run_fb_test.sh", "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name fb_test
#SBATCH --output output_fb_test.txt
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=%s \n
    
freebayes -b good_bams/*.bam --cnv-map ploidy_list.txt -f %s -v fb_test.vcf -V\n
""" % (account,partition,time,int(cpus),int(mem_cpu),email,assembly))
            

In [101]:
write_fb_sh(account,partition,time,jobname,cpus,mem_cpu,email,assembly)

### Run run_fb_test_sh locally
    cd /data/gpfs/assoc/denovo/tfaske/sagebrush/SNPcall/test
    source activate py36
    bash run_fb_test.sh

## SNP CALL with freebayes FORREAL

call from dDocent: freebayes -b split.1.bam -t mapped.1.bed -v raw.1.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10 2> fb.1.error.log

**settings to consider**:
  - -L --bam-list FILE A file containing a list of BAM files to be analyzed.
  - -A --cnv-map FILE
                   Read a copy number map from the BED file FILE, which has
                   the format:
                      reference sequence, start, end, sample name, copy number
        ex: 
        chrX -1 -1 NA12878 2  # means NA12878 has copy number 2 in all of chrX
        chrX -1 -1 NA12879 1  # ... copy number 1
        chrX -1 -1 NA13469 2
        chrX -1 -1 NA21328 2
        chrX -1 -1 NA12970 2
        chrY -1 -1 NA12877 1
        chrX -1 -1 NA12877 1
        chr20 20000 30000 NA12877 1   # NA12877 has one copy of chr20:20000..30000
        chr20 20000 30000 NA12879 3   # NA12879 has four copies of chr20:20000..30000


In [102]:
bwa_dir = os.path.join(snp_dir,'good_bams')

In [103]:
cd $bwa_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams


In [104]:
bam_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    bam_files.append(x)
bam_files = sorted(bam_files)

In [105]:
len(bam_files), bam_files[0]

(695,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_10_sorted.bam')

In [106]:
cd $snp_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall


**reindex all bamfiles**

In [107]:
for b in bam_files:
    environ['b'] = b
    !samtools index $b

**make a bam list file**

In [108]:
def make_bam_list(bam_files):
    with open("bam_files.txt", "w") as o:
        for b in bam_files:
            #bam = b.split('/')[11] 
            #o.write("good_bams/%s \n" % (bam)) 
            o.write("%s \n" % (b)) 

In [109]:
make_bam_list(bam_files)

**make ploidy_list.txt**  
sample_name copy_number

In [110]:
## loop through *.fai write out contig -1 -1 ID ploidy
fai = pd.read_csv(assembly + '.fai',header=None,sep='\t')

In [111]:
contigs = fai.iloc[:,0].tolist()
len(contigs),contigs[:5]

(61970,
 ['dDocent_Contig_6',
  'dDocent_Contig_15',
  'dDocent_Contig_18',
  'dDocent_Contig_20',
  'dDocent_Contig_23'])

In [114]:
## loop through *.fai write out contig -1 -1 ID ploidy
def make_ploidy_list(bam_files):
    with open("ploidy_list.txt", "w") as o:
        for c in contigs:
            for b in bam_files:
                bam = b.split('/')[10]
                ID = bam.split('_sorted.bam')[0] ### This too
                ploidy = ID.split('_')[0]
                ploidy = re.sub(r'\D','',ploidy)
                #print(b)
                #print(ploidy)
                #o.write("good_bams/%s\t%s\n" % (bam,ploidy)) 
                o.write("%s\t-1\t-1\t%s\t%s\n" % (c,ID,ploidy))

In [115]:
make_ploidy_list(bam_files)

In [116]:
#ploidy list should be contigs*bam
len(contigs)*len(bam_files)

43069150

In [117]:
!wc -l ploidy_list.txt

43069150 ploidy_list.txt


**run freebayes_sh** 

In [135]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
#account = 'cpu-s1-bionres-0'
#partition = 'cpu-s1-bionres-0'
jobname = 'fb_full'
time = '3-00:00:00' #time limit 10 day
cpus = 1
mem_cpu = 100000
email = 'tfaske@nevada.unr.edu'

In [136]:
def write_fb_sh(account,partition,time,jobname,cpus,mem_cpu,email,assembly):
    with open("run_fb_full.sh", "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name fb_full
#SBATCH --output output_fb_full.txt
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=%s \n
    
freebayes -b good_bams/*.bam --cnv-map ploidy_list.txt -f %s -v fb_full.vcf -V -d -i -u \n
""" % (account,partition,time,int(cpus),int(mem_cpu),email,assembly))
            

In [137]:
write_fb_sh(account,partition,time,jobname,cpus,mem_cpu,email,assembly)

### Run run_fb_sh locally
    cd /data/gpfs/assoc/denovo/tfaske/sagebrush/SNPcall
    source activate py36
    bash run_fb_full.sh

## New strategy: Split bams and run different jobs

SNP calling keeps failing. Split bams into x jobs and run. Try with test first   

**Steps:**   
    - get mapped regions
    - extract mapped regions to reference
    - make bed file 
    - split bed file in X based off number of jobs. 
    - split bams based off bed files 
    - make slurm script for each split bed file 
    - run all files 

In [229]:
bam_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    bam_files.append(x)
bam_files = sorted(bam_files)

In [230]:
len(bam_files), bam_files[0]

(15,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_EW_36_sorted.bam')

In [238]:
def index_bam(bam):
    cmd = "samtools index %s" % (bam)
    !$cmd
    return bam

In [239]:
def get_mapped(bam):
    out = "%s_mapped.bam" % bam.split("_sorted.bam")[0]
    if not os.path.exists(out):                                       
        cmd = "samtools view -b -F 4 %s > %s" % (bam, out)
        res = !$cmd
    index_bam(out)
    return bam

In [240]:
for b in bam_files:
    get_mapped(b)

In [362]:
mapped_bam = []
files = !find . -type f -name '*mapped.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    mapped_bam.append(x)
mapped_bam = sorted(mapped_bam)

In [363]:
len(mapped_bam), mapped_bam[0]

(15,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_EW_36_mapped.bam')

**get contigs that mapped**

In [247]:
def get_contigs(bam):
    contigs = set()
    cmd = "samtools view %s" % bam
    sys.stderr.write("%s: %s\n" % (socket.gethostname(), cmd))
    p = Popen(cmd, stdout=PIPE, shell=True)
    for line in p.stdout:
        d = line.decode().split("\t")
        contigs.add(d[2])
    return contigs

In [248]:
contigs = []
for b in mapped_bam:
    contigs.append(get_contigs(b))

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_EW_36_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_HC_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_VM_5_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_YB_79_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT4_DS_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT4_SL_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT4_WT_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AV2_FP_6_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/go

In [249]:
contig_counts = Counter()
for contig in contigs:
    for c in contig: #j.r:
        contig_counts[c] += 1

In [251]:
!mkdir mapped_fasta

In [250]:
with open("mapped_fasta/all_contigs.txt", "w") as o:
    for c in contig_counts:
        o.write("{}\n".format(c))

**make new assembly with only mapped contigs**

In [359]:
cd $test_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test


In [252]:
assembly = "/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/assembly/reference.fasta"

In [253]:
!seqtk subseq $assembly mapped_fasta/all_contigs.txt > mapped_fasta/mapped.fasta

**make contig bed files**  
Issue with sorting. Randomize contig bed

In [147]:
!mkdir bed_files 

In [255]:
with open("bed_files/contigs.bed", "w") as o:
    for rec in SeqIO.parse('mapped_fasta/mapped.fasta', "fasta"):
        o.write("%s\t%d\t%d\n" % (rec.name, 0, len(rec)))

In [258]:
lines = open('bed_files/contigs.bed').readlines()
random.shuffle(lines)
open('bed_files/contigs.bed', 'w').writelines(lines)

In [259]:
def create_split_beds(nodes, bed):
    lines = 0
    for line in open(bed):
        lines += 1
    print(lines, lines//nodes)
    per_bed = lines//nodes
    cmd = "split -a 3 -d -l %d %s bed_files/contig.bed." % (per_bed, bed)
    !$cmd
    #call(cmd.split())
create_split_beds(63, "bed_files/contigs.bed") #nodes -1 

61843 981


In [360]:
beds = !ls bed_files/contig.bed.*
beds = [os.path.abspath(x) for x in beds]

In [361]:
len(beds),beds[0]

(64,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/bed_files/contig.bed.000')

In [266]:
def create_parallel_bams(args):
    bam_file, bed_file = args
    num = bed_file.split(".")[-1]
    out = "%s.%s" % (bam_file, num)
    t = tempfile.NamedTemporaryFile(delete=False, dir="/tmp")
    cmd = "samtools view -L %s -b %s -o %s" % (bed_file, bam_file, t.name)
    call(cmd.split())
    shutil.copy(t.name, out)
    os.remove(t.name)
    return out

In [365]:
args = []
for bam in mapped_bam:
    for bed in beds:
        a = [bam, bed]
        args.append(a)

In [366]:
len(args), args[0]

(960,
 ['/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/good_bams/AT2_EW_36_mapped.bam',
  '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/bed_files/contig.bed.000'])

In [364]:
len(mapped_bam), len(mapped_bam)*len(beds)

(15, 960)

In [274]:
for a in args:
    create_parallel_bams(a)

**ploidy_list based on bed files** 

In [165]:
!mkdir ploidy

In [374]:
def make_ploidy_list(beds,bam_files):
    for bed in beds: 
        be = bed.split('.')[2]
        fai = pd.read_csv(bed,header=None,sep='\t')
        contigs = fai.iloc[:,0].tolist()
        with open("ploidy/ploidy.%s.txt" % (be), "w") as o:
            for c in contigs:
                for b in bam_files:
                    bam = b.split('/')[11]
                    ID = bam.split('_mapped.bam')[0] ### This too
                    ploidy = ID.split('_')[0]
                    ploidy = re.sub(r'\D','',ploidy)
                    #print(b)
                    #print(ploidy)
                    #o.write("good_bams/%s\t%s\n" % (bam,ploidy)) 
                    o.write("%s 0 100 %s %s\n" % (c,ID,ploidy))

In [375]:
make_ploidy_list(beds,mapped_bam)

### make run_fb for each bed file

In [172]:
!mkdir vcf

In [181]:
!mkdir shdir

In [298]:
def run_fb(bed):
    cmd = 'freebayes -b good_bams/*mapped.bam.%s --cnv-map ploidy/ploidy.%s.txt -f mapped_fasta/mapped.fasta -t bed_files/contig.bed.%s -v vcf/fb.%s.vcf -V -i -u'  % (bed,bed,bed,bed)
    return  cmd 

In [309]:
#without bed targets file
def run_fb(bed):
    cmd = 'freebayes -b good_bams/*mapped.bam.%s --cnv-map ploidy/ploidy.%s.txt -f mapped_fasta/mapped.fasta -v vcf/fb.%s.vcf -V -i -u'  % (bed,bed,bed)
    return  cmd 

In [310]:
cmds = []
for bed in beds: 
    be = bed.split('.')[2]
    cmds.append(run_fb((be)))

In [311]:
len(cmds),cmds[0]

(64,
 'freebayes -b good_bams/*mapped.bam.000 --cnv-map ploidy/ploidy.000.txt -f mapped_fasta/mapped.fasta -v vcf/fb.000.vcf -V -i -u')

In [312]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
#account = 'cpu-s1-bionres-0'
#partition = 'cpu-s1-bionres-0'
jobname = 'fb'
time = '1-00:00:00' #time limit 10 day
cpus = 1
mem_cpu = 8000
email = 'tfaske@nevada.unr.edu'

In [313]:
def write_fb_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,out_dir,cmds,beds):
    for i in range(0,len(cmds)):
        bed = beds[i].split('.')[2]
        with open("shdir/run_fb_%s.sh" % (bed), "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name fb_%s
#SBATCH --output shdir/output_fb_%s.txt
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=%s \n
    
cd %s\n    
    
%s \n
""" % (account,partition,time,int(cpus),int(mem_cpu),bed,bed,email,out_dir,cmds[i]))
            

In [314]:
write_fb_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,test_dir,cmds,beds)

#### finds all fb slurm scripts and writes bash script to sbatch them

In [315]:
shfb_files = []
os.chdir('{}/{}'.format(test_dir, 'shdir'))
files = !find . -name '*.sh'
files = [os.path.abspath(x) for x in files]
for x in files:
        shfb_files.append(x)
shfb_files = sorted(shfb_files)

In [316]:
len(shfb_files),shfb_files[0]

(64,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/shdir/run_fb_000.sh')

In [317]:
def write_bash_fb_sh(sh_files):
    with open("run_bash_fb.sh", "w") as o:
        o.write("""#!/usr/bin/env bash \n\n""")
        for f in sh_files:
            o.write("sbatch %s \n" % (f))    

In [329]:
cd $test_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test


In [319]:
write_bash_fb_sh(shfb_files)

# Run run_bash_fb_sh locally
    cd /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test 
    source activate py36
    bash run_bash_fb.sh

**concatnate vcf files**

In [331]:
vcfs = !ls vcf/*vcf | grep -v tbi | grep -v sorted
vcfs = [os.path.abspath(x) for x in vcfs]

In [338]:
for vcf in vcfs:
    vcf_gz = "%s.gz" % vcf
    !bgzip -c $vcf > {vcf_gz}
    !tabix {vcf_gz}

In [342]:
vcfs_gz = !ls vcf/*vcf.gz | grep -v tbi | grep -v sorted
vcfs_gz = [os.path.abspath(x) for x in vcfs_gz]

In [343]:
len(vcfs_gz),vcfs_gz[0]

(64,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/vcf/fb.000.vcf.gz')

In [344]:
with open("vcf/concat2.sh", "w") as o:
    o.write("bcftools concat --threads 50 -Oz -o fb_cat.vcf.gz {}\n".format(" ".join(vcfs_gz)))

**concatenate vcf files**    
    bash concat2.sh > out_cat.txt  

**sort concat file**  
    vcf-sort fb_cat.vcf.gz > fb_sorted.vcf.gz

## New strategy: Split bams and run different jobs, for real

SNP calling keeps failing. Split bams into x jobs and run   

**Steps:**   
    - get mapped regions
    - extract mapped regions to reference
    - make bed file 
    - split bed file in X based off number of jobs. 
    - split bams based off bed files 
    - make slurm script for each split bed file 
    - run all files 

In [390]:
cd $bam_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams


In [391]:
bam_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    bam_files.append(x)
bam_files = sorted(bam_files)

In [392]:
len(bam_files), bam_files[0]

(695,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_10_sorted.bam')

In [324]:
def index_bam(bam):
    cmd = "samtools index %s" % (bam)
    !$cmd
    return bam

In [325]:
def get_mapped(bam):
    out = "%s_mapped.bam" % bam.split("_sorted.bam")[0]
    if not os.path.exists(out):                                       
        cmd = "samtools view -b -F 4 %s > %s" % (bam, out)
        res = !$cmd
    index_bam(out)
    return bam

In [326]:
for b in bam_files:
    get_mapped(b)

In [393]:
mapped_bam = []
files = !find . -type f -name '*mapped.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    mapped_bam.append(x)
mapped_bam = sorted(mapped_bam)

In [394]:
len(mapped_bam), mapped_bam[0]

(695,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_10_mapped.bam')

**get contigs that mapped**

In [346]:
def get_contigs(bam):
    contigs = set()
    cmd = "samtools view %s" % bam
    sys.stderr.write("%s: %s\n" % (socket.gethostname(), cmd))
    p = Popen(cmd, stdout=PIPE, shell=True)
    for line in p.stdout:
        d = line.decode().split("\t")
        contigs.add(d[2])
    return contigs

In [347]:
contigs = []
for b in mapped_bam:
    contigs.append(get_contigs(b))

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_10_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_11_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_12_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_13_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_14_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_15_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_16_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_17_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_1_mapped.bam
login-1: sa

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_46_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_47_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_48_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_49_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_50_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_51_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_52_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EC_53_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_EW_31_mapped.bam
login-1: s

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_3_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_4_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_5_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_6_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_7_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OC_8_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OT_11_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_OT_12_mapped.bam
login-1: samtools

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_DS_8_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_1_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_284_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_285_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_286_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_3_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_4_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT4_SL_5_mapped.bam
login-1: samt

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_161_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_162_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_163_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_164_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_165_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_166_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_1_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_CV_3_mapped.bam
login-1

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_3_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_4_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_5_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_6_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_7_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_8_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_PT_9_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_SN_125_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV2_SN_126_mapped.bam
login-1: samtoo

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LM_7_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LM_8_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LM_9_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LS_10_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LS_11_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LS_12_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LS_13_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LS_14_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AV4_LS_15_mapped.bam
login-1: samt

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_3_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_4_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_5_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_6_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_7_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_8_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_HO_9_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_JC_10_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_JC_1_mapped.bam
login-1: samtools 

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_OW_9_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PD_1_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PD_2_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PD_3_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PD_4_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PD_5_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PD_6_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PM_1_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_PM_2_mapped.bam
login-1: samtools v

login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_YV_4_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_YV_5_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_YV_6_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_YV_7_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_YV_8_mapped.bam
login-1: samtools view /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AW4_YV_9_mapped.bam


In [348]:
contig_counts = Counter()
for contig in contigs:
    for c in contig: #j.r:
        contig_counts[c] += 1

In [397]:
cd $snp_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall


In [350]:
!mkdir mapped_fasta

In [351]:
with open("mapped_fasta/all_contigs.txt", "w") as o:
    for c in contig_counts:
        o.write("{}\n".format(c))

**make new assembly with only mapped contigs**

In [376]:
assembly = "/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/assembly/reference.fasta"

In [377]:
!seqtk subseq $assembly mapped_fasta/all_contigs.txt > mapped_fasta/mapped.fasta

**make contig bed files**  
Issue with sorting. Randomize contig bed

In [398]:
!mkdir bed_files 

mkdir: cannot create directory ‘bed_files’: File exists


In [399]:
with open("bed_files/contigs.bed", "w") as o:
    for rec in SeqIO.parse('mapped_fasta/mapped.fasta', "fasta"):
        o.write("%s\t%d\t%d\n" % (rec.name, 0, len(rec)))

In [400]:
lines = open('bed_files/contigs.bed').readlines()
random.shuffle(lines)
open('bed_files/contigs.bed', 'w').writelines(lines)

In [401]:
def create_split_beds(nodes, bed):
    lines = 0
    for line in open(bed):
        lines += 1
    print(lines, lines//nodes)
    per_bed = lines//nodes
    cmd = "split -a 3 -d -l %d %s bed_files/contig.bed." % (per_bed, bed)
    !$cmd
    #call(cmd.split())
create_split_beds(63, "bed_files/contigs.bed") #nodes -1 

61970 983


In [402]:
beds = !ls bed_files/contig.bed.*
beds = [os.path.abspath(x) for x in beds]

In [403]:
len(beds),beds[0]

(64,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/bed_files/contig.bed.000')

In [404]:
def create_parallel_bams(args):
    bam_file, bed_file = args
    num = bed_file.split(".")[-1]
    out = "%s.%s" % (bam_file, num)
    t = tempfile.NamedTemporaryFile(delete=False, dir="/tmp")
    cmd = "samtools view -L %s -b %s -o %s" % (bed_file, bam_file, t.name)
    call(cmd.split())
    shutil.copy(t.name, out)
    os.remove(t.name)
    return out

In [405]:
args = []
for bam in mapped_bam:
    for bed in beds:
        a = [bam, bed]
        args.append(a)

In [406]:
len(args), args[0]

(44480,
 ['/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/good_bams/AT2_BB_10_mapped.bam',
  '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/bed_files/contig.bed.000'])

In [407]:
len(mapped_bam), len(mapped_bam)*len(beds)

(695, 44480)

In [None]:
for a in args:
    create_parallel_bams(a)

**ploidy_list based on bed files** 

In [165]:
!mkdir ploidy

In [275]:
def make_ploidy_list(beds,bam_files):
    for bed in beds: 
        be = bed.split('.')[2]
        fai = pd.read_csv(bed,header=None,sep='\t')
        contigs = fai.iloc[:,0].tolist()
        with open("ploidy/ploidy.%s.txt" % (be), "w") as o:
            for c in contigs:
                for b in bam_files:
                    bam = b.split('/')[11]
                    ID = bam.split('_sorted.bam')[0] ### This too
                    ploidy = ID.split('_')[0]
                    ploidy = re.sub(r'\D','',ploidy)
                    #print(b)
                    #print(ploidy)
                    #o.write("good_bams/%s\t%s\n" % (bam,ploidy)) 
                    o.write("%s 0 100 %s %s\n" % (c,ID,ploidy))

In [276]:
make_ploidy_list(beds,bam_files)

### make run_fb for each bed file

In [172]:
!mkdir vcf

In [181]:
!mkdir shdir

In [298]:
def run_fb(bed):
    cmd = 'freebayes -b good_bams/*mapped.bam.%s --cnv-map ploidy/ploidy.%s.txt -f mapped_fasta/mapped.fasta -t bed_files/contig.bed.%s -v vcf/fb.%s.vcf -V -i -u'  % (bed,bed,bed,bed)
    return  cmd 

In [309]:
#without bed targets file
def run_fb(bed):
    cmd = 'freebayes -b good_bams/*mapped.bam.%s --cnv-map ploidy/ploidy.%s.txt -f mapped_fasta/mapped.fasta -v vcf/fb.%s.vcf -V -i -u'  % (bed,bed,bed)
    return  cmd 

In [310]:
cmds = []
for bed in beds: 
    be = bed.split('.')[2]
    cmds.append(run_fb((be)))

In [311]:
len(cmds),cmds[0]

(64,
 'freebayes -b good_bams/*mapped.bam.000 --cnv-map ploidy/ploidy.000.txt -f mapped_fasta/mapped.fasta -v vcf/fb.000.vcf -V -i -u')

In [312]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
#account = 'cpu-s1-bionres-0'
#partition = 'cpu-s1-bionres-0'
jobname = 'fb'
time = '1-00:00:00' #time limit 10 day
cpus = 1
mem_cpu = 8000
email = 'tfaske@nevada.unr.edu'

In [313]:
def write_fb_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,out_dir,cmds,beds):
    for i in range(0,len(cmds)):
        bed = beds[i].split('.')[2]
        with open("shdir/run_fb_%s.sh" % (bed), "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name fb_%s
#SBATCH --output shdir/output_fb_%s.txt
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=%s \n
    
cd %s\n    
    
%s \n
""" % (account,partition,time,int(cpus),int(mem_cpu),bed,bed,email,out_dir,cmds[i]))
            

In [314]:
write_fb_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,test_dir,cmds,beds)

#### finds all fb slurm scripts and writes bash script to sbatch them

In [315]:
shfb_files = []
os.chdir('{}/{}'.format(test_dir, 'shdir'))
files = !find . -name '*.sh'
files = [os.path.abspath(x) for x in files]
for x in files:
        shfb_files.append(x)
shfb_files = sorted(shfb_files)

In [316]:
len(shfb_files),shfb_files[0]

(64,
 '/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test/shdir/run_fb_000.sh')

In [317]:
def write_bash_fb_sh(sh_files):
    with open("run_bash_fb.sh", "w") as o:
        o.write("""#!/usr/bin/env bash \n\n""")
        for f in sh_files:
            o.write("sbatch %s \n" % (f))    

In [318]:
cd $test_dir

/data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test


In [319]:
write_bash_fb_sh(shfb_files)

# Run run_bash_fb_sh locally
    cd /data/gpfs/assoc/denovo/tfaske/sagebrush/denovo/SNPcall/test 
    source activate py36
    bash run_bash_fb.sh