# Make a new VCF for phylogenetics

### To do

- Create BAM files for the ED outgroup
- Copy the sorted BAM files of only the individuals kept in Pop_ID
- Filter SNPs the same way as in the original file

In [5]:
import sys
import ipyparallel as ipp
import os
from os import environ
import gzip
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import glob
import re
import random

## Mapping of ED outgroup

In [1]:
# Project root; the assembly path and the phylo working directory are built from it.
root = "/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO"

In [2]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO


In [3]:
pwd

'/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO'

## Assembly

In [6]:
# Reference assembly FASTA: handed to run_bwamem for mapping and to
# write_bcftools_sh, which passes it to `bcftools mpileup -f`.
assembly = os.path.join(root,"assembly/reference.fasta")

## Actual Mapping 
 

In [7]:
# Locate the ED outgroup fastq files (path is relative to the current directory)
# and resolve each hit to an absolute path.
files = !find ../demult/outgroup_fq/ -name 'EN_ED*'
ED_fq_files = list(map(os.path.abspath, files))

In [8]:
len(ED_fq_files),ED_fq_files

(3,
 ['/data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/outgroup_fq/EN_ED_3.fastq.gz',
  '/data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/outgroup_fq/EN_ED_1.fastq.gz',
  '/data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/outgroup_fq/EN_ED_2.fastq.gz'])

In [9]:
# Working directory for the phylogenetics run; bam_files/ and vcf/ are created beneath it.
# NOTE(review): the saved outputs of the later `cd $phylo_dir` cells show .../full/phylo
# (without REDO), so those cells were presumably run with a different `root` -- confirm
# which location is intended before re-running from a fresh kernel.
phylo_dir = os.path.join(root,"phylo")

In [10]:
cd $phylo_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo


In [11]:
# -k INT minimum seed length [19]
# -w INT band width for banded alignment [100]
# -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]
# -T INT minimum score to output [30]
# -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]

#@lview.remote()
def run_bwamem(args):
    """Build the shell commands that map one fastq file and produce a sorted, indexed BAM.

    Nothing is executed here; the command strings are returned so they can be
    written into a job script.

    Parameters
    ----------
    args : tuple
        ``(assembly, fq, outdir)``: path to the reference fasta, path to the
        fastq file, and the directory that will receive the sam/bam files.

    Returns
    -------
    tuple of (str, str)
        ``bwa_cmd`` -- the ``bwa mem`` command line redirecting into ``<outdir>/<ID>.sam``.
        ``s2b_cmd`` -- ``samtools view``, ``samtools sort`` and ``samtools index``
        commands, separated by blank lines, producing ``<outdir>/<ID>_sorted.bam``.
    """
    # Imported inside the function so it is self-contained (presumably so it can
    # be shipped to a remote engine, see the commented-out decorator above).
    import os
    cpus = 1  # thread count passed to `samtools sort -@`
    assembly, fq, outdir = args
    # Sample ID = file name up to '.fastq.', taken from the basename so it does
    # not depend on how many directories deep the fastq file lives.
    ID = os.path.basename(fq).split('.fastq.')[0]
    # Derive every output path from one prefix instead of string-replacing the
    # extension, which would also rewrite '.sam'/'.bam' inside the directory part.
    prefix = os.path.join(outdir, ID)
    sam = prefix + ".sam"
    bam = prefix + ".bam"
    bam_sorted = prefix + "_sorted.bam"
    # Raw string: the literal backslash-t sequences must reach bwa unchanged in the -R header.
    bwa_cmd = r"bwa mem -k 20 -w 100 -r 1.3 -T 30 -R '@RG\tID:%s\tLB:%s\tSM:%s\tPL:ILLUMINA' %s %s > %s" % (ID,ID,ID,assembly,fq,sam)
    s2b_cmd =  "samtools view -b %s -o %s\n\nsamtools sort -@ %s %s -o %s\n\nsamtools index %s\n\n" % (sam,bam,cpus,bam,bam_sorted,bam_sorted)
    return  bwa_cmd,s2b_cmd

In [12]:
# -p: do not fail when the directory already exists, so the cell can be re-run
!mkdir -p bam_files

In [13]:
# -p: creates missing parents and does not fail if the directory already exists
!mkdir -p bam_files/shdir

In [14]:
bam_dir = os.path.join(phylo_dir,"bam_files")
# Asserting the string itself can never fail; check that the directory really exists.
assert os.path.isdir(bam_dir), "bam_files directory not found: %s" % bam_dir

In [15]:
### build the bwa-mem and sam-to-bam command strings for each fastq file
bwamem_results = [run_bwamem((assembly, fq, bam_dir)) for fq in ED_fq_files]
res_bwa = [bwa_part for bwa_part, _ in bwamem_results]
res_s2b = [s2b_part for _, s2b_part in bwamem_results]

In [16]:
len(res_bwa),res_bwa[0]

(3,
 "bwa mem -k 20 -w 100 -r 1.3 -T 30 -R '@RG\\tID:EN_ED_3\\tLB:EN_ED_3\\tSM:EN_ED_3\\tPL:ILLUMINA' /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta /data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/outgroup_fq/EN_ED_3.fastq.gz > /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files/EN_ED_3.sam")

In [17]:
len(res_s2b),res_s2b[0]

(3,
 'samtools view -b /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files/EN_ED_3.sam -o /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files/EN_ED_3.bam\n\nsamtools sort -@ 1 /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files/EN_ED_3.bam -o /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files/EN_ED_3_sorted.bam\n\nsamtools index /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files/EN_ED_3_sorted.bam\n\n')

In [18]:
cd $bam_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/bam_files


#### Select the options for Slurm submission and define a function that writes one Slurm script per fastq file

In [19]:
# Sample IDs = fastq file names up to '.fastq.'; taken from the basename so the
# result does not depend on how many directories deep the files live.
fq_ID = [os.path.basename(fq).split('.fastq.')[0] for fq in ED_fq_files]

In [21]:
len(fq_ID), fq_ID

(3, ['EN_ED_3', 'EN_ED_1', 'EN_ED_2'])

In [22]:
### select options for slurm submission
# These values are substituted into the #SBATCH header written by write_bwamem_ind_sh.
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
# NOTE(review): write_bwamem_ind_sh accepts jobname but its template names each
# job '<ID>_bwamem', so this value does not appear in the generated scripts.
jobname = 'bwa_ED'
time = '1-00:00:00' #time limit 1 day
cpus = 1  # --cpus-per-task
mem_cpu = 10000  # --mem-per-cpu; presumably megabytes (Slurm's default unit) -- confirm
email = 'tfaske@nevada.unr.edu'

In [23]:
def write_bwamem_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,fq_ID,bwa_cmds,s2b_cmds):
    """Write one Slurm batch script per sample to ``shdir/run_bwamem_<ID>.sh``.

    The path is relative to the current working directory, and ``shdir`` must
    already exist. Entry ``i`` of ``fq_ID``, ``bwa_cmds`` and ``s2b_cmds``
    together describe one sample; one script is written per entry of ``bwa_cmds``.

    Parameters
    ----------
    account, partition, time, email : str
        Values for the corresponding ``#SBATCH`` header lines.
    jobname : str
        Accepted but not used; each job is named ``<ID>_bwamem``.
    cpus, mem_cpu : int-like
        Converted with ``int()`` for ``--cpus-per-task`` and ``--mem-per-cpu``.
    fq_ID : sequence of str
        Sample IDs used in the script name, job name and output log name.
    bwa_cmds, s2b_cmds : sequence of str
        Mapping command and sam-to-bam commands placed in the script body.
    """
    # One entry per script line; the "\n" escapes inside the entries produce the
    # extra blank lines between the header, the bwa command and the samtools commands.
    template = "\n".join([
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s",
        "#SBATCH --partition=%s",
        "#SBATCH --time=%s",
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d",
        "#SBATCH --mem-per-cpu=%d",
        "#SBATCH --job-name %s_bwamem",
        "#SBATCH --output bam_files/shdir/output_bwamem_%s.txt",
        "#SBATCH --mail-type=FAIL",
        "#SBATCH --mail-user=%s \n\n",
        "    ",
        "%s \n\n",
        "%s \n",
    ])
    for idx in range(len(bwa_cmds)):
        script_path = "shdir/run_bwamem_%s.sh" % (fq_ID[idx])
        with open(script_path, "w") as script:
            fields = (account, partition, time, int(cpus), int(mem_cpu),
                      fq_ID[idx], fq_ID[idx], email, bwa_cmds[idx], s2b_cmds[idx])
            script.write(template % fields)

In [24]:
# Writes one run_bwamem_<ID>.sh per outgroup sample into ./shdir (the function uses a relative path).
write_bwamem_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,fq_ID,res_bwa,res_s2b)

### Copy the Pop_ID sorted BAMs to the bam_files dir

Also remove the unnecessary intermediate SAM and unsorted BAM files of the ED outgroup (no cell shown here does this).

In [39]:
cd $phylo_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [44]:
# Sample metadata, read relative to the current directory. The 'All' column holds the
# full sample IDs (e.g. EN_AH_10) used below to locate each individual's sorted BAM.
# NOTE(review): the saved output shows an 'Unnamed: 0' column, so the CSV presumably
# stores its row index as the first column -- confirm.
Pop_ID = pd.read_csv('Pop_ID_Sum.csv')
print(Pop_ID.shape)
Pop_ID.head()

(585, 10)


Unnamed: 0,Pop,ID,All,Name,Lat,Long,State,Elevation,Ssp,Variety
0,AH,10,EN_AH_10,Austin Hwy,39.600807,-117.159999,NV,1754,consilimis,oreophila
1,AH,11,EN_AH_11,Austin Hwy,39.600807,-117.159999,NV,1754,consilimis,oreophila
2,AH,12,EN_AH_12,Austin Hwy,39.600807,-117.159999,NV,1754,consilimis,oreophila
3,AH,13,EN_AH_13,Austin Hwy,39.600807,-117.159999,NV,1754,consilimis,oreophila
4,AH,14,EN_AH_14,Austin Hwy,39.600807,-117.159999,NV,1754,consilimis,oreophila


In [45]:
# Copy the sorted BAM of every individual listed in Pop_ID into bam_dir.
for sample in Pop_ID['All']:
    bam_cp = '../SNPcall/good_bams/{}_sorted.bam'.format(sample)
    !cp $bam_cp $bam_dir

In [46]:
!ls -l bam_files/*bam | wc -l 

588


# Variant calling using bcftools   

use preferred conda env  
**Packages needed**: bcftools

### Collect the sorted BAM files

In [47]:
cd $phylo_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [48]:
# Absolute paths of all sorted BAMs under bam_files/, in sorted order.
files = !find bam_files/ -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
bam_files = sorted(files)

In [49]:
len(bam_files), bam_files[0]

(588,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/bam_files/EN_AH_10_sorted.bam')

# Call snps

    

In [50]:
# -p: do not fail when the directory already exists, so the cell can be re-run
!mkdir -p vcf

In [52]:
vcf_dir = os.path.join(phylo_dir,"vcf")
# Asserting the string itself can never fail; check that the directory really exists.
assert os.path.isdir(vcf_dir), "vcf directory not found: %s" % vcf_dir

In [53]:
cd $vcf_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/vcf


#### make a bam_list

In [54]:
# One BAM path per line; this file is what the bcftools script reads via `-b bam_list.txt`.
with open('bam_list.txt', "w") as list_handle:
    list_handle.writelines("%s\n" % (bam_path) for bam_path in bam_files)

In [62]:
### select options for slurm submission
# These values are substituted into the #SBATCH header written by write_bcftools_sh.
#account = 'cpu-s1-bionres-0'
#partition = 'cpu-s1-bionres-0'
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
# NOTE(review): jobname is not passed to write_bcftools_sh; its template hard-codes
# the same job name.
jobname = 'rabbit_bcftools'
time = '8-00:00:00' #time limit 8 days
cpus = 1
mem_cpu = 100000
email = 'tfaske@nevada.unr.edu'

In [63]:
def write_bcftools_sh(account,partition,time,cpus,mem_cpu,email,assembly):
    """Write ``run_bcftools.sh`` into the current working directory.

    The script has a Slurm ``#SBATCH`` header followed by a single pipeline:
    ``bcftools mpileup`` over the BAMs listed in ``bam_list.txt``, piped into
    ``bcftools call``, which writes ``rabbit_phylo.vcf.gz``.

    Parameters
    ----------
    account, partition, time, email : str
        Values for the corresponding ``#SBATCH`` header lines.
    cpus, mem_cpu : int-like
        Converted with ``int()`` for ``--cpus-per-task`` and ``--mem-per-cpu``.
    assembly : str
        Reference fasta passed to ``bcftools mpileup -f``.
    """
    # One entry per script line; the "\n" escapes inside the entries add the
    # blank lines after the header and the trailing newline after the command.
    template = "\n".join([
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s",
        "#SBATCH --partition=%s",
        "#SBATCH --time=%s",
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d",
        "#SBATCH --mem-per-cpu=%d",
        "#SBATCH --job-name rabbit_bcftools",
        "#SBATCH --output output_bcftools.txt",
        "#SBATCH --mail-type=FAIL,END",
        "#SBATCH --mail-user=%s \n\n",
        "    ",
        "bcftools mpileup -a DP,AD,INFO/AD -C 50 -d 250 -f %s -q 30 -Q 20 -I -b bam_list.txt | bcftools call -v -m -f GQ -O z -o rabbit_phylo.vcf.gz",
        " \n",
    ])
    with open("run_bcftools.sh" , "w") as script:
        fields = (account, partition, time, int(cpus), int(mem_cpu), email, assembly)
        script.write(template % fields)

In [64]:
# Writes run_bcftools.sh into the current working directory (the function uses a relative path).
write_bcftools_sh(account,partition,time,cpus,mem_cpu,email,assembly)

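# Run run_bcftools.sh locally

When the script is run with `bash`, its `#SBATCH` lines are plain comments and no Slurm limits apply; use `sbatch run_bcftools.sh` instead to submit it through Slurm.

    cd /data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/vcf
    source activate py36
    bash run_bcftools.sh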