# Mapping using bwa-mem

I considered using bwa-mem2 (https://github.com/bwa-mem2/bwa-mem2). It is much faster but needs much more memory; since speed is not a bottleneck here, it is not needed.

Use your preferred conda environment.  
**Packages needed**: bwa (provides `bwa mem`), samtools

In [31]:
import sys
import ipyparallel as ipp
import os
from os import environ
import gzip
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import glob
import re
import random

In [32]:
root = "/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO"

In [33]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO


In [34]:
fq_dir = '/data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/fastq'

In [35]:
pwd

'/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO'

## Assembly

with dDocent c=.92, k1=8, k2=5

In [36]:
assembly = os.path.join(root,"assembly/reference.fasta")

In [37]:
!samtools faidx $assembly

## Actual Mapping 
 

In [38]:
fastq_files = []
files = !find $fq_dir -name '*fq.gz'
files = [os.path.abspath(x) for x in files]
for x in files:
    fastq_files.append(x)
fastq_files = sorted(fastq_files)

In [39]:
len(fastq_files),fastq_files[0]

(608,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/fastq/EN_AH_1.F.fq.gz')

In [40]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO


In [41]:
# -k INT minimum seed length [19]
# -w INT band width for banded alignment [100]
# -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]
# -T INT minimum score to output [30]
# -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]

#@lview.remote()
def run_bwamem(args):
    """Build the shell commands that map one fastq file with bwa-mem and convert the output.

    Nothing is executed here; the returned strings are meant to be written into a job script.

    Parameters
    ----------
    args : tuple
        ``(assembly, fq, outdir)``: path to the reference fasta, path to the fastq
        file, and the directory that receives the SAM/BAM files.

    Returns
    -------
    tuple of (str, str)
        ``bwa_cmd``: the ``bwa mem`` call, redirecting its output to ``<outdir>/<ID>.sam``.
        ``s2b_cmd``: ``samtools view``, ``samtools sort`` and ``samtools index`` calls
        separated by blank lines, producing ``<outdir>/<ID>.bam`` and
        ``<outdir>/<ID>_sorted.bam``.
    """
    import os
    cpus = 1  # value passed to `samtools sort -@`
    assembly, fq, outdir = args
    # Sample ID = file name up to the '.F.' read marker. Taking the basename keeps
    # this independent of how deep the fastq directory sits in the file system.
    ID = os.path.basename(fq).split('.F.')[0]
    # Derive every output name from one prefix instead of str.replace on the full
    # path, which would also rewrite '.sam' / '.bam' inside directory names.
    prefix = os.path.join(outdir, ID)
    sam = prefix + ".sam"
    bam = prefix + ".bam"
    bam_sorted = prefix + "_sorted.bam"
    # Raw string: bwa expects the literal two-character sequence \t in the -R read-group header.
    bwa_cmd = r"bwa mem -k 20 -w 100 -r 1.3 -T 30 -R '@RG\tID:%s\tLB:%s\tSM:%s\tPL:ILLUMINA' %s %s > %s" % (ID,ID,ID,assembly,fq,sam)
    s2b_cmd = "samtools view -b %s -o %s\n\nsamtools sort -@ %s %s -o %s\n\nsamtools index %s\n\n" % (sam,bam,cpus,bam,bam_sorted,bam_sorted)
    return bwa_cmd, s2b_cmd

In [42]:
!mkdir SNPcall

mkdir: cannot create directory ‘SNPcall’: File exists


In [43]:
!mkdir SNPcall/bwa

mkdir: cannot create directory ‘SNPcall/bwa’: File exists


In [44]:
!mkdir SNPcall/bwa/shdir

mkdir: cannot create directory ‘SNPcall/bwa/shdir’: File exists


In [45]:
bwa_dir = os.path.join(root,"SNPcall/bwa/")
assert(bwa_dir)

In [46]:
### build the bwa-mem and sam-to-bam command strings for every fastq file
bwa_jobs = [run_bwamem((assembly, fq, bwa_dir)) for fq in fastq_files]
res_bwa = [job[0] for job in bwa_jobs]
res_s2b = [job[1] for job in bwa_jobs]

In [47]:
len(res_bwa),res_bwa[0]

(608,
 "bwa mem -k 20 -w 100 -r 1.3 -T 30 -R '@RG\\tID:EN_AH_1\\tLB:EN_AH_1\\tSM:EN_AH_1\\tPL:ILLUMINA' /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta /data/gpfs/assoc/denovo/tfaske/rabbit/full/demult/fastq/EN_AH_1.F.fq.gz > /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_1.sam")

In [48]:
len(res_s2b),res_s2b[0]

(608,
 'samtools view -b /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_1.sam -o /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_1.bam\n\nsamtools sort -@ 1 /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_1.bam -o /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_1_sorted.bam\n\nsamtools index /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_1_sorted.bam\n\n')

In [49]:
cd $bwa_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa


#### The cells below set the SLURM submission options and define a function that writes one SLURM script per fastq file

In [50]:
# Sample ID = file name up to the '.F.' read marker; basename avoids a hard-coded path depth.
fq_ID = [os.path.basename(fq).split('.F.')[0] for fq in fastq_files]

In [51]:
len(fq_ID), fq_ID[0]

(608, 'EN_AH_1')

In [52]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'bwa_ERNA'
time = '1-00:00:00' #time limit 1 day
cpus = 1
mem_cpu = 10000
email = 'tfaske@nevada.unr.edu'

In [53]:
def write_bwamem_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,fq_ID,bwa_cmds,s2b_cmds):
    """Write one SLURM batch script per sample into ``shdir/`` under the current directory.

    Each script is named ``shdir/run_bwamem_<ID>.sh`` and holds the ``#SBATCH`` header
    followed by the sample's bwa command and its samtools commands.

    Parameters
    ----------
    account, partition, time, email : str
        Values for the ``--account``, ``--partition``, ``--time`` and ``--mail-user`` options.
    jobname : str
        Accepted for interface compatibility but not used: the job name written to
        the script is ``<ID>_bwamem``.
    cpus, mem_cpu : int
        Values for ``--cpus-per-task`` and ``--mem-per-cpu`` (converted with ``int``).
    fq_ID : sequence of str
        Sample IDs, one per script.
    bwa_cmds, s2b_cmds : sequence of str
        Mapping and conversion commands, parallel to ``fq_ID``.

    Raises
    ------
    ValueError
        If ``fq_ID``, ``bwa_cmds`` and ``s2b_cmds`` differ in length. This is checked
        before anything is written, so a mismatch never leaves a partial set of scripts.
    """
    if not (len(fq_ID) == len(bwa_cmds) == len(s2b_cmds)):
        raise ValueError(
            "fq_ID, bwa_cmds and s2b_cmds must have the same length (got %d, %d, %d)"
            % (len(fq_ID), len(bwa_cmds), len(s2b_cmds))
        )
    # Create the script directory on demand so a fresh run does not depend on an earlier mkdir cell.
    os.makedirs("shdir", exist_ok=True)
    # NOTE(review): the --output path is relative, so it presumably resolves against
    # the directory the job is submitted from -- confirm against the submit step.
    template = (
        "#!/usr/bin/env bash\n"
        "#SBATCH --account=%s\n"
        "#SBATCH --partition=%s\n"
        "#SBATCH --time=%s\n"
        "#SBATCH --ntasks 1\n"
        "#SBATCH --cpus-per-task %d\n"
        "#SBATCH --mem-per-cpu=%d\n"
        "#SBATCH --job-name %s_bwamem\n"
        "#SBATCH --output bwa/shdir/output_bwamem_%s.txt\n"
        "#SBATCH --mail-type=FAIL\n"
        "#SBATCH --mail-user=%s \n\n\n"
        "    \n"
        "%s \n\n\n"
        "%s \n"
    )
    for sample, bwa_cmd, s2b_cmd in zip(fq_ID, bwa_cmds, s2b_cmds):
        with open("shdir/run_bwamem_%s.sh" % (sample), "w") as o:
            o.write(template % (account,partition,time,int(cpus),int(mem_cpu),sample,sample,email,bwa_cmd,s2b_cmd))

In [54]:
write_bwamem_ind_sh(account,partition,time,jobname,cpus,mem_cpu,email,fq_ID,res_bwa,res_s2b)

#### Find all bwa SLURM scripts and write a bash script that submits them with sbatch

In [55]:
shbwa_files = []
files = !find ./shdir -name '*.sh'
files = [os.path.abspath(x) for x in files]
for x in files:
        shbwa_files.append(x)
shbwa_files = sorted(shbwa_files)

In [56]:
len(shbwa_files),shbwa_files[0]

(608,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/shdir/run_bwamem_EN_AH_1.sh')

In [57]:
def write_bash_bwamem_sh(sh_files):
    """Write ``SNPcall/run_bash_bwa.sh``, a bash script with one ``sbatch`` line per entry of ``sh_files``.

    The path is relative to the current working directory, and the ``SNPcall``
    directory must already exist.
    """
    with open("SNPcall/run_bash_bwa.sh", "w") as launcher:
        launcher.write("#!/usr/bin/env bash \n\n")
        launcher.writelines("sbatch %s \n" % (script) for script in sh_files)

In [58]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO


In [59]:
write_bash_bwamem_sh(shbwa_files)

# Run run_bash_bwa.sh locally
    cd /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall
    source activate py36
    bash run_bash_bwa.sh
    
## Remove sam files and unsorted bams

       Run these steps manually in a terminal: move the sorted BAMs aside, remove the remaining SAM and unsorted BAM files, then move the sorted BAMs back.

# Calculate coverage from the BAM files (slow)

Also writes a file called "bam_coverage.csv" to the BAM folder (`bwa_dir`)

In [60]:
samtools = "samtools"

In [61]:
cd $bwa_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa


In [62]:
bam_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    bam_files.append(x)
bam_files = sorted(bam_files)

len(bam_files), len(fastq_files)

(608, 608)

In [63]:
len(bam_files), bam_files[0]

(608,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_10_sorted.bam')

In [65]:
bam_names = []
cov_list = []
for i in range(0,len(bam_files)):
    bam = bam_files[i]
    #print(bam)
    b = bam.split('/')[11] #set this 
    #print (b)
    !$samtools depth -a $b > $'cov.txt'
    cov = pd.read_csv('cov.txt', sep="\t",header=None)
    coverage = sum(cov.iloc[:,2])/len(cov.index)
    name = b.split('.F')[0]
    #print(name)
    bam_names.append(name)
    cov_list.append(coverage)
    print(coverage,name,i)
cov_df = pd.DataFrame(bam=bam_names,coverage=cov_list)
cov_df.head()

33.4799924581908 EN_AH_10_sorted.bam 0
31.248327703434796 EN_AH_11_sorted.bam 1
36.36597352866205 EN_AH_12_sorted.bam 2
34.191136760141525 EN_AH_13_sorted.bam 3
22.804208145103292 EN_AH_14_sorted.bam 4
28.562291769019577 EN_AH_15_sorted.bam 5
26.799940367505656 EN_AH_1_sorted.bam 6
27.76155006674202 EN_AH_2_sorted.bam 7
24.255642599921444 EN_AH_3_sorted.bam 8
26.395131855271522 EN_AH_4_sorted.bam 9
22.57259840011183 EN_AH_5_sorted.bam 10
38.23904014004204 EN_AH_6_sorted.bam 11
35.936595234391106 EN_AH_7_sorted.bam 12
33.12567852690226 EN_AH_8_sorted.bam 13
41.662919243376066 EN_AH_9_sorted.bam 14
42.39765857100225 EN_AR_2_sorted.bam 15
48.21704278396774 EN_AR_3_sorted.bam 16
31.460433825019116 EN_AR_4_sorted.bam 17
39.031409975459354 EN_AR_5_sorted.bam 18
38.32397484072061 EN_AR_6_sorted.bam 19
32.42031419086576 EN_AS_16_sorted.bam 20
30.81671032169083 EN_AS_17_sorted.bam 21
19.201975388934283 EN_AS_18_sorted.bam 22
35.831036381442146 EN_AS_19_sorted.bam 23
34.24495134220264 EN_AS_20_s

30.251329866273394 EN_EW_2_sorted.bam 199
25.957277286451134 EN_EW_3_sorted.bam 200
27.698711267474664 EN_EW_4_sorted.bam 201
29.439172308184077 EN_EW_5_sorted.bam 202
34.40160588853456 EN_EW_6_sorted.bam 203
22.591372029944125 EN_EW_7_sorted.bam 204
24.49227657391473 EN_EW_8_sorted.bam 205
35.07383204381866 EN_EW_9_sorted.bam 206
32.22218783146182 EN_FR_10_sorted.bam 207
29.21566376822173 EN_FR_11_sorted.bam 208
29.224281467053785 EN_FR_12_sorted.bam 209
26.51556384861324 EN_FR_13_sorted.bam 210
22.74245347373556 EN_FR_14_sorted.bam 211
28.190352707492416 EN_FR_15_sorted.bam 212
29.89832019123266 EN_FR_1_sorted.bam 213
33.32201465093356 EN_FR_2_sorted.bam 214
28.601015437751478 EN_FR_3_sorted.bam 215
22.169666263547334 EN_FR_4_sorted.bam 216
33.467829352485516 EN_FR_5_sorted.bam 217
30.237601697001043 EN_FR_6_sorted.bam 218
37.09181725852599 EN_FR_7_sorted.bam 219
32.85162599212972 EN_FR_8_sorted.bam 220
36.69213725909926 EN_FR_9_sorted.bam 221
37.46220072487513 EN_GB_16_sorted.bam 22

26.065631609234366 EN_NO_3_sorted.bam 396
19.663610668865047 EN_NO_4_sorted.bam 397
33.217558658689384 EN_NO_5_sorted.bam 398
21.909351977350912 EN_NT_1_sorted.bam 399
36.18274583255291 EN_NT_2_sorted.bam 400
28.344817939340192 EN_NT_3_sorted.bam 401
29.02504899749151 EN_NT_4_sorted.bam 402
37.18881008920318 EN_NT_5_sorted.bam 403
20.350236574785235 EN_NV_1_sorted.bam 404
20.00422135068441 EN_NV_2_sorted.bam 405
36.74594708373559 EN_NV_3_sorted.bam 406
17.13570824463191 EN_NV_4_sorted.bam 407
10.14602100080356 EN_NV_5_sorted.bam 408
33.31840281751638 EN_NV_6_sorted.bam 409
32.76603948083883 EN_OO_1_sorted.bam 410
36.781102843428876 EN_OO_2_sorted.bam 411
18.038845574134434 EN_OO_3_sorted.bam 412
37.4155784977542 EN_OO_4_sorted.bam 413
23.34474535132579 EN_OO_5_sorted.bam 414
24.231587002601316 EN_OO_6_sorted.bam 415
27.620238657090802 EN_OT_1_sorted.bam 416
18.55310477930792 EN_OT_2_sorted.bam 417
46.14884550209451 EN_OT_3_sorted.bam 418
10.19312144481839 EN_OT_4_sorted.bam 419
16.0223

27.038930189804063 EN_VM_7_sorted.bam 593
21.139383965217664 EN_VM_8_sorted.bam 594
28.275166313976694 EN_VM_9_sorted.bam 595
12.296250379522975 EN_WA_1_sorted.bam 596
4.943331198256089 EN_WA_2_sorted.bam 597
18.676913630289874 EN_WA_3_sorted.bam 598
2.002800879221823 EN_WA_4_sorted.bam 599
7.764350400736191 EN_WA_5_sorted.bam 600
14.574201521814029 EN_WA_6_sorted.bam 601
12.446266083374285 EN_YL_1_sorted.bam 602
7.586398902529729 EN_YL_2_sorted.bam 603
32.01855606478844 EN_YL_3_sorted.bam 604
33.81392000447143 EN_YL_4_sorted.bam 605
25.803814325478104 EN_YL_5_sorted.bam 606
23.708791837822357 EN_YL_6_sorted.bam 607


TypeError: DataFrame.__init__() got an unexpected keyword argument 'bam'

In [73]:
cov_dict = {"bam":bam_names,'coverage':cov_list}
cov_df = pd.DataFrame(cov_dict)
cov_df.head()

608


Unnamed: 0,bam,coverage
0,EN_AH_10_sorted.bam,33.479992
1,EN_AH_11_sorted.bam,31.248328
2,EN_AH_12_sorted.bam,36.365974
3,EN_AH_13_sorted.bam,34.191137
4,EN_AH_14_sorted.bam,22.804208


In [74]:
cov_out = os.path.join(bwa_dir,'bam_coverage.csv')
cov_df.to_csv(path_or_buf=cov_out)

In [75]:
cov_df.coverage.describe()


count    608.000000
mean      29.982901
std        7.270156
min        1.716934
25%       26.127237
50%       31.109119
75%       34.631151
max       59.362666
Name: coverage, dtype: float64

In [76]:
len(cov_df.coverage)

608

# Keep bam files over some specified amount of coverage

In [78]:
len(cov_df[cov_df.coverage > 9]),len(cov_df[cov_df.coverage >= 8]),len(cov_df[cov_df.coverage >= 5])

(600, 600, 604)

In [79]:
good_bam = cov_df.bam[cov_df.coverage >= 9]
good_bam = good_bam.tolist()
len(good_bam),good_bam[0]

(600, 'EN_AH_10_sorted.bam')

In [80]:
# Full path of every BAM that passed the coverage filter.
# os.path.join works whether or not bwa_dir ends with a path separator.
good_bam_files = [os.path.join(bwa_dir, bam_name) for bam_name in good_bam]
len(good_bam_files), good_bam_files[1]

(600,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_11_sorted.bam')

In [81]:
snp_dir = os.path.join(root,'SNPcall')

In [82]:
cd $snp_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall


In [83]:
!mkdir 'good_bams'

In [84]:
good_bam_dir = os.path.join(snp_dir,'good_bams')
assert good_bam_dir

#### Move files instead of copy to save space. DO NOT DELETE

In [89]:
for i in range(0,len(good_bam_files)):
    good_bam = str(good_bam_files[i])
    !mv $good_bam $good_bam_dir

In [90]:
cd $good_bam_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/good_bams


In [91]:
good_bam_cp_files = []
files = !find . -type f -name '*sorted.bam'
files = [os.path.abspath(x) for x in files if 'bam' in x]
for x in files:
    good_bam_cp_files.append(x)
good_bam_cp_files = sorted(good_bam_files)

In [92]:
len(good_bam_cp_files),good_bam_cp_files[0]

(600,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/SNPcall/bwa/EN_AH_10_sorted.bam')

### Continue on to 4snpcalling