## ANGSD

http://www.popgen.dk/angsd/index.php/ANGSD

See **angsdWrap_ngsF** for installation

#### This script will get you nucleotide diversity (pi and wattersons) and Tajimas D for each locus within each population

In [1]:
import sys
import ipyparallel as ipp
import os, time
import pandas as pd

In [2]:
root = '/data/gpfs/assoc/denovo/PHHA/'

In [3]:
cd $root

/data/gpfs/assoc/denovo/PHHA


In [4]:
!mkdir angsd

mkdir: cannot create directory ‘angsd’: File exists


In [5]:
analysis_dir = os.path.join(root,'angsd')

In [6]:
bam_dir = os.path.join(analysis_dir,'bam_files')

In [42]:
cd $analysis_dir

/data/gpfs/assoc/denovo/PHHA/angsd


In [8]:
!mkdir diversity

mkdir: cannot create directory ‘diversity’: File exists


## Create bam_list file from indv file and move to bam_files dir

### If ran ngsF first, no need to do this

In [44]:
indv = pd.read_csv("../SNPcall_tf/filtering/good_snps.recode.vcf.gz.012.indv",header=None,names=['All'])
indv.head()

Unnamed: 0,All
0,PH_AS_10
1,PH_AS_11
2,PH_AS_12
3,PH_AS_1
4,PH_AS_2


In [45]:
good_bams_dir = os.path.join(root_dir,'SNPcall_tf/good_bams/')

In [46]:
for a in indv['All']:
    gb = good_bams_dir + a + '_sorted.bam*' 
    !cp $gb $bam_dir

### Both assembly and bam files need to be indexed

#### if needed:  
reindex assembly with samtools (creates .fai file )  
reindex bam files with samtools as well (creates .bai file )  

In [17]:
assembly = os.path.join(root,'assembly/reference.fasta')
assembly

'/data/gpfs/assoc/denovo/PHHA/assembly/reference.fasta'

In [None]:
!samtools faidx $assembly

In [50]:
bam_files = !find $bam_dir -name '*sorted.bam'
len(bam_files),bam_files[0]

(245, '/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/PH_SH_2_sorted.bam')

In [51]:
for bam in bam_files:
    !samtools index $bam

In [52]:
#checks and makes sure all were done
bam_index = !find $bam_dir -name '*.bam'
assert bam_files == bam_index

## Create a bam_file for each population   

In [9]:
indv = pd.read_csv("../SNPcall_tf/filtering/good_snps.recode.vcf.gz.012.indv",header=None,names=['All'])
indv.head()

Unnamed: 0,All
0,PH_AS_10
1,PH_AS_11
2,PH_AS_12
3,PH_AS_1
4,PH_AS_2


In [13]:
indv['Pop'] = [str(x).split('_')[1] for x in indv['All']]
indv['ID'] = [str(x).split('_')[2] for x in indv['All']]
indv.head()

Unnamed: 0,All,Pop,ID
0,PH_AS_10,AS,10
1,PH_AS_11,AS,11
2,PH_AS_12,AS,12
3,PH_AS_1,AS,1
4,PH_AS_2,AS,2


In [14]:
Bam = []
for i in range(0,len(indv)):
    name = [bam_dir,"/",indv.All.loc[i],"_sorted.bam"]
    #print(name)
    Bam.append("".join(name))
indv['Bam'] = Bam
indv.head()
#print(len(Bam),len(Pop_ID_df))

Unnamed: 0,All,Pop,ID,Bam
0,PH_AS_10,AS,10,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
1,PH_AS_11,AS,11,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
2,PH_AS_12,AS,12,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
3,PH_AS_1,AS,1,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
4,PH_AS_2,AS,2,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...


In [43]:
cd $analysis_dir

/data/gpfs/assoc/denovo/PHHA/angsd


In [51]:
#make bamlist dir
div_dir = os.path.join(analysis_dir,'diversity')
bamlist_dir = os.path.join(div_dir,'bamlist/')

In [52]:
cd $div_dir

/data/gpfs/assoc/denovo/PHHA/angsd/diversity


In [53]:
!mkdir bamlist

mkdir: cannot create directory ‘bamlist’: File exists


In [54]:
pops = indv.Pop.unique()
for i in range(0,len(pops)):
    pop = str(pops[i])
    names = indv[(indv.Pop == pop)].Bam
    #print(names)
    pop_file = bamlist_dir + pop + "_bam_name.txt"
    #print(pop_file)
    names.to_csv(path_or_buf=pop_file,sep='\n',header=False,index=False)

# Angsd commands

See tutorial for steps: http://www.popgen.dk/angsd/index.php/ANGSD

### Create a bash file for each population that does every step

#### Steps:   
- doSaf  
- realSFS  
- doTheta  
- doStat  
- thetaOut

In [35]:
cd $div_dir

/data/gpfs/assoc/denovo/PHHA/angsd/diversity


In [40]:
#create output dir and shdir 
!mkdir output
!mkdir shdir

mkdir: cannot create directory ‘output’: File exists


In [37]:
### Needed for angsd_commands
pops = indv['Pop'].unique()
#len(pops)
assembly = os.path.join(root,'assembly/reference.fasta')
cpus = 8

# for do_stat: window and step. 1 and 1 for denovo
win = 1 
step = 1

In [81]:
doSaf_cmd = []
realSFS_cmd = []
doTheta_cmd = []
doStat_cmd = []
thetaOut_cmd = []
for i in range(0,len(pops)):
    pop = str(pops[i])
    bam_file = os.path.join(bamlist_dir,''.join([pop,'_bam_name.txt']))
    doSaf_cmd.append("angsd -bam %s -doSaf 1 -anc %s -GL 1 -P %s -out output/%s" % (bam_file,assembly,cpus,pop))
    realSFS_cmd.append("realSFS output/%s.saf.idx -P %s > output/%s.sfs" % (pop,cpus,pop))
    doTheta_cmd.append("angsd -bam %s -out output/%s -doThetas 1 -doSaf 1 -pest output/%s.sfs -anc %s -GL 1" % (bam_file,pop,pop,assembly))
    doStat_cmd.append("thetaStat do_stat output/%s.thetas.idx -win %d -step %d" % (pop,win,step))
    thetaOut_cmd.append("thetaStat print output/%s.thetas.idx > output/%s.theta_out" % (pop,pop))



In [83]:
len(doSaf_cmd),len(realSFS_cmd),len(doTheta_cmd),len(doStat_cmd),len(thetaOut_cmd),doTheta_cmd[0]

(26,
 26,
 26,
 26,
 26,
 'angsd -bam /data/gpfs/assoc/denovo/PHHA/angsd/diversity/bamlist/AS_bam_name.txt -out output/AS -doThetas 1 -doSaf 1 -pest output/AS.sfs -anc /data/gpfs/assoc/denovo/PHHA/assembly/reference.fasta -GL 1')

In [91]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
time = '6-00:00:00' #time limit 1 day
cpus = 8 #change to match above
mem_cpu = 5000
email = 'tfaske@nevada.unr.edu'

In [92]:
def write_angsd_sh(account,partition,time,cpus,mem_cpu,email,div_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd):
    for i in range(0,len(doSaf_cmd)):
        with open("shdir/run_angsd_%s.sh" % (pops[i]), "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name %s_angsd
#SBATCH --output shdir/output_angsd_%s.txt
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=%s 

cd %s

#### Running angsd ####
%s \n

#### Running realSFS ####
%s \n

#### Running doTheta ####
%s \n

#### Running doStat ####
%s \n

#### Running thetaOut ####
%s \n""" % (account,partition,time,int(cpus),int(mem_cpu),pops[i],pops[i],email,div_dir,doSaf_cmd[i],realSFS_cmd[i],doTheta_cmd[i],doStat_cmd[i],thetaOut_cmd[i]))
            

In [93]:
write_angsd_sh(account,partition,time,cpus,mem_cpu,email,div_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd)


#### finds all bwa slurm scripts and writes bash script to sbatch them

In [94]:
files = !find ./shdir -name '*.sh'
sh_files = sorted([os.path.abspath(x) for x in files])

In [95]:
len(sh_files),sh_files[0]

(52, '/data/gpfs/assoc/denovo/PHHA/angsd/diversity/shdir/run_angsd_AS.sh')

In [96]:
def write_bash_angsd_sh(sh_files):
    with open("run_bash_angsd.sh", "w") as o:
        o.write("""#!/usr/bin/env bash \n\n""")
        for f in sh_files:
            o.write("sbatch %s \n" % (f))    

In [97]:
cd $div_dir

/data/gpfs/assoc/denovo/PHHA/angsd/diversity


In [98]:
write_bash_angsd_sh(sh_files)

## Run run_bash_angsd_sh locally  
cd /data/gpfs/assoc/denovo/PHHA/angsd/diversity   
source activate angsdWrap  
bash run_bash_angsd.sh  

## Extract diversity data with R

Files are too big to move to local computer. Summarize here

Uses functions: `angsd_summary` and `angsd_tajD` in Imports.R

In [99]:
%load_ext rpy2.ipython

In [100]:
%%R
library(tidyverse)

source('/data/gpfs/assoc/denovo/src/R/Imports.R')

setwd('/data/gpfs/assoc/denovo/PHHA/angsd/diversity')

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [101]:
%%R

theta_sum_df <- angsd_summary(path='/data/gpfs/assoc/denovo/PHHA/angsd/diversity/output',files.pattern = '*.theta_out')

#pi
t(theta_sum_df[,1:2])

#watterson
t(theta_sum_df[,c(1,5)])

R[write to console]: Loading required package: data.table

R[write to console]: data.table 1.14.6 using 32 threads (see ?getDTthreads).  Latest news: r-datatable.com

R[write to console]: 
Attaching package: ‘data.table’


R[write to console]: The following objects are masked from ‘package:dplyr’:

    between, first, last


R[write to console]: The following object is masked from ‘package:purrr’:

    transpose


R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console

R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[

R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]

R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: |

R[write to console]: |--------------------------

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "AS"         "BA"         "BB"         "BC"         "CA"        
W   "0.01868944" "0.01913881" "0.02047853" "0.01491302" "0.01733835"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "CR"         "DA"         "FC"         "FV"         "GB"        
W   "0.02046611" "0.01007772" "0.01831547" "0.02078617" "0.01659286"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "KC"         "MC"         "ML"         "OF"         "PC"        
W   "0.01849977" "0.01650937" "0.01751851" "0.01802506" "0.02135223"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "PM"         "PR"         "PT"         "SB"         "SC"        
W   "0.01322099" "0.01504187" "0.01472543" "0.01863721" "0.01608657"
    [,21]        [,22]        [,23]        [,24]        [,25]       
Pop "SH"         "SL"         "TB"         "WC"         "WF"        
W   "0.01557181" "0.01210098" "0.0

In [103]:
%%R 

#pi
print(t(theta_sum_df[c('Pop','Pi')]))

#watterson
print(t(theta_sum_df[c('Pop','W')]))

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "AS"         "BA"         "BB"         "BC"         "CA"        
Pi  "0.01425889" "0.01544290" "0.01485795" "0.01372881" "0.01415734"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "CR"         "DA"         "FC"         "FV"         "GB"        
Pi  "0.01572638" "0.01011757" "0.01453581" "0.01473799" "0.01428040"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "KC"         "MC"         "ML"         "OF"         "PC"        
Pi  "0.01499822" "0.01381682" "0.01451769" "0.01488879" "0.01511194"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "PM"         "PR"         "PT"         "SB"         "SC"        
Pi  "0.01165877" "0.01349614" "0.01407471" "0.01486648" "0.01387172"
    [,21]        [,22]        [,23]        [,24]        [,25]       
Pop "SH"         "SL"         "TB"         "WC"         "WF"        
Pi  "0.01398081" "0.01185571" "0.0

In [104]:
%%R

tajD_df <- angsd_tajD(path='/data/gpfs/assoc/denovo/PHHA/angsd/diversity/output',files.pattern = '*pestPG')

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: |

R[write to con

R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to con

R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ==
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to conso

R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: |

R[write to console]: |----------------------------------

R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: =
R[write to console]: ==
R[write to console]: =
R[write to console]: =
R[write to conso

In [105]:
%%R 

head(tajD_df)

  Pop        TajD    TajD_low   TajD_high
1  AS -0.10762626 -0.10777177 -0.10748074
2  BA -0.10530514 -0.10544294 -0.10516735
3  BB -0.12731128 -0.12746031 -0.12716226
4  BC -0.07341377 -0.07353859 -0.07328896
5  CA -0.10586667 -0.10599833 -0.10573502
6  CR -0.11023757 -0.11039135 -0.11008379
