## ANGSD

http://www.popgen.dk/angsd/index.php/ANGSD

See **angsdWrap_ngsF** for installation

#### This notebook computes nucleotide diversity (π and Watterson's θ) and Tajima's D for each locus within each population.

#### Use Pop_ID_Sum to split the samples into three groups: green, grey, and hybrid. Only keep populations with more than 2 individuals (at least 3).

In [1]:
import sys
import ipyparallel as ipp
import os, time
import pandas as pd

In [2]:
root = '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/'

In [3]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO


In [4]:
!mkdir angsd

mkdir: cannot create directory ‘angsd’: File exists


In [5]:
analysis_dir = os.path.join(root,'angsd')

In [6]:
bam_dir = os.path.join(analysis_dir,'bam_files')

cd $analysis_dir

In [8]:
!mkdir diversity

In [32]:
div_dir = os.path.join(analysis_dir,'diversity')

In [10]:
!mkdir diversity/green
!mkdir diversity/grey
!mkdir diversity/hybrid

## Create a bam_file for each Pop_Lin combo, n > 2 (at least 3 per pop)

In [13]:
%load_ext rpy2.ipython

In [52]:
%%R
library(tidyverse)

# Work from the diversity directory so the relative paths used in the R cells resolve there.
setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity')

# Per-individual table; the cells below read its Pop, Lin and All columns.
Pop_ID_Sum <- read.csv('Pop_ID_Sum.csv')

### add bam file name
# Path is built as <bam_files dir>/<All>_sorted.bam
# (presumably `All` holds the sample ID used in the BAM file names -- confirm against Pop_ID_Sum.csv)
Pop_ID_Sum$bam <- paste0('/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/bam_files/',Pop_ID_Sum$All,'_sorted.bam')


In [53]:
%%R
### split into green
# Keep only the individuals assigned to the green lineage.
Pop_ID_green <- Pop_ID_Sum[which(Pop_ID_Sum$Lin == 'green'),]
# Number of individuals per population within this lineage.
pop_green <- Pop_ID_green %>%
                group_by(Pop) %>%
                summarize(n=n())
# Populations with more than 2 individuals (at least 3).
pop_green3 <- as.character(pop_green$Pop[which(pop_green$n > 2)])
Pop_green_df <- Pop_ID_green[which(Pop_ID_green$Pop %in% pop_green3),]
print('green')
print(length(pop_green3))
print(nrow(Pop_green_df))

### green Pop_bam_out
# Write one BAM list per population (green/<Pop>_bam_name.txt, one path per line).
for (p in pop_green3){
    # Match on the Pop column only. Comparing the whole data frame to p
    # (Pop_green_df==p) yields a logical matrix, and which() on it returns
    # matrix positions rather than row numbers, so the selection was only
    # right when Pop was the first column and no other column contained p.
    bams <- as.character(Pop_green_df$bam[which(Pop_green_df$Pop == p)])
    df <- data.frame(bam=bams)
    # Kept as a guard against NA entries.
    df <- na.omit(df)
    outfile <- paste0('green/',p,'_bam_name.txt')
    write.table(df,file=outfile,col.names=FALSE,row.names=FALSE,quote=FALSE)
}

[1] "green"
[1] 31
[1] 179


In [54]:
%%R

### split into grey
# Keep only the individuals assigned to the grey lineage.
Pop_ID_grey <- Pop_ID_Sum[which(Pop_ID_Sum$Lin == 'grey'),]
# Number of individuals per population within this lineage.
pop_grey <- Pop_ID_grey %>%
                group_by(Pop) %>%
                summarize(n=n())
# Populations with more than 2 individuals (at least 3).
pop_grey3 <- as.character(pop_grey$Pop[which(pop_grey$n > 2)])
Pop_grey_df <- Pop_ID_grey[which(Pop_ID_grey$Pop %in% pop_grey3),]
print('grey')
print(length(pop_grey3))
print(nrow(Pop_grey_df))

### grey Pop_bam_out
# Write one BAM list per population (grey/<Pop>_bam_name.txt, one path per line).
for (p in pop_grey3){
    # Match on the Pop column only. Comparing the whole data frame to p
    # (Pop_grey_df==p) yields a logical matrix, and which() on it returns
    # matrix positions rather than row numbers, so the selection was only
    # right when Pop was the first column and no other column contained p.
    bams <- as.character(Pop_grey_df$bam[which(Pop_grey_df$Pop == p)])
    df <- data.frame(bam=bams)
    # Kept as a guard against NA entries.
    df <- na.omit(df)
    outfile <- paste0('grey/',p,'_bam_name.txt')
    write.table(df,file=outfile,col.names=FALSE,row.names=FALSE,quote=FALSE)
}

[1] "grey"
[1] 27
[1] 276


In [55]:
%%R

### split into hybrid
# Keep only the individuals assigned to the hybrid group.
Pop_ID_hybrid <- Pop_ID_Sum[which(Pop_ID_Sum$Lin == 'hybrid'),]
# Number of individuals per population within this group.
pop_hybrid <- Pop_ID_hybrid %>%
                group_by(Pop) %>%
                summarize(n=n())
# Populations with more than 2 individuals (at least 3).
pop_hybrid3 <- as.character(pop_hybrid$Pop[which(pop_hybrid$n > 2)])
Pop_hybrid_df <- Pop_ID_hybrid[which(Pop_ID_hybrid$Pop %in% pop_hybrid3),]
print('hybrid')
print(length(pop_hybrid3))
print(nrow(Pop_hybrid_df))

### hybrid Pop_bam_out
# Write one BAM list per population (hybrid/<Pop>_bam_name.txt, one path per line).
for (p in pop_hybrid3){
    # Match on the Pop column only. Comparing the whole data frame to p
    # (Pop_hybrid_df==p) yields a logical matrix, and which() on it returns
    # matrix positions rather than row numbers, so the selection was only
    # right when Pop was the first column and no other column contained p.
    bams <- as.character(Pop_hybrid_df$bam[which(Pop_hybrid_df$Pop == p)])
    df <- data.frame(bam=bams)
    # Kept as a guard against NA entries.
    df <- na.omit(df)
    outfile <- paste0('hybrid/',p,'_bam_name.txt')
    write.table(df,file=outfile,col.names=FALSE,row.names=FALSE,quote=FALSE)
}

[1] "hybrid"
[1] 21
[1] 115


# Angsd commands

See tutorial for steps: http://www.popgen.dk/angsd/index.php/ANGSD

### Create a bash file for each population that does every step

#### Steps:   
- doSaf  
- realSFS  
- doTheta  
- doStat  
- thetaOut

## GREEN 

In [72]:
green_dir = os.path.join(div_dir,'green')

In [73]:
cd $green_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green


In [74]:
#create output dir and shdir 
!mkdir output
!mkdir shdir

In [75]:
#Get pops
bam_names = !find $green_dir -name '*_bam_name.txt'
pops = sorted([b.split('/')[12].split('_')[0] for b in bam_names.list])
len(pops),pops[:2]

(31, ['AH', 'AR'])

In [76]:
### Needed for angsd_commands
# Reference FASTA; passed to angsd as the -anc argument when the commands are built.
assembly = os.path.join(root,'assembly/reference.fasta')
# Thread count; passed to angsd and realSFS as -P.
cpus = 8

# for do_stat: window and step (passed to `thetaStat do_stat` as -win / -step). 1 and 1 for denovo
win = 1 
step = 1

In [77]:
# Build five parallel command lists, one entry per population and in the same
# order as `pops`: doSaf -> realSFS -> doThetas -> do_stat -> print.
# All output paths are relative (output/<pop>...).
doSaf_cmd, realSFS_cmd, doTheta_cmd, doStat_cmd, thetaOut_cmd = [], [], [], [], []
for pop in map(str, pops):
    # Text file listing the BAM paths of this population.
    bam_file = os.path.join(green_dir, pop + '_bam_name.txt')
    doSaf_cmd.append(
        "angsd -bam %s -doSaf 1 -anc %s -GL 1 -P %s -out output/%s" % (bam_file,assembly,cpus,pop))
    realSFS_cmd.append(
        "realSFS output/%s.saf.idx -P %s > output/%s.sfs" % (pop,cpus,pop))
    doTheta_cmd.append(
        "angsd -bam %s -out output/%s -doThetas 1 -doSaf 1 -pest output/%s.sfs -anc %s -GL 1" % (bam_file,pop,pop,assembly))
    doStat_cmd.append(
        "thetaStat do_stat output/%s.thetas.idx -win %d -step %d" % (pop,win,step))
    thetaOut_cmd.append(
        "thetaStat print output/%s.thetas.idx > output/%s.theta_out" % (pop,pop))



In [78]:
len(doSaf_cmd),len(realSFS_cmd),len(doTheta_cmd),len(doStat_cmd),len(thetaOut_cmd),doTheta_cmd[0]

(31,
 31,
 31,
 31,
 31,
 'angsd -bam /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green/AH_bam_name.txt -out output/AH -doThetas 1 -doSaf 1 -pest output/AH.sfs -anc /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta -GL 1')

In [88]:
### select options for slurm submission
# These values are substituted into the #SBATCH header of each generated job script.
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
# NOTE(review): this rebinds `time`, hiding the `time` module imported in the first cell.
time = '6-00:00:00' #time limit: 6 days (Slurm days-hours:minutes:seconds)
cpus = 8 #change to match above (the value used for -P when the commands were built)
mem_cpu = 5000 # written to --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

In [89]:
def write_angsd_sh(account,partition,time,cpus,mem_cpu,email,green_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd):
    """Write one Slurm batch script per population into ``<green_dir>/shdir``.

    Each script ``run_angsd_<pop>.sh`` carries the #SBATCH header built from
    the Slurm options, changes into ``green_dir`` and then runs the five
    commands for that population in order (doSaf, realSFS, doTheta, doStat,
    thetaOut).

    Parameters
    ----------
    account, partition, time, email : str
        Values for --account, --partition, --time and --mail-user.
    cpus, mem_cpu : int
        Values for --cpus-per-task and --mem-per-cpu (converted with int()).
    green_dir : str
        Lineage working directory. The scripts and the Slurm log files are
        placed in its ``shdir`` sub-directory, which is created if missing.
    pops : list of str
        Population labels, one per script.
    doSaf_cmd, realSFS_cmd, doTheta_cmd, doStat_cmd, thetaOut_cmd : list of str
        Shell commands, index-aligned with ``pops``.

    Raises
    ------
    ValueError
        If ``pops`` and the five command lists do not all have the same length.
    """
    per_pop = [pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd]
    # The lists are global notebook state reused for every lineage; refuse to
    # pair commands with the wrong population if one of them is stale.
    if len(set(len(x) for x in per_pop)) > 1:
        raise ValueError("pops and the command lists must all have the same length, got %s"
                         % ([len(x) for x in per_pop]))
    # Anchor the script and log locations to the lineage directory rather than
    # to whatever directory the notebook or sbatch happens to be run from.
    sh_dir = os.path.join(os.path.abspath(green_dir),'shdir')
    os.makedirs(sh_dir, exist_ok=True)
    for pop,saf,sfs,theta,stat,theta_out in zip(*per_pop):
        log_file = os.path.join(sh_dir,"output_angsd_%s.txt" % (pop))
        with open(os.path.join(sh_dir,"run_angsd_%s.sh" % (pop)), "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name %s_angsd_green
#SBATCH --output %s
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=%s 

cd %s

#### Running angsd ####
%s \n

#### Running realSFS ####
%s \n

#### Running doTheta ####
%s \n

#### Running doStat ####
%s \n

#### Running thetaOut ####
%s \n""" % (account,partition,time,int(cpus),int(mem_cpu),pop,log_file,email,green_dir,saf,sfs,theta,stat,theta_out))
            

In [90]:
write_angsd_sh(account,partition,time,cpus,mem_cpu,email,green_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd)


#### Find all angsd Slurm scripts and write a bash script that submits them with sbatch

In [91]:
files = !find ./shdir -name '*.sh'
sh_files = sorted([os.path.abspath(x) for x in files])

In [92]:
len(sh_files),sh_files[0]

(31,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green/shdir/run_angsd_AH.sh')

In [93]:
def write_bash_angsd_sh(sh_files):
    """Create ``run_bash_angsd.sh`` in the current working directory.

    The file starts with a bash shebang followed by a blank line, then holds
    one ``sbatch <path>`` line for each entry of ``sh_files``, in the order given.
    """
    with open("run_bash_angsd.sh", "w") as launcher:
        launcher.write("""#!/usr/bin/env bash \n\n""")
        launcher.writelines("sbatch %s \n" % (script) for script in sh_files)

In [94]:
cd $green_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green


In [86]:
write_bash_angsd_sh(sh_files)

## Run run_bash_angsd.sh locally  
cd /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green  
source activate angsdWrap  
bash run_bash_angsd.sh  

# REDO WITH SJ and ST split.

## Extract diversity data with R

Files are too big to move to local computer. Summarize here

Uses functions: `angsd_summary` and `angsd_tajD` in Imports.R

In [147]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [148]:
%%R
library(tidyverse)

source('/data/gpfs/assoc/denovo/src/R/Imports.R')

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green')

In [149]:
%%R

# Summarise the per-population *.theta_out files (written by `thetaStat print`) in the green output directory.
# angsd_summary comes from Imports.R and is not shown here; presumably it returns one row per population -- confirm.
theta_sum_green_df <- angsd_summary(path='/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green/output',files.pattern = '*.theta_out')

#pi (columns 1-2, selected by position; presumably Pop and Pi -- confirm the column order returned by angsd_summary)
t(theta_sum_green_df[,1:2])

#watterson (columns 1 and 5; the printed output labels these Pop and W)
t(theta_sum_green_df[,c(1,5)])


R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: =====
R[write t

R[write to console]: ====
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write 

R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ==
R[write to console]: |

R[write to console]: |-------------------------------------

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "AH"         "AR"         "BC"         "BH"         "BO"        
W   "0.02025090" "0.01650532" "0.01565426" "0.01982400" "0.01769938"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "BT"         "CI"         "CN"         "CO"         "CS"        
W   "0.01928788" "0.01669881" "0.01925843" "0.01676807" "0.01836183"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "CT"         "DM"         "DT"         "EW"         "GO"        
W   "0.01715622" "0.01816314" "0.01942455" "0.01502725" "0.01882706"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "HW"         "IH"         "JO"         "JT"         "LO"        
W   "0.01765623" "0.02033441" "0.02171905" "0.01974009" "0.01693946"
    [,21]        [,22]        [,23]        [,24]        [,25]       
Pop "LT"         "MC"         "NI"         "NN"         "OT"        
W   "0.01632736" "0.02118216" "0.0

In [150]:
%%R 

#pi
print(t(theta_sum_green_df[c('Pop','Pi')]))

#watterson
print(t(theta_sum_green_df[c('Pop','W')]))

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "AH"         "AR"         "BC"         "BH"         "BO"        
Pi  "0.01914637" "0.01771727" "0.01767748" "0.02000471" "0.01911809"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "BT"         "CI"         "CN"         "CO"         "CS"        
Pi  "0.01963878" "0.01835563" "0.01940563" "0.01824858" "0.01911290"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "CT"         "DM"         "DT"         "EW"         "GO"        
Pi  "0.01844869" "0.01903436" "0.01941024" "0.01678384" "0.01909598"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "HW"         "IH"         "JO"         "JT"         "LO"        
Pi  "0.01896168" "0.02021843" "0.02149963" "0.02048213" "0.01801144"
    [,21]        [,22]        [,23]        [,24]        [,25]       
Pop "LT"         "MC"         "NI"         "NN"         "OT"        
Pi  "0.01809320" "0.02017649" "0.0

In [151]:
%%R

tajD_green_df <- angsd_tajD(path='/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/green/output',files.pattern = '*pestPG')

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[

R[write to console]: ==
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: 

R[write to console]: =====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[

In [152]:
%%R 

head(tajD_green_df)

  Pop        TajD    TajD_low   TajD_high
1  AH -0.06999514 -0.07020598 -0.06978430
2  AR -0.02986420 -0.03004006 -0.02968834
3  BC -0.02168433 -0.02186890 -0.02149975
4  BH -0.04555476 -0.04574163 -0.04536788
5  BO -0.03201989 -0.03220046 -0.03183931
6  BT -0.05563072 -0.05580866 -0.05545278


## grey 

In [95]:
grey_dir = os.path.join(div_dir,'grey')

In [96]:
cd $grey_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey


In [97]:
#create output dir and shdir 
!mkdir output
!mkdir shdir

In [98]:
#Get pops
bam_names = !find $grey_dir -name '*_bam_name.txt'
pops = sorted([b.split('/')[12].split('_')[0] for b in bam_names.list])
len(pops),pops[:2]

(27, ['AS', 'BL'])

In [99]:
### Needed for angsd_commands
# Reference FASTA; passed to angsd as the -anc argument when the commands are built.
assembly = os.path.join(root,'assembly/reference.fasta')
# Thread count; passed to angsd and realSFS as -P.
cpus = 8

# for do_stat: window and step (passed to `thetaStat do_stat` as -win / -step). 1 and 1 for denovo
win = 1 
step = 1

In [100]:
# Build five parallel command lists, one entry per population and in the same
# order as `pops`: doSaf -> realSFS -> doThetas -> do_stat -> print.
# All output paths are relative (output/<pop>...).
doSaf_cmd, realSFS_cmd, doTheta_cmd, doStat_cmd, thetaOut_cmd = [], [], [], [], []
for pop in map(str, pops):
    # Text file listing the BAM paths of this population.
    bam_file = os.path.join(grey_dir, pop + '_bam_name.txt')
    doSaf_cmd.append(
        "angsd -bam %s -doSaf 1 -anc %s -GL 1 -P %s -out output/%s" % (bam_file,assembly,cpus,pop))
    realSFS_cmd.append(
        "realSFS output/%s.saf.idx -P %s > output/%s.sfs" % (pop,cpus,pop))
    doTheta_cmd.append(
        "angsd -bam %s -out output/%s -doThetas 1 -doSaf 1 -pest output/%s.sfs -anc %s -GL 1" % (bam_file,pop,pop,assembly))
    doStat_cmd.append(
        "thetaStat do_stat output/%s.thetas.idx -win %d -step %d" % (pop,win,step))
    thetaOut_cmd.append(
        "thetaStat print output/%s.thetas.idx > output/%s.theta_out" % (pop,pop))



In [101]:
len(doSaf_cmd),len(realSFS_cmd),len(doTheta_cmd),len(doStat_cmd),len(thetaOut_cmd),doTheta_cmd[0]

(27,
 27,
 27,
 27,
 27,
 'angsd -bam /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey/AS_bam_name.txt -out output/AS -doThetas 1 -doSaf 1 -pest output/AS.sfs -anc /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta -GL 1')

In [102]:
### select options for slurm submission
# These values are substituted into the #SBATCH header of each generated job script.
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
# NOTE(review): this rebinds `time`, hiding the `time` module imported in the first cell.
time = '6-00:00:00' #time limit: 6 days (Slurm days-hours:minutes:seconds)
cpus = 8 #change to match above (the value used for -P when the commands were built)
mem_cpu = 5000 # written to --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

In [103]:
def write_angsd_sh(account,partition,time,cpus,mem_cpu,email,grey_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd):
    """Write one Slurm batch script per population into ``<grey_dir>/shdir``.

    Each script ``run_angsd_<pop>.sh`` carries the #SBATCH header built from
    the Slurm options, changes into ``grey_dir`` and then runs the five
    commands for that population in order (doSaf, realSFS, doTheta, doStat,
    thetaOut).

    Parameters
    ----------
    account, partition, time, email : str
        Values for --account, --partition, --time and --mail-user.
    cpus, mem_cpu : int
        Values for --cpus-per-task and --mem-per-cpu (converted with int()).
    grey_dir : str
        Lineage working directory. The scripts and the Slurm log files are
        placed in its ``shdir`` sub-directory, which is created if missing.
    pops : list of str
        Population labels, one per script.
    doSaf_cmd, realSFS_cmd, doTheta_cmd, doStat_cmd, thetaOut_cmd : list of str
        Shell commands, index-aligned with ``pops``.

    Raises
    ------
    ValueError
        If ``pops`` and the five command lists do not all have the same length.
    """
    per_pop = [pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd]
    # The lists are global notebook state reused for every lineage; refuse to
    # pair commands with the wrong population if one of them is stale.
    if len(set(len(x) for x in per_pop)) > 1:
        raise ValueError("pops and the command lists must all have the same length, got %s"
                         % ([len(x) for x in per_pop]))
    # Anchor the script and log locations to the lineage directory rather than
    # to whatever directory the notebook or sbatch happens to be run from.
    sh_dir = os.path.join(os.path.abspath(grey_dir),'shdir')
    os.makedirs(sh_dir, exist_ok=True)
    for pop,saf,sfs,theta,stat,theta_out in zip(*per_pop):
        log_file = os.path.join(sh_dir,"output_angsd_%s.txt" % (pop))
        with open(os.path.join(sh_dir,"run_angsd_%s.sh" % (pop)), "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name %s_angsd_grey
#SBATCH --output %s
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=%s 

cd %s

#### Running angsd ####
%s \n

#### Running realSFS ####
%s \n

#### Running doTheta ####
%s \n

#### Running doStat ####
%s \n

#### Running thetaOut ####
%s \n""" % (account,partition,time,int(cpus),int(mem_cpu),pop,log_file,email,grey_dir,saf,sfs,theta,stat,theta_out))
            

In [104]:
write_angsd_sh(account,partition,time,cpus,mem_cpu,email,grey_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd)


#### Find all angsd Slurm scripts and write a bash script that submits them with sbatch

In [105]:
files = !find ./shdir -name '*.sh'
sh_files = sorted([os.path.abspath(x) for x in files])

In [106]:
len(sh_files),sh_files[0]

(27,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey/shdir/run_angsd_AS.sh')

In [107]:
def write_bash_angsd_sh(sh_files):
    """Create ``run_bash_angsd.sh`` in the current working directory.

    The file starts with a bash shebang followed by a blank line, then holds
    one ``sbatch <path>`` line for each entry of ``sh_files``, in the order given.
    """
    with open("run_bash_angsd.sh", "w") as launcher:
        launcher.write("""#!/usr/bin/env bash \n\n""")
        launcher.writelines("sbatch %s \n" % (script) for script in sh_files)

In [108]:
cd $grey_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey


In [109]:
write_bash_angsd_sh(sh_files)

## Run run_bash_angsd.sh locally  
cd /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey  
source activate angsdWrap  
bash run_bash_angsd.sh  

## Extract diversity data with R

Files are too big to move to local computer. Summarize here

Uses functions: `angsd_summary` and `angsd_tajD` in Imports.R

In [132]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [133]:
%%R
library(tidyverse)

source('/data/gpfs/assoc/denovo/src/R/Imports.R')

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey')

In [134]:
%%R

# Summarise the per-population *.theta_out files (written by `thetaStat print`) in the grey output directory.
# angsd_summary comes from Imports.R and is not shown here; presumably it returns one row per population -- confirm.
theta_sum_grey_df <- angsd_summary(path='/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey/output',files.pattern = '*.theta_out')

#pi (columns 1-2, selected by position; presumably Pop and Pi -- confirm the column order returned by angsd_summary)
t(theta_sum_grey_df[,1:2])

#watterson (columns 1 and 5; the printed output labels these Pop and W)
t(theta_sum_grey_df[,c(1,5)])


R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to 

R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to cons

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "AS"         "BL"         "BM"         "BV"         "CH"        
W   "0.02218398" "0.01883757" "0.02450951" "0.02430676" "0.02097361"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "CL"         "DC"         "DH"         "EW"         "FR"        
W   "0.01897683" "0.02465306" "0.02468973" "0.01816177" "0.02185087"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "HO"         "JC"         "LR"         "LV"         "MD"        
W   "0.02514253" "0.02311488" "0.01996095" "0.01841348" "0.02074507"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "NN"         "NV"         "PB"         "PL"         "PT"        
W   "0.01959437" "0.01988667" "0.01946209" "0.02169177" "0.02041457"
    [,21]        [,22]        [,23]        [,24]        [,25]       
Pop "PW"         "RL"         "SC"         "SH"         "SS"        
W   "0.02175640" "0.01774648" "0.0

In [135]:
%%R 

#pi
print(t(theta_sum_grey_df[c('Pop','Pi')]))

#watterson
print(t(theta_sum_grey_df[c('Pop','W')]))

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "AS"         "BL"         "BM"         "BV"         "CH"        
Pi  "0.02005471" "0.01915266" "0.02049413" "0.02039039" "0.02038584"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "CL"         "DC"         "DH"         "EW"         "FR"        
Pi  "0.01928618" "0.02046797" "0.02043881" "0.01879020" "0.01987967"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "HO"         "JC"         "LR"         "LV"         "MD"        
Pi  "0.02066899" "0.02020177" "0.01966564" "0.01873008" "0.01962575"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "NN"         "NV"         "PB"         "PL"         "PT"        
Pi  "0.01989750" "0.01991660" "0.01934247" "0.01982985" "0.01978865"
    [,21]        [,22]        [,23]        [,24]        [,25]       
Pop "PW"         "RL"         "SC"         "SH"         "SS"        
Pi  "0.02022831" "0.01880432" "0.0

In [136]:
%%R

tajD_grey_df <- angsd_tajD(path='/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/grey/output',files.pattern = '*pestPG')

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[

R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write 

R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write

In [139]:
%%R 

tajD_grey_df

   Pop        TajD    TajD_low   TajD_high
1   AS -0.08832565 -0.08854156 -0.08810974
2   BL -0.04336788 -0.04355331 -0.04318245
3   BM -0.11112765 -0.11134612 -0.11090917
4   BV -0.10083518 -0.10106040 -0.10060996
5   CH -0.06222614 -0.06241039 -0.06204190
6   CL -0.04754004 -0.04772164 -0.04735844
7   DC -0.11295037 -0.11316983 -0.11273091
8   DH -0.11132936 -0.11155029 -0.11110844
9   EW -0.05170781 -0.05189343 -0.05152220
10  FR -0.08607718 -0.08628818 -0.08586617
11  HO -0.11647227 -0.11669350 -0.11625103
12  JC -0.09683591 -0.09705190 -0.09661992
13  LR -0.06653905 -0.06672255 -0.06635555
14  LV -0.05468768 -0.05488310 -0.05449226
15  MD -0.06952919 -0.06973903 -0.06931935
16  NN -0.04298774 -0.04313785 -0.04283764
17  NV -0.06576789 -0.06591738 -0.06561840
18  PB -0.06405908 -0.06423624 -0.06388191
19  PL -0.08420842 -0.08441928 -0.08399756
20  PT -0.06370850 -0.06389377 -0.06352322
21  PW -0.08643437 -0.08662682 -0.08624192
22  RL -0.04142277 -0.04159656 -0.04124898
23  SC -0.0

## hybrid 

In [110]:
hybrid_dir = os.path.join(div_dir,'hybrid')

In [111]:
cd $hybrid_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid


In [112]:
#create output dir and shdir 
!mkdir output
!mkdir shdir

In [113]:
#Get pops
bam_names = !find $hybrid_dir -name '*_bam_name.txt'
pops = sorted([b.split('/')[12].split('_')[0] for b in bam_names.list])
len(pops),pops[:2]

(21, ['BC', 'CC'])

In [114]:
### Needed for angsd_commands
# Reference FASTA; passed to angsd as the -anc argument when the commands are built.
assembly = os.path.join(root,'assembly/reference.fasta')
# Thread count; passed to angsd and realSFS as -P.
cpus = 8

# for do_stat: window and step (passed to `thetaStat do_stat` as -win / -step). 1 and 1 for denovo
win = 1 
step = 1

In [115]:
# Build five parallel command lists, one entry per population and in the same
# order as `pops`: doSaf -> realSFS -> doThetas -> do_stat -> print.
# All output paths are relative (output/<pop>...).
doSaf_cmd, realSFS_cmd, doTheta_cmd, doStat_cmd, thetaOut_cmd = [], [], [], [], []
for pop in map(str, pops):
    # Text file listing the BAM paths of this population.
    bam_file = os.path.join(hybrid_dir, pop + '_bam_name.txt')
    doSaf_cmd.append(
        "angsd -bam %s -doSaf 1 -anc %s -GL 1 -P %s -out output/%s" % (bam_file,assembly,cpus,pop))
    realSFS_cmd.append(
        "realSFS output/%s.saf.idx -P %s > output/%s.sfs" % (pop,cpus,pop))
    doTheta_cmd.append(
        "angsd -bam %s -out output/%s -doThetas 1 -doSaf 1 -pest output/%s.sfs -anc %s -GL 1" % (bam_file,pop,pop,assembly))
    doStat_cmd.append(
        "thetaStat do_stat output/%s.thetas.idx -win %d -step %d" % (pop,win,step))
    thetaOut_cmd.append(
        "thetaStat print output/%s.thetas.idx > output/%s.theta_out" % (pop,pop))



In [116]:
len(doSaf_cmd),len(realSFS_cmd),len(doTheta_cmd),len(doStat_cmd),len(thetaOut_cmd),doTheta_cmd[0]

(21,
 21,
 21,
 21,
 21,
 'angsd -bam /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid/BC_bam_name.txt -out output/BC -doThetas 1 -doSaf 1 -pest output/BC.sfs -anc /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/assembly/reference.fasta -GL 1')

In [117]:
### select options for slurm submission
# These values are substituted into the #SBATCH header of each generated job script.
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
# NOTE(review): this rebinds `time`, hiding the `time` module imported in the first cell.
time = '6-00:00:00' #time limit: 6 days (Slurm days-hours:minutes:seconds)
cpus = 8 #change to match above (the value used for -P when the commands were built)
mem_cpu = 5000 # written to --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

In [118]:
def write_angsd_sh(account,partition,time,cpus,mem_cpu,email,hybrid_dir,pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd):
    """Write one Slurm batch script per population into ``<hybrid_dir>/shdir``.

    Each script ``run_angsd_<pop>.sh`` carries the #SBATCH header built from
    the Slurm options, changes into ``hybrid_dir`` and then runs the five
    commands for that population in order (doSaf, realSFS, doTheta, doStat,
    thetaOut).

    Parameters
    ----------
    account, partition, time, email : str
        Values for --account, --partition, --time and --mail-user.
    cpus, mem_cpu : int
        Values for --cpus-per-task and --mem-per-cpu (converted with int()).
    hybrid_dir : str
        Lineage working directory. The scripts and the Slurm log files are
        placed in its ``shdir`` sub-directory, which is created if missing.
    pops : list of str
        Population labels, one per script.
    doSaf_cmd, realSFS_cmd, doTheta_cmd, doStat_cmd, thetaOut_cmd : list of str
        Shell commands, index-aligned with ``pops``.

    Raises
    ------
    ValueError
        If ``pops`` and the five command lists do not all have the same length.
    """
    per_pop = [pops,doSaf_cmd,realSFS_cmd,doTheta_cmd,doStat_cmd,thetaOut_cmd]
    # The lists are global notebook state reused for every lineage; refuse to
    # pair commands with the wrong population if one of them is stale.
    if len(set(len(x) for x in per_pop)) > 1:
        raise ValueError("pops and the command lists must all have the same length, got %s"
                         % ([len(x) for x in per_pop]))
    # Anchor the script and log locations to the lineage directory rather than
    # to whatever directory the notebook or sbatch happens to be run from.
    sh_dir = os.path.join(os.path.abspath(hybrid_dir),'shdir')
    os.makedirs(sh_dir, exist_ok=True)
    for pop,saf,sfs,theta,stat,theta_out in zip(*per_pop):
        log_file = os.path.join(sh_dir,"output_angsd_%s.txt" % (pop))
        with open(os.path.join(sh_dir,"run_angsd_%s.sh" % (pop)), "w") as o:
            o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks 1
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name %s_angsd_hybrid
#SBATCH --output %s
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=%s 

cd %s

#### Running angsd ####
%s \n

#### Running realSFS ####
%s \n

#### Running doTheta ####
%s \n

#### Running doStat ####
%s \n

#### Running thetaOut ####
%s \n""" % (account,partition,time,int(cpus),int(mem_cpu),pop,log_file,email,hybrid_dir,saf,sfs,theta,stat,theta_out))
            

In [119]:
# Emit shdir/run_angsd_<pop>.sh for every population. Keyword arguments guard
# against mis-ordering the thirteen parameters.
write_angsd_sh(
    account=account,
    partition=partition,
    time=time,
    cpus=cpus,
    mem_cpu=mem_cpu,
    email=email,
    hybrid_dir=hybrid_dir,
    pops=pops,
    doSaf_cmd=doSaf_cmd,
    realSFS_cmd=realSFS_cmd,
    doTheta_cmd=doTheta_cmd,
    doStat_cmd=doStat_cmd,
    thetaOut_cmd=thetaOut_cmd,
)


#### Find all angsd slurm scripts and write a bash script to sbatch them

In [120]:
files = !find ./shdir -name '*.sh'
sh_files = sorted([os.path.abspath(x) for x in files])

In [121]:
# Sanity check: number of job scripts found, plus the first absolute path.
len(sh_files),sh_files[0]

(21,
 '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid/shdir/run_angsd_BC.sh')

In [122]:
def write_bash_angsd_sh(sh_files):
    """Create ``run_bash_angsd.sh`` in the current working directory.

    The file starts with a bash shebang followed by a blank line, then one
    ``sbatch <script>`` line for every path in ``sh_files``, in the given order.
    """
    with open("run_bash_angsd.sh", "w") as driver:
        driver.write("""#!/usr/bin/env bash \n\n""")
        # one submission line per job script
        driver.writelines("sbatch %s \n" % (script) for script in sh_files)

In [123]:
cd $hybrid_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid


In [124]:
# Writes run_bash_angsd.sh into the current working directory.
write_bash_angsd_sh(sh_files)

## Run run_bash_angsd.sh locally  
cd /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid  
source activate angsdWrap  
bash run_bash_angsd.sh  

## Extract diversity data with R

Files are too big to move to local computer. Summarize here

Uses functions: `angsd_summary` and `angsd_tajD` in Imports.R

In [140]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [141]:
%%R
# R session setup for the summaries below.
library(tidyverse)

# Imports.R provides angsd_summary() and angsd_tajD(), which the following cells call
source('/data/gpfs/assoc/denovo/src/R/Imports.R')

# work from the hybrid diversity directory
setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid')

In [142]:
%%R

# Summarise the theta_out files in the hybrid output directory; the result has
# one row per population.
# NOTE(review): if angsd_summary forwards files.pattern to list.files(), it is
# read as a regular expression rather than a glob; confirm in Imports.R.
theta_sum_hybrid_df <- angsd_summary(path='/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid/output',files.pattern = '*.theta_out')

# Only the final expression of a %%R cell is auto-printed, so each table is
# printed explicitly; otherwise the pi table is silently dropped.
#pi
print(t(theta_sum_hybrid_df[,1:2]))

#watterson
print(t(theta_sum_hybrid_df[,c(1,5)]))


R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ===
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: =====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: =====
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ====
R[write to console]: ====
R[write to console]: ===
R[write to console]: ====
R[write to console]: ====
R[wri

R[write to console]: ====
R[write to console]: ====
R[write to console]: |



    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "BC"         "CC"         "CV"         "GB"         "HL"        
W   "0.01812147" "0.01861798" "0.01958155" "0.02419388" "0.01885664"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "IO"         "IT"         "LA"         "LV"         "NH"        
W   "0.01847381" "0.01905468" "0.01418515" "0.01605106" "0.01906341"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "NO"         "NT"         "OO"         "PT"         "PW"        
W   "0.02009293" "0.01992557" "0.02079626" "0.01818425" "0.02020988"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "RH"         "RO"         "SS"         "TO"         "UO"        
W   "0.02091030" "0.02081652" "0.01978274" "0.01856931" "0.01943934"
    [,21]       
Pop "VI"        
W   "0.01834420"


In [143]:
%%R

# Print the per-population table for pi first, then for Watterson's theta,
# each transposed so populations run across the columns.
for (stat_col in c('Pi', 'W')) {
    print(t(theta_sum_hybrid_df[c('Pop', stat_col)]))
}

    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "BC"         "CC"         "CV"         "GB"         "HL"        
Pi  "0.01903230" "0.02062542" "0.01955071" "0.02077862" "0.01966023"
    [,6]         [,7]         [,8]         [,9]         [,10]       
Pop "IO"         "IT"         "LA"         "LV"         "NH"        
Pi  "0.01981787" "0.01940788" "0.01649728" "0.01817805" "0.01952831"
    [,11]        [,12]        [,13]        [,14]        [,15]       
Pop "NO"         "NT"         "OO"         "PT"         "PW"        
Pi  "0.01990171" "0.01974553" "0.02004433" "0.01951040" "0.01996630"
    [,16]        [,17]        [,18]        [,19]        [,20]       
Pop "RH"         "RO"         "SS"         "TO"         "UO"        
Pi  "0.02048423" "0.02006999" "0.02008316" "0.01912259" "0.01959260"
    [,21]       
Pop "VI"        
Pi  "0.01910111"
    [,1]         [,2]         [,3]         [,4]         [,5]        
Pop "BC"         "CC"         "CV"         "GB"     

In [144]:
%%R

# Build the Tajima's D table from the files matching '*pestPG' in the hybrid
# output directory, using angsd_tajD() from Imports.R (sourced in an earlier cell).
tajD_hybrid_df <- angsd_tajD(path='/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/angsd/diversity/hybrid/output',files.pattern = '*pestPG')

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: ===
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: 

R[write to console]: ==
R[write to console]: =
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: |

R[write to console]: |--------------------------------------------------|
|
R[write to console]: =====
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: ==
R[write to console]: ==
R[write to console]: ===
R[write to console]: ==
R[write to console]: =

In [145]:
%%R 

# Preview the first rows (columns: Pop, TajD, TajD_low, TajD_high).
head(tajD_hybrid_df)

  Pop         TajD     TajD_low   TajD_high
1  BC -0.037548238 -0.037728611 -0.03736787
2  CC -0.001881535 -0.002058871 -0.00170420
3  CV -0.056517963 -0.056699488 -0.05633644
4  GB -0.107557648 -0.107772101 -0.10734319
5  HL -0.038634579 -0.038807986 -0.03846117
6  IO -0.031256841 -0.031436588 -0.03107709
