## ANGSD

See **angsdWrap_ngsF** for installation

**NOTE**: Should run this first to set up all files

In [1]:
import sys
import ipyparallel as ipp
import os, time
import pandas as pd

In [2]:
root = '/data/gpfs/assoc/denovo/PHHA/'

In [3]:
cd $root

/data/gpfs/assoc/denovo/PHHA


In [4]:
!mkdir angsd

In [5]:
analysis_dir = os.path.join(root,'angsd')

In [6]:
bam_dir = os.path.join(analysis_dir,'bam_files')

In [7]:
cd $analysis_dir

/data/gpfs/assoc/denovo/PHHA/angsd


In [8]:
!mkdir diversity

# Calculating nt diversity (pi)   

Move Pop_ID_Sum over to this directory  
must split vcf by each population  

In [9]:
Pop_ID = pd.read_csv(os.path.join(analysis_dir, "Pop_ID_Sum.csv"), sep=",")
Pop_ID.head()

Unnamed: 0,Pop,ID,All,Site,Latitude,Longitude
0,AS,10,PH_AS_10,Asotin,46.2621,-117.2985
1,AS,11,PH_AS_11,Asotin,46.2621,-117.2985
2,AS,12,PH_AS_12,Asotin,46.2621,-117.2985
3,AS,1,PH_AS_1,Asotin,46.2621,-117.2985
4,AS,2,PH_AS_2,Asotin,46.2621,-117.2985


In [10]:
Bam = []
for i in range(0,len(Pop_ID)):
    name = [bam_dir,"/",Pop_ID.All.loc[i],"_sorted.bam"]
    #print(name)
    Bam.append("".join(name))
Pop_ID['Bam'] = Bam
Pop_ID.head()
#print(len(Bam),len(Pop_ID_df))

Unnamed: 0,Pop,ID,All,Site,Latitude,Longitude,Bam
0,AS,10,PH_AS_10,Asotin,46.2621,-117.2985,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
1,AS,11,PH_AS_11,Asotin,46.2621,-117.2985,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
2,AS,12,PH_AS_12,Asotin,46.2621,-117.2985,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
3,AS,1,PH_AS_1,Asotin,46.2621,-117.2985,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...
4,AS,2,PH_AS_2,Asotin,46.2621,-117.2985,/data/gpfs/assoc/denovo/PHHA/angsd/bam_files/P...


In [11]:
pops = Pop_ID.Pop.unique()
for i in range(0,len(pops)):
    pop = str(pops[i])
    names = Pop_ID[(Pop_ID.Pop == pop)].Bam
    #print(names)
    pop_file = analysis_dir + "/diversity/"+ pop + "_bam_name.txt"
    #print(pop_file)
    names.to_csv(path_or_buf=pop_file,sep='\n',header=False,index=False)

# Get sfs.idx #

In [24]:
!source activate angsdWrap

/usr/bin/sh: line 0: source: activate: file not found


In [25]:
div_dir = os.path.join(analysis_dir,'diversity')

In [26]:
cd $div_dir

/data/gpfs/assoc/denovo/tfaske/piper/PIRE/angsd/diversity


In [27]:
#create output dir
!mkdir output

In [29]:
assembly =  '/data/gpfs/assoc/denovo/PHHA/assembly/reference.fasta'
assembly

'/data/gpfs/assoc/denovo/tfaske/piper/PIRE/assembly/reference.fasta'

In [30]:
ntasks = 1
cpus = 16
time = '3-00:00:00'
mem_cpu = 2500
email = 'tfaske@nevada.unr.edu'
account = 'cpu-s5-denovo-0'
part = 'cpu-core-0'

In [33]:
def write_angsd_sh(account,part,ntasks,cpus,time,mem_cpu,email,div_dir,assembly,Pop_ID):
    with open("run_angsd.sh", "w") as o:
        o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks %d
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name angsd
#SBATCH --output output_angsd.txt
#SBATCH --mail-type=END
#SBATCH --mail-user=%s

""" % (account,part,time,ntasks,cpus,mem_cpu,email))
        
        pops = Pop_ID.Pop.unique()
        for i in range(0,len(pops)):
            pop = str(pops[i])
            bam_file = os.path.join(div_dir,''.join([pop,'_bam_name.txt']))
            o.write("angsd -bam %s -doSaf 1 -anc %s -GL 1 -P %s -out output/%s\n\n" % (bam_file,assembly,cpus,pop))
        
        
        

In [34]:
write_angsd_sh(account,part,ntasks,cpus,time,mem_cpu,email,div_dir,assembly,Pop_ID)

### in terminal  
cd /data/gpfs/assoc/denovo/PHHA/angsd/diversity  
source activate angsdWrap  
sbatch run_angsd.sh  

# Make real sfs

In [41]:
ntasks = 1
cpus = 16
time = '3-00:00:00'
mem_cpu = 2500
email = 'tfaske@nevada.unr.edu'
account = 'cpu-s5-denovo-0'
part = 'cpu-core-0'

In [51]:
def write_realSFS_sh(account,part,ntasks,cpus,time,mem_cpu,email,Pop_ID):
    with open("run_realSFS.sh", "w") as o:
        o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks %d
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name realSFS
#SBATCH --output output_realSFS.txt
#SBATCH --mail-type=END
#SBATCH --mail-user=%s\n\n
""" % (account,part,time,ntasks,cpus,mem_cpu,email))
        
        pops = Pop_ID.Pop.unique()
        for i in range(0,len(pops)):
            pop = str(pops[i])
            o.write("realSFS output/%s.saf.idx -P %s > output/%s.sfs\n\n" % (pop,cpus,pop))

In [52]:
write_realSFS_sh(account,part,ntasks,cpus,time,mem_cpu,email,Pop_ID)

#### in terminal  
cd /data/gpfs/assoc/denovo/PHHA/angsd/diversity  
source activate angsdWrap  
sbatch run_realSFS.sh

# do theta

In [53]:
def write_doTheta_sh(account,part,ntasks,cpus,time,mem_cpu,email,assembly,Pop_ID):
    with open("run_doTheta.sh", "w") as o:
        o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks %d
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name doTheta
#SBATCH --output output_doTheta.txt
#SBATCH --mail-type=END
#SBATCH --mail-user=%s\n\n
""" % (account,part,time,ntasks,cpus,mem_cpu,email))
        
        pops = Pop_ID.Pop.unique()
        for i in range(0,len(pops)):
            pop = str(pops[i])
            o.write("angsd -bam %s_bam_name.txt -out output/%s -doThetas 1 -doSaf 1 -pest output/%s.sfs -anc %s -GL 1\n\n" % (pop,pop,pop,assembly))
        
        

In [54]:
write_doTheta_sh(account,part,ntasks,cpus,time,mem_cpu,email,assembly,Pop_ID)

#### in terminal  
cd /data/gpfs/assoc/denovo/PHHA/angsd/diversity  
source activate angsdWrap  
sbatch run_doTheta.sh

# do stat

In [64]:
### do stat controls 
win = 10000
step = 10000

In [65]:
def write_doStat_sh(account,part,ntasks,cpus,time,mem_cpu,email,Pop_ID,win,step):
    with open("run_doStat.sh", "w") as o:
        o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks %d
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name doStat
#SBATCH --output output_doStat.txt
#SBATCH --mail-type=END
#SBATCH --mail-user=%s\n\n
""" % (account,part,time,ntasks,cpus,mem_cpu,email))
        
        pops = Pop_ID.Pop.unique()
        for i in range(0,len(pops)):
            pop = str(pops[i])
            o.write("thetaStat do_stat output/%s.thetas.idx -win %d -step %d \n\n" % (pop,win,step))
        
        

In [66]:
write_doStat_sh(account,part,ntasks,cpus,time,mem_cpu,email,Pop_ID,win,step)

#### in terminal  
cd /data/gpfs/assoc/denovo/PHHA/angsd/diversity  
source activate angsdWrap  
sbatch run_doStat.sh

# thetaOut

In [67]:
def write_thetaOut_sh(account,part,ntasks,cpus,time,mem_cpu,email,Pop_ID):
    with open("run_thetaOut.sh", "w") as o:
        o.write("""#!/usr/bin/env bash
#SBATCH --account=%s
#SBATCH --partition=%s
#SBATCH --time=%s
#SBATCH --ntasks %d
#SBATCH --cpus-per-task %d
#SBATCH --mem-per-cpu=%d
#SBATCH --job-name thetaOut
#SBATCH --output output_thetaOut.txt
#SBATCH --mail-type=END
#SBATCH --mail-user=%s\n\n
""" % (account,part,time,ntasks,cpus,mem_cpu,email))
        
        pops = Pop_ID.Pop.unique()
        for i in range(0,len(pops)):
            pop = str(pops[i])
            o.write("thetaStat print output/%s.thetas.idx > output/%s.theta_out\n\n" % (pop,pop))
        
        

In [68]:
write_thetaOut_sh(account,part,ntasks,cpus,time,mem_cpu,email,Pop_ID)

#### in terminal  
cd /data/gpfs/assoc/denovo/PHHA/angsd/diversity  
source activate angsdWrap  
sbatch run_thetaOut.sh

# OLD PI with VCF

In [10]:
def get_pi(filedir, prefix):
    pi_file = !ls {filedir}/{prefix}.sites.pi
    pi_df = pd.read_csv(pi_file[0], sep="\t")
    pi = pi_df.PI.mean()
    #print(pi)
    return pi

In [11]:
pi_array=[]
pops = Pop_ID_df.Pop.unique()
for i in range(0,len(pops)):
    pop = pops[i]
    pi = get_pi(analysis_dir,pop)
    pi_array.append(pi)
pop_pi = {'Pop':pops,'pi':pi_array}
pop_pi_df = pd.DataFrame(pop_pi)
pop_pi_df.head()

Unnamed: 0,Pop,pi
0,AS,0.264486
1,BM,0.278885
2,BV,0.27728
3,DC,0.272952
4,DH,0.275095


In [12]:
pop_pi_file = os.path.join(analysis_dir,'pop_pi.csv')
pop_pi_df.to_csv(path_or_buf=pop_pi_file,index=False)

# test

In [19]:
SS_theta = pd.read_csv(os.path.join(analysis_dir, "SS.theta_out"), sep="\t")

In [20]:
SS_theta.head()

Unnamed: 0,#Chromo,Pos,Watterson,Pairwise,thetaSingleton,thetaH,thetaL
0,dDocent_Contig_6,2,-inf,-72.711815,-102.585783,-74.791252,-73.287178
1,dDocent_Contig_6,3,-inf,-72.103607,-103.27893,-74.183044,-72.67897
2,dDocent_Contig_6,4,-inf,-71.733322,-103.27893,-73.812759,-72.308685
3,dDocent_Contig_6,5,-inf,-72.286049,-103.27893,-74.365486,-72.861412
4,dDocent_Contig_6,6,-inf,-72.545708,-103.27893,-74.625145,-73.121071


In [36]:
np.mean(np.exp(SS_theta.Pairwise))

0.019512152172408606

In [37]:
josh = pd.read_csv('/working/jahner/desert_bigQ/sam_sai/DB_254_angsd.theta_out',sep='\t')

In [40]:
np.mean(np.exp(josh.Pairwise))

0.0007926749313000031