# Subset vcf file with outgroup for phylo   

### Copy `ERNA_final_sample_list.csv` and `pca_sub_df.csv` over from the local machine into `subset_vcf/`, then SUBSET 

#### Make 2 vcfs, one with everyone known. Another with everyone known minus hybrids


Use your preferred conda environment.  
**Packages needed**: vcftools, bgzip, tabix, R (tidyverse) via rpy2, vcf2phylip.py, IQ-TREE

In [1]:
import sys
import ipyparallel as ipp
import os
from os import environ
import gzip
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import glob
import re
import random

In [2]:
vcftools = "vcftools"
bcftools = "bcftools"
bgzip = "bgzip"
tabix = "tabix"

In [3]:
root = '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo'

In [4]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo


In [5]:
analysis_dir = os.path.join(root,'subset_vcf')

In [6]:
cd $analysis_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/subset_vcf


In [7]:
!cp ../filtering/good_snps.recode.vcf.gz . 

In [8]:
vcf_file = os.path.join(analysis_dir, "good_snps.recode.vcf.gz")
assert os.path.exists(vcf_file)
vcf_file

'/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/subset_vcf/good_snps.recode.vcf.gz'

### Make pop_id and join it with relevant info from pca_df and ERNA final

In [9]:
%load_ext rpy2.ipython

In [14]:
%%R
library(tidyverse)

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/subset_vcf')

In [15]:
%%R
#create Pop_ID file 
# Sample names in the .012.indv file look like "EN_AH_10":
# the 2nd "_"-separated field is the population, the 3rd is the individual ID.

indv<-read.table("../filtering/good_snps.recode.vcf.gz.012.indv",sep="\t")

# Vectorised split instead of `for (i in 1:nrow(indv))`: with zero rows, 1:nrow()
# counts 1, 0 and would fabricate an NA row, whereas this yields an empty frame.
All <- as.character(indv$V1)
name_parts <- strsplit(All,"_")
# A name with fewer than three fields gives NA for the missing field, as before.
Pop <- vapply(name_parts, function(p) p[2], character(1))
ID <- vapply(name_parts, function(p) p[3], character(1))

Pop_ID <- data.frame(Pop=Pop,ID=ID,All=All)
print(head(Pop_ID))
print(dim(Pop_ID))

#write.csv(Pop_ID,'Pop_ID.csv',row.names=F)

  Pop ID      All
1  AH 10 EN_AH_10
2  AH 11 EN_AH_11
3  AH 12 EN_AH_12
4  AH 13 EN_AH_13
5  AH 14 EN_AH_14
6  AH 15 EN_AH_15
[1] 589   3


In [16]:
%%R
# Attach the per-population metadata to every individual
# (left_join picks the shared column(s) as the join key).
ERNAfinal <- read.csv('ERNA_final_sample_list.csv')
Pop_ID_Sum <- left_join(Pop_ID,ERNAfinal)

# Ssp and Variety must be plain character vectors before values are copied between them
Pop_ID_Sum[c('Ssp','Variety')] <- lapply(Pop_ID_Sum[c('Ssp','Variety')], as.character)

### Add E.discoidea Ssp 
# For the 'ED' population, the Variety value is used as its Ssp label.
ed_rows <- which(Pop_ID_Sum$Pop == 'ED')
Pop_ID_Sum$Ssp[ed_rows] <- Pop_ID_Sum$Variety[ed_rows]

#print(dim(Pop_ID_Sum))
#print(head(Pop_ID_Sum))
#print(Pop_ID_Sum[which(Pop_ID_Sum$Pop == 'ED'),])
write.csv(Pop_ID_Sum,'Pop_ID_Sum.csv',row.names = FALSE)

Joining, by = "Pop"


## Use subset / rarefied individuals to make 2 iqtree runs. One with and one without hybrids

### with hybrids (all of em)

In [25]:
%%R
## add pca_df info
Pop_ID_Sum <- read.csv('Pop_ID_Sum.csv')
# only the sample name (join key) and the lineage label are needed from the PCA table
pca_sub_df <- read.csv('pca_sub_df.csv')[c('All','Lin')]
#head(pca_sub_df)
#head(Pop_ID_Sum)

# join, then drop incomplete rows: individuals absent from pca_sub_df get Lin = NA
# and are removed (na.omit also removes rows with an NA in any other column)
Pop_ID_sub <- na.omit(left_join(Pop_ID_Sum,pca_sub_df))
Pop_ID_sub$Lin <- as.character(Pop_ID_sub$Lin)

## add ED (outgroup): taken from the full table and labelled by hand
Pop_ID_ED <- subset(Pop_ID_Sum, Pop == 'ED')
Pop_ID_ED$Lin <- 'outgroup'
Pop_ID_sub <- rbind(Pop_ID_sub,Pop_ID_ED)

# sanity check: rows in the PCA subset vs. rows kept (subset + outgroup)
print(nrow(pca_sub_df))
print(nrow(Pop_ID_sub))

#### Write it out! 
# list of individuals to keep (single INDV column), plus the matching metadata table
keep_sub <- data.frame(INDV=as.character(Pop_ID_sub$All))
write.table(keep_sub,'keep_sub.txt',row.names=FALSE,quote=FALSE)
write.csv(Pop_ID_sub,'Pop_ID_sub.csv',row.names = FALSE)

Joining, by = "All"
[1] 160
[1] 163


### Make vcf with everyone (including hybrids), restricted to individuals with known var/ssp information 

In [26]:
# Keep only the individuals listed in keep_sub.txt and write ERNA_sub.recode.vcf.
# Site filters: --max-missing 0.7 and --maf 0.02; --recode-INFO-all keeps the INFO fields.
!$vcftools --gzvcf $vcf_file \
--max-missing 0.7 \
--maf 0.02 \
--recode \
--recode-INFO-all \
--keep 'keep_sub.txt' \
--out 'ERNA_sub'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_sub.txt
	--recode-INFO-all
	--maf 0.02
	--max-missing 0.7
	--out ERNA_sub
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 163 out of 589 Individuals
Outputting VCF file...
After filtering, kept 18549 out of a possible 23306 Sites
Run Time = 11.00 seconds


In [27]:
vcf_sub = 'ERNA_sub.recode.vcf'
vcf_sub_gz = vcf_sub + ".gz"
!$bgzip -c {vcf_sub} > {vcf_sub_gz}
!$tabix {vcf_sub_gz}

In [28]:
!rm 'ERNA_sub.log'
!rm 'ERNA_sub.recode.vcf'
!rm 'ERNA_sub.recode.vcf.gz.tbi'

### Make vcf sub no hybrids

In [32]:
%%R
Pop_ID_sub <- read.csv('Pop_ID_sub.csv')
# Drop hybrids with a logical mask instead of `-which(...)`: if no row were labelled
# 'hybrid', `-which()` would be integer(0) and select ZERO rows rather than all rows.
# `%in%` never returns NA, so rows with a missing Lin are kept, as before.
Pop_ID_sub_noHyb <- Pop_ID_sub[!(Pop_ID_sub$Lin %in% 'hybrid'),]
print(dim(Pop_ID_sub_noHyb))

# write file with individuals to keep 
keep_sub_noHyb <- data.frame(INDV=as.character(Pop_ID_sub_noHyb$All))
write.table(keep_sub_noHyb,'keep_sub_noHyb.txt',row.names=FALSE,quote=FALSE)
write.csv(Pop_ID_sub_noHyb,'Pop_ID_sub_noHyb.csv',row.names = FALSE)

[1] 131  11


In [36]:
# Same filters as the with-hybrids run, but keeping only the individuals listed in
# keep_sub_noHyb.txt; writes ERNA_sub_noHyb.recode.vcf.
!$vcftools --gzvcf $vcf_file \
--max-missing 0.7 \
--maf 0.02 \
--recode \
--recode-INFO-all \
--keep 'keep_sub_noHyb.txt' \
--out 'ERNA_sub_noHyb'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_sub_noHyb.txt
	--recode-INFO-all
	--maf 0.02
	--max-missing 0.7
	--out ERNA_sub_noHyb
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 131 out of 589 Individuals
Outputting VCF file...
After filtering, kept 18268 out of a possible 23306 Sites
Run Time = 10.00 seconds


In [37]:
vcf_sub_noHyb = 'ERNA_sub_noHyb.recode.vcf'
vcf_sub_noHyb_gz = vcf_sub_noHyb + ".gz"
!$bgzip -c {vcf_sub_noHyb} > {vcf_sub_noHyb_gz}
!$tabix {vcf_sub_noHyb_gz}

In [38]:
!rm 'ERNA_sub_noHyb.log'
!rm 'ERNA_sub_noHyb.recode.vcf'
!rm 'ERNA_sub_noHyb.recode.vcf.gz.tbi'

# Run IQtree

### vcf sub (with hybrids)

In [39]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo


In [40]:
!mkdir iqtree

In [41]:
iqtree_dir = os.path.join(root,'iqtree')

In [42]:
!cp 'subset_vcf/ERNA_sub.recode.vcf.gz' $iqtree_dir

In [43]:
cd $iqtree_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/iqtree


In [44]:
!gunzip 'ERNA_sub.recode.vcf.gz'

In [45]:
vcf2phy = '/data/gpfs/assoc/parchmanlab/tfaske/src/phylo/./vcf2phylip.py'

In [46]:
vcf_sub = os.path.join(iqtree_dir,'ERNA_sub.recode.vcf')

In [47]:
!python $vcf2phy --i $vcf_sub


Converting file '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/iqtree/ERNA_sub.recode.vcf':

Number of samples in VCF: 163
Total of genotypes processed: 18549
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 18549

Sample 1 of 163, 'EN_AR_2', added to the nucleotide matrix(ces).
Sample 2 of 163, 'EN_AR_3', added to the nucleotide matrix(ces).
Sample 3 of 163, 'EN_AR_4', added to the nucleotide matrix(ces).
Sample 4 of 163, 'EN_AR_5', added to the nucleotide matrix(ces).
Sample 5 of 163, 'EN_AR_6', added to the nucleotide matrix(ces).
Sample 6 of 163, 'EN_AS_18', added to the nucleotide matrix(ces).
Sample 7 of 163, 'EN_AS_20', added to the nucleotide matrix(ces).
Sample 8 of 163, 'EN_AS_23', added to the nucleotide matrix(ces).
Sample 9 of 163, 'EN_AS_24', added to the nucleotide matrix(ces).
Sample 10 of 163, 'EN_AS_28', added to the nucleo

Sample 127 of 163, 'EN_RL_6', added to the nucleotide matrix(ces).
Sample 128 of 163, 'EN_RS_1', added to the nucleotide matrix(ces).
Sample 129 of 163, 'EN_RS_2', added to the nucleotide matrix(ces).
Sample 130 of 163, 'EN_RS_4', added to the nucleotide matrix(ces).
Sample 131 of 163, 'EN_RS_5', added to the nucleotide matrix(ces).
Sample 132 of 163, 'EN_RS_6', added to the nucleotide matrix(ces).
Sample 133 of 163, 'EN_RS_7', added to the nucleotide matrix(ces).
Sample 134 of 163, 'EN_RT_1', added to the nucleotide matrix(ces).
Sample 135 of 163, 'EN_RT_2', added to the nucleotide matrix(ces).
Sample 136 of 163, 'EN_SJ_10', added to the nucleotide matrix(ces).
Sample 137 of 163, 'EN_SJ_12', added to the nucleotide matrix(ces).
Sample 138 of 163, 'EN_SJ_1', added to the nucleotide matrix(ces).
Sample 139 of 163, 'EN_SJ_2', added to the nucleotide matrix(ces).
Sample 140 of 163, 'EN_SJ_8', added to the nucleotide matrix(ces).
Sample 141 of 163, 'EN_SJ_9', added to the nucleotide matrix

#### NOTE: the first IQ-TREE run hit invariant sites in the alignment (a problem for the `+ASC` model); rerun with the new `.varsites.phy` file

In [58]:
#phy_file = os.path.join(iqtree_dir,'ERNA_sub.recode.min4.phy')
phy_file = os.path.join(iqtree_dir,'ERNA_sub.recode.min4.phy.varsites.phy')

In [59]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'iqtree' # NOTE: currently unused -- it is not passed to write_iqtree_sh()
time = '4-00:00:00' # time limit: 4 days (days-hours:minutes:seconds)
cpus = 16 # written to both --cpus-per-task and iqtree -nt
mem_cpu = 9000 # written to --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

### for iqtree
model = 'GTR+ASC' # passed to iqtree -m
bb = 1000 # bootstrap replicates, passed to iqtree -bb

In [60]:
def write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy,model,bb,
                    script_name="run_sub_iqtree.sh",jobname="iqtree_sub"):
    """Write a SLURM batch script that runs IQ-TREE on one PHYLIP alignment.

    The script is written to ``script_name`` in the current working directory,
    overwriting any existing file of that name. Nothing is submitted.

    Parameters
    ----------
    account, partition, time : str
        Values for the ``--account``, ``--partition`` and ``--time`` directives.
    cpus : int
        Used for both ``--cpus-per-task`` and ``iqtree -nt``.
    mem_cpu : int
        Value for ``--mem-per-cpu``.
    email : str
        Address for the ``--mail-user`` directive (mail is sent at job END).
    iqtree_dir : str
        Directory the job changes into before running iqtree.
    phy : str
        Alignment file passed to ``iqtree -s``.
    model : str
        Substitution model passed to ``iqtree -m``.
    bb : int
        Bootstrap replicate count passed to ``iqtree -bb``.
    script_name : str, optional
        File name of the generated script. The default keeps the original name.
    jobname : str, optional
        SLURM job name; the job's output file is ``output_<jobname>.txt``.
        The default keeps the original job and output names.
    """
    # Coerce once so SLURM's CPU allocation and iqtree's thread count always agree.
    cpus = int(cpus)

    # Built line by line so the whitespace of the generated script is explicit.
    template = (
        "#!/usr/bin/env bash\n"
        "#SBATCH --account=%s\n"
        "#SBATCH --partition=%s\n"
        "#SBATCH --time=%s\n"
        "#SBATCH --ntasks 1\n"
        "#SBATCH --cpus-per-task %d\n"
        "#SBATCH --mem-per-cpu=%d\n"
        "#SBATCH --job-name %s\n"
        "#SBATCH --output output_%s.txt\n"
        "#SBATCH --mail-type=END\n"
        "#SBATCH --mail-user=%s\n"
        "\n"
        "cd %s\n"
        "    \n"
        "iqtree -s %s -nt %d -m %s -st DNA -bb %d \n\n"
    )
    script = template % (account,partition,time,cpus,int(mem_cpu),jobname,jobname,
                         email,iqtree_dir,phy,cpus,model,int(bb))

    with open(script_name, "w") as o:
        o.write(script)
        
        

In [61]:
write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy_file,model,bb)

# Run run_sub_iqtree.sh
    cd /data/gpfs/home/tfaske/d/rabbit/full/phylo/iqtree
    source activate py311
    sbatch run_sub_iqtree.sh

## Run IQtree (no hybrids)

### VCF sub no hybrids

In [62]:
cd $iqtree_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/iqtree


In [63]:
!cp '../subset_vcf/ERNA_sub_noHyb.recode.vcf.gz' .

In [64]:
!gunzip 'ERNA_sub_noHyb.recode.vcf.gz'

In [65]:
vcf2phy = '/data/gpfs/assoc/parchmanlab/tfaske/src/phylo/./vcf2phylip.py'

In [66]:
vcf_sub_noHyb = os.path.join(iqtree_dir,'ERNA_sub_noHyb.recode.vcf')

In [67]:
!python $vcf2phy --i $vcf_sub_noHyb


Converting file '/data/gpfs/assoc/denovo/tfaske/rabbit/full/REDO/phylo/iqtree/ERNA_sub_noHyb.recode.vcf':

Number of samples in VCF: 131
Total of genotypes processed: 18268
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 18268

Sample 1 of 131, 'EN_AR_2', added to the nucleotide matrix(ces).
Sample 2 of 131, 'EN_AR_3', added to the nucleotide matrix(ces).
Sample 3 of 131, 'EN_AR_4', added to the nucleotide matrix(ces).
Sample 4 of 131, 'EN_AR_5', added to the nucleotide matrix(ces).
Sample 5 of 131, 'EN_AR_6', added to the nucleotide matrix(ces).
Sample 6 of 131, 'EN_AS_18', added to the nucleotide matrix(ces).
Sample 7 of 131, 'EN_AS_20', added to the nucleotide matrix(ces).
Sample 8 of 131, 'EN_AS_23', added to the nucleotide matrix(ces).
Sample 9 of 131, 'EN_AS_24', added to the nucleotide matrix(ces).
Sample 10 of 131, 'EN_AS_28', added to the 

Sample 130 of 131, 'EN_VI_4', added to the nucleotide matrix(ces).
Sample 131 of 131, 'EN_VI_5', added to the nucleotide matrix(ces).

PHYLIP matrix saved to: ERNA_sub_noHyb.recode.min4.phy

Done!



#### NOTE: the first IQ-TREE run hit invariant sites in the alignment (a problem for the `+ASC` model); rerun with the new `.varsites.phy` file

In [72]:
#phy_file = os.path.join(iqtree_dir,'ERNA_sub_noHyb.recode.min4.phy')
phy_file = os.path.join(iqtree_dir,'ERNA_sub_noHyb.recode.min4.phy.varsites.phy')

In [73]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'iqtree' # NOTE: currently unused -- it is not passed to write_iqtree_sh()
time = '4-00:00:00' # time limit: 4 days (days-hours:minutes:seconds)
cpus = 16 # written to both --cpus-per-task and iqtree -nt
mem_cpu = 9000 # written to --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

### for iqtree
model = 'GTR+ASC' # passed to iqtree -m
bb = 1000 # bootstrap replicates, passed to iqtree -bb

In [74]:
def write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy,model,bb,
                    script_name="run_sub_noHyb_iqtree.sh",jobname="iqtree_sub_noHyb"):
    """Write a SLURM batch script that runs IQ-TREE on the no-hybrids alignment.

    The script is written to ``script_name`` in the current working directory,
    overwriting any existing file of that name. Nothing is submitted.

    Parameters
    ----------
    account, partition, time : str
        Values for the ``--account``, ``--partition`` and ``--time`` directives.
    cpus : int
        Used for both ``--cpus-per-task`` and ``iqtree -nt``.
    mem_cpu : int
        Value for ``--mem-per-cpu``.
    email : str
        Address for the ``--mail-user`` directive (mail is sent at job END).
    iqtree_dir : str
        Directory the job changes into before running iqtree.
    phy : str
        Alignment file passed to ``iqtree -s``.
    model : str
        Substitution model passed to ``iqtree -m``.
    bb : int
        Bootstrap replicate count passed to ``iqtree -bb``.
    script_name : str, optional
        File name of the generated script. The default keeps the original name.
    jobname : str, optional
        SLURM job name; the job's output file is ``output_<jobname>.txt``.
        The default keeps the original job and output names.
    """
    # Coerce once so SLURM's CPU allocation and iqtree's thread count always agree.
    n_threads = int(cpus)

    # Built line by line so the whitespace of the generated script is explicit.
    lines = [
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s" % account,
        "#SBATCH --partition=%s" % partition,
        "#SBATCH --time=%s" % time,
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d" % n_threads,
        "#SBATCH --mem-per-cpu=%d" % int(mem_cpu),
        "#SBATCH --job-name %s" % jobname,
        "#SBATCH --output output_%s.txt" % jobname,
        "#SBATCH --mail-type=END",
        "#SBATCH --mail-user=%s" % email,
        "",
        "cd %s" % iqtree_dir,
        "    ",
        "iqtree -s %s -nt %d -m %s -st DNA -bb %d " % (phy,n_threads,model,int(bb)),
    ]

    with open(script_name, "w") as o:
        o.write("\n".join(lines) + "\n\n")
        
        

In [75]:
write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy_file,model,bb)