# Subset vcf file with outgroup for phylo   

### move over ERNA_final and pca_df from local and SUBSET 

#### Make 2 vcfs, one with everyone known. Another with everyone known minus hybrids


use preferred conda env  
**Packages needed**: vcftools, bgzip, tabix

In [19]:
import sys
import ipyparallel as ipp
import os
from os import environ
import gzip
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import glob
import re
import random

In [20]:
vcftools = "vcftools"
bcftools = "bcftools"
bgzip = "bgzip"
tabix = "tabix"

In [21]:
root = '/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo'

In [22]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [23]:
analysis_dir = os.path.join(root,'subset_vcf')

In [24]:
cd $analysis_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf


In [25]:
!cp ../filtering/good_snps.recode.vcf.gz . 

In [26]:
vcf_file = os.path.join(analysis_dir, "good_snps.recode.vcf.gz")
assert os.path.exists(vcf_file)
vcf_file

'/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz'

### Make pop_id and join it with relevant info from pca_df and ERNA final

In [10]:
%load_ext rpy2.ipython

In [12]:
%%R
library(tidyverse)

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf')

In [15]:
%%R
# Create the Pop_ID table: one row per individual listed in the .012.indv
# file. Sample names are split on "_"; field 2 becomes Pop, field 3 becomes
# ID, and the full name is kept in All.

indv <- read.table("../filtering/good_snps.recode.vcf.012.indv", sep = "\t")

# Vectorised split instead of a 1:nrow() loop: with an empty table that loop
# would still run once and fabricate an all-NA row.
All <- as.character(indv$V1)
name_parts <- strsplit(All, "_")
Pop <- vapply(name_parts, function(x) x[2], character(1))
ID <- vapply(name_parts, function(x) x[3], character(1))

Pop_ID <- data.frame(Pop = Pop, ID = ID, All = All)
print(head(Pop_ID))
print(dim(Pop_ID))

#write.csv(Pop_ID,'Pop_ID.csv',row.names=F)

  Pop ID      All
1  AH 10 EN_AH_10
2  AH 11 EN_AH_11
3  AH 12 EN_AH_12
4  AH 13 EN_AH_13
5  AH 14 EN_AH_14
6  AH 15 EN_AH_15
[1] 588   3


In [44]:
%%R
# Attach the metadata in ERNA_final_sample_list.csv to every individual.
ERNAfinal <- read.csv('ERNA_final_sample_list.csv')

# The join key is stated explicitly so that a future shared column cannot
# silently become part of the key ("Pop" is the key dplyr was inferring).
Pop_ID_Sum <- left_join(Pop_ID, ERNAfinal, by = "Pop")
Pop_ID_Sum$Ssp <- as.character(Pop_ID_Sum$Ssp)
Pop_ID_Sum$Variety <- as.character(Pop_ID_Sum$Variety)

### Add E.discoidea Ssp
# For population 'ED' the Ssp column is filled from Variety.
ed_rows <- which(Pop_ID_Sum$Pop == 'ED')
Pop_ID_Sum$Ssp[ed_rows] <- Pop_ID_Sum$Variety[ed_rows]

#print(dim(Pop_ID_Sum))
#print(head(Pop_ID_Sum))
#print(Pop_ID_Sum[which(Pop_ID_Sum$Pop == 'ED'),])

R[write to console]: [1m[22mJoining, by = "Pop"



In [45]:
%%R

## add pca_df info
pca_df <- read.csv('pca_df.csv')

#head(pca_df)

# Keep only the sample name plus the A1 and Color columns, renamed to
# Anc and Lineage.
pca_df <- pca_df[c('All', 'A1', 'Color')]
names(pca_df) <- c('All', 'Anc', 'Lineage')

# Join on the sample name; the key is explicit so that no other shared
# column can silently become part of it.
Pop_ID_Sum <- left_join(Pop_ID_Sum, pca_df, by = "All")
Pop_ID_Sum$Lineage <- as.character(Pop_ID_Sum$Lineage)

# Individuals of population 'ED' are labelled as the outgroup lineage.
Pop_ID_Sum$Lineage[which(Pop_ID_Sum$Pop == 'ED')] <- 'outgroup'

#### Write it out!
write.csv(Pop_ID_Sum, 'Pop_ID_Sum.csv', row.names = FALSE)

R[write to console]: [1m[22mJoining, by = "All"



### Make vcf with everyone (including hybrids) that has known information for var/ssp

In [52]:
%%R
# Keep every individual whose subspecies (Ssp) is known.
# A logical mask is used instead of df[-which(is.na(...)), ]: when no value
# is NA, -which() is an empty index and would drop every row.
Pop_ID_known <- Pop_ID_Sum[!is.na(Pop_ID_Sum$Ssp), ]
print(dim(Pop_ID_known))

# write file with individuals to keep
# (one sample name per line under an "INDV" header; read by vcftools --keep)
keep_all <- data.frame(INDV = as.character(Pop_ID_known$All))
write.table(keep_all, 'keep_known.txt', row.names = FALSE, quote = FALSE)
write.csv(Pop_ID_known, 'Pop_ID_known.csv', row.names = FALSE)

[1] 513  12


In [53]:
!$vcftools --gzvcf $vcf_file \
--recode \
--recode-INFO-all \
--keep 'keep_known.txt' \
--out 'ERNA_known'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/home/tfaske/d/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_known.txt
	--recode-INFO-all
	--out ERNA_known
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 513 out of 588 Individuals
Outputting VCF file...
After filtering, kept 23832 out of a possible 23832 Sites
Run Time = 25.00 seconds


In [55]:
vcf_known = 'ERNA_known.recode.vcf'
vcf_known_gz = vcf_known + ".gz"
!$bgzip -c {vcf_known} > {vcf_known_gz}
!$tabix {vcf_known_gz}

In [56]:
!rm 'ERNA_known.log'
!rm 'ERNA_known.recode.vcf'
!rm 'ERNA_known.recode.vcf.gz.tbi'

### Make vcf with only pure subspecies (no hybrids) that have known information for var/ssp

In [59]:
%%R
# Keep individuals with a known subspecies (Ssp) whose Lineage is not
# 'hybrid'. Logical masks replace df[-which(...), ]: when nothing matches,
# -which() is an empty index and would drop every row.
# %in% treats a missing Lineage as "not hybrid", so such rows are kept,
# as they were with which().
has_ssp <- !is.na(Pop_ID_Sum$Ssp)
is_hybrid <- Pop_ID_Sum$Lineage %in% 'hybrid'
Pop_ID_noHybrid <- Pop_ID_Sum[has_ssp & !is_hybrid, ]
print(dim(Pop_ID_noHybrid))

# write file with individuals to keep
# (one sample name per line under an "INDV" header; read by vcftools --keep)
keep_noHybrid <- data.frame(INDV = as.character(Pop_ID_noHybrid$All))
write.table(keep_noHybrid, 'keep_noHybrid.txt', row.names = FALSE, quote = FALSE)
write.csv(Pop_ID_noHybrid, 'Pop_ID_noHybrid.csv', row.names = FALSE)

[1] 391  12


In [60]:
!$vcftools --gzvcf $vcf_file \
--recode \
--recode-INFO-all \
--keep 'keep_noHybrid.txt' \
--out 'ERNA_noHybrid'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/home/tfaske/d/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_noHybrid.txt
	--recode-INFO-all
	--out ERNA_noHybrid
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 391 out of 588 Individuals
Outputting VCF file...
After filtering, kept 23832 out of a possible 23832 Sites
Run Time = 20.00 seconds


In [61]:
vcf_noHybrid = 'ERNA_noHybrid.recode.vcf'
vcf_noHybrid_gz = vcf_noHybrid + ".gz"
!$bgzip -c {vcf_noHybrid} > {vcf_noHybrid_gz}
!$tabix {vcf_noHybrid_gz}

In [62]:
!rm 'ERNA_noHybrid.log'
!rm 'ERNA_noHybrid.recode.vcf'
!rm 'ERNA_noHybrid.recode.vcf.gz.tbi'

# Run IQtree

### for each pop_known, select 3 individuals and run iqtree

In [67]:
%%R 

# Thin the known-taxonomy table to at most 3 individuals per population.
# Individuals are drawn at random and no seed is set, so every run of this
# cell picks a different subset.
Pop_ID_known <- read.csv('Pop_ID_known.csv')
#table(Pop_ID_known$Pop)
Pop_ID_known$Pop <- as.character(Pop_ID_known$Pop)

pop_known <- unique(Pop_ID_known$Pop)
n_keep <- 3

# Build one data frame per population and bind them once at the end,
# instead of growing the result with rbind() inside a loop from a dummy
# first row that has to be removed afterwards.
per_pop <- lapply(pop_known, function(p) {
  pop_rows <- Pop_ID_known[which(Pop_ID_known$Pop == p), ]
  if (nrow(pop_rows) >= n_keep) {
    pop_rows <- pop_rows[sample(seq_len(nrow(pop_rows)), n_keep), ]
  }
  # populations with fewer than n_keep individuals are kept whole
  pop_rows
})

# The zero-row template keeps the column layout even when there is nothing
# to bind.
Pop_ID_known_iqtree <- do.call(rbind, c(list(Pop_ID_known[0, ]), per_pop))

In [72]:
%%R 
# Save the subsample twice: the sample names alone (the file given to
# vcftools --keep) and the full metadata rows for the same individuals.
kept_names <- as.character(Pop_ID_known_iqtree$All)
keep_known_iqtree <- data.frame(INDV = kept_names)
write.table(keep_known_iqtree, file = 'keep_known_iqtree.txt',
            quote = FALSE, row.names = FALSE)
write.csv(Pop_ID_known_iqtree, file = 'Pop_ID_known_iqtree.csv',
          row.names = FALSE)

#print(length(unique(Pop_ID_known_iqtree$Pop)))
#table(Pop_ID_known_iqtree$Pop)
#print(nrow(Pop_ID_known_iqtree))
#table(Pop_ID_known$Pop)

In [73]:
!$vcftools --gzvcf $vcf_file \
--recode \
--recode-INFO-all \
--keep 'keep_known_iqtree.txt' \
--out 'ERNA_known_iqtree'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/home/tfaske/d/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_known_iqtree.txt
	--recode-INFO-all
	--out ERNA_known_iqtree
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 182 out of 588 Individuals
Outputting VCF file...
After filtering, kept 23832 out of a possible 23832 Sites
Run Time = 13.00 seconds


In [74]:
vcf_known_iqtree = 'ERNA_known_iqtree.recode.vcf'
vcf_known_iqtree_gz = vcf_known_iqtree + ".gz"
!$bgzip -c {vcf_known_iqtree} > {vcf_known_iqtree_gz}
!$tabix {vcf_known_iqtree_gz}

In [76]:
!rm 'ERNA_known_iqtree.log'
!rm 'ERNA_known_iqtree.recode.vcf'
!rm 'ERNA_known_iqtree.recode.vcf.gz.tbi'

## Run iqtree

In [97]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [98]:
!mkdir iqtree

In [124]:
iqtree_dir = os.path.join(root,'iqtree')

In [100]:
!cp 'subset_vcf/ERNA_known_iqtree.recode.vcf.gz' $iqtree_dir

In [125]:
cd $iqtree_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree


In [102]:
!gunzip 'ERNA_known_iqtree.recode.vcf.gz'

In [106]:
vcf2phy = '/data/gpfs/assoc/parchmanlab/tfaske/src/phylo/./vcf2phylip.py'

In [126]:
vcf_iqtree = os.path.join(iqtree_dir,'ERNA_known_iqtree.recode.vcf')

In [115]:
!python $vcf2phy --i $vcf_iqtree


Converting file '/data/gpfs/home/tfaske/d/rabbit/full/phylo/iqtree/ERNA_known_iqtree.recode.vcf':

Number of samples in VCF: 182
Total of genotypes processed: 23832
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 23832

Sample 1 of 182, 'EN_AH_10', added to the nucleotide matrix(ces).
Sample 2 of 182, 'EN_AH_15', added to the nucleotide matrix(ces).
Sample 3 of 182, 'EN_AH_3', added to the nucleotide matrix(ces).
Sample 4 of 182, 'EN_AR_4', added to the nucleotide matrix(ces).
Sample 5 of 182, 'EN_AR_5', added to the nucleotide matrix(ces).
Sample 6 of 182, 'EN_AR_6', added to the nucleotide matrix(ces).
Sample 7 of 182, 'EN_AS_18', added to the nucleotide matrix(ces).
Sample 8 of 182, 'EN_AS_22', added to the nucleotide matrix(ces).
Sample 9 of 182, 'EN_AS_26', added to the nucleotide matrix(ces).
Sample 10 of 182, 'EN_BC_12', added to the nucleot

Sample 121 of 182, 'EN_OO_2', added to the nucleotide matrix(ces).
Sample 122 of 182, 'EN_OO_4', added to the nucleotide matrix(ces).
Sample 123 of 182, 'EN_OO_6', added to the nucleotide matrix(ces).
Sample 124 of 182, 'EN_OT_1', added to the nucleotide matrix(ces).
Sample 125 of 182, 'EN_OT_3', added to the nucleotide matrix(ces).
Sample 126 of 182, 'EN_OT_4', added to the nucleotide matrix(ces).
Sample 127 of 182, 'EN_PB_2', added to the nucleotide matrix(ces).
Sample 128 of 182, 'EN_PB_3', added to the nucleotide matrix(ces).
Sample 129 of 182, 'EN_PB_5', added to the nucleotide matrix(ces).
Sample 130 of 182, 'EN_PL_16', added to the nucleotide matrix(ces).
Sample 131 of 182, 'EN_PL_22', added to the nucleotide matrix(ces).
Sample 132 of 182, 'EN_PL_26', added to the nucleotide matrix(ces).
Sample 133 of 182, 'EN_PT_13', added to the nucleotide matrix(ces).
Sample 134 of 182, 'EN_PT_1', added to the nucleotide matrix(ces).
Sample 135 of 182, 'EN_PT_4', added to the nucleotide matr

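#### NOTE: the first run stopped because the alignment contains invariant sites, which the +ASC model does not allow. Rerun with the `.varsites.phy` file (variable sites only) that IQ-TREE wrote during that run.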

In [151]:
phy_file = os.path.join(iqtree_dir,'ERNA_known_iqtree.recode.min4.phy.varsites.phy')

In [152]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'iqtree' # not passed to write_iqtree_sh, which hard-codes "--job-name iqtree"
time = '4-00:00:00' # time limit: 4 days (SLURM days-hours:minutes:seconds)
cpus = 16 # used for --cpus-per-task and for iqtree -nt
mem_cpu = 9000 # value for --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

### for iqtree
model = 'GTR+ASC' # passed to iqtree -m
bb = 1000 # passed to iqtree -bb (bootstrap replicates)

In [153]:
def write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy,model,bb):
    """Write the SLURM batch script ``run_known_iqtree.sh`` to the current directory.

    The script requests a single task, changes into ``iqtree_dir`` and runs
    ``iqtree`` on the alignment ``phy``. Job name and output file name are fixed.

    account, partition, time -- values for --account, --partition and --time
    cpus       -- cores per task (converted with int); also given to iqtree -nt
    mem_cpu    -- value for --mem-per-cpu (converted with int)
    email      -- address for --mail-user (mail type is END)
    iqtree_dir -- directory the job changes into before running iqtree
    phy        -- alignment file given to iqtree -s
    model      -- model string given to iqtree -m
    bb         -- integer given to iqtree -bb
    """
    script_lines = [
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s",
        "#SBATCH --partition=%s",
        "#SBATCH --time=%s",
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d",
        "#SBATCH --mem-per-cpu=%d",
        "#SBATCH --job-name iqtree",
        "#SBATCH --output output_iqtree.txt",
        "#SBATCH --mail-type=END",
        "#SBATCH --mail-user=%s",
        "",
        "cd %s",
        "    ",
        "iqtree -s %s -nt %d -m %s -st DNA -bb %d ",
        "",
        "",
    ]
    template = "\n".join(script_lines)
    with open("run_known_iqtree.sh", "w") as o:
        # cpus appears twice: once for SLURM, once for iqtree's thread count
        n_threads = int(cpus)
        values = (account, partition, time, n_threads, int(mem_cpu), email,
                  iqtree_dir, phy, int(cpus), model, bb)
        o.write(template % values)
        
        

In [154]:
write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy_file,model,bb)

# Run iqtree.sh
    cd /data/gpfs/home/tfaske/d/rabbit/full/phylo/iqtree
    source activate py311
    sbatch run_known_iqtree.sh

# Run IQtree (no hybrids)

### for each pop_noHybrid, select 3 individuals and run iqtree

In [2]:
%load_ext rpy2.ipython

In [4]:
%%R
library(tidyverse)

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf')

In [5]:
%%R 

# Thin the no-hybrid table to at most 3 individuals per population.
# Individuals are drawn at random and no seed is set, so every run of this
# cell picks a different subset.
Pop_ID_noHybrid <- read.csv('Pop_ID_noHybrid.csv')
#table(Pop_ID_noHybrid$Pop)
Pop_ID_noHybrid$Pop <- as.character(Pop_ID_noHybrid$Pop)

pop_noHybrid <- unique(Pop_ID_noHybrid$Pop)
n_keep <- 3

# Build one data frame per population and bind them once at the end,
# instead of growing the result with rbind() inside a loop from a dummy
# first row that has to be removed afterwards.
per_pop <- lapply(pop_noHybrid, function(p) {
  pop_rows <- Pop_ID_noHybrid[which(Pop_ID_noHybrid$Pop == p), ]
  if (nrow(pop_rows) >= n_keep) {
    pop_rows <- pop_rows[sample(seq_len(nrow(pop_rows)), n_keep), ]
  }
  # populations with fewer than n_keep individuals are kept whole
  pop_rows
})

# The zero-row template keeps the column layout even when there is nothing
# to bind.
Pop_ID_noHybrid_iqtree <- do.call(rbind, c(list(Pop_ID_noHybrid[0, ]), per_pop))

In [9]:
%%R 

print(dim(Pop_ID_noHybrid_iqtree))
print(length(unique(Pop_ID_noHybrid_iqtree$Pop)))

[1] 135  12
[1] 47


In [10]:
%%R 
# Save the subsample twice: the sample names alone (the file given to
# vcftools --keep) and the full metadata rows for the same individuals.
kept_names <- as.character(Pop_ID_noHybrid_iqtree$All)
keep_noHybrid_iqtree <- data.frame(INDV = kept_names)
write.table(keep_noHybrid_iqtree, file = 'keep_noHybrid_iqtree.txt',
            quote = FALSE, row.names = FALSE)
write.csv(Pop_ID_noHybrid_iqtree, file = 'Pop_ID_noHybrid_iqtree.csv',
          row.names = FALSE)

#print(length(unique(Pop_ID_noHybrid_iqtree$Pop)))
#table(Pop_ID_noHybrid_iqtree$Pop)
#print(nrow(Pop_ID_noHybrid_iqtree))
#table(Pop_ID_noHybrid$Pop)

In [27]:
!$vcftools --gzvcf $vcf_file \
--recode \
--recode-INFO-all \
--keep 'keep_noHybrid_iqtree.txt' \
--out 'ERNA_noHybrid_iqtree'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_noHybrid_iqtree.txt
	--recode-INFO-all
	--out ERNA_noHybrid_iqtree
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 135 out of 588 Individuals
Outputting VCF file...
After filtering, kept 23832 out of a possible 23832 Sites
Run Time = 11.00 seconds


In [28]:
vcf_noHybrid_iqtree = 'ERNA_noHybrid_iqtree.recode.vcf'
vcf_noHybrid_iqtree_gz = vcf_noHybrid_iqtree + ".gz"
!$bgzip -c {vcf_noHybrid_iqtree} > {vcf_noHybrid_iqtree_gz}
!$tabix {vcf_noHybrid_iqtree_gz}

In [29]:
!rm 'ERNA_noHybrid_iqtree.log'
!rm 'ERNA_noHybrid_iqtree.recode.vcf'
!rm 'ERNA_noHybrid_iqtree.recode.vcf.gz.tbi'

## Run iqtree

In [30]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [31]:
!mkdir iqtree

mkdir: cannot create directory ‘iqtree’: File exists


In [32]:
iqtree_dir = os.path.join(root,'iqtree')

In [33]:
!cp 'subset_vcf/ERNA_noHybrid_iqtree.recode.vcf.gz' $iqtree_dir

In [34]:
cd $iqtree_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree


In [35]:
!gunzip 'ERNA_noHybrid_iqtree.recode.vcf.gz'

In [36]:
vcf2phy = '/data/gpfs/assoc/parchmanlab/tfaske/src/phylo/./vcf2phylip.py'

In [37]:
vcf_iqtree = os.path.join(iqtree_dir,'ERNA_noHybrid_iqtree.recode.vcf')

In [38]:
!python $vcf2phy --i $vcf_iqtree


Converting file '/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree/ERNA_noHybrid_iqtree.recode.vcf':

Number of samples in VCF: 135
Total of genotypes processed: 23832
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 23832

Sample 1 of 135, 'EN_AH_12', added to the nucleotide matrix(ces).
Sample 2 of 135, 'EN_AH_2', added to the nucleotide matrix(ces).
Sample 3 of 135, 'EN_AH_5', added to the nucleotide matrix(ces).
Sample 4 of 135, 'EN_AR_2', added to the nucleotide matrix(ces).
Sample 5 of 135, 'EN_AR_3', added to the nucleotide matrix(ces).
Sample 6 of 135, 'EN_AR_6', added to the nucleotide matrix(ces).
Sample 7 of 135, 'EN_AS_16', added to the nucleotide matrix(ces).
Sample 8 of 135, 'EN_AS_23', added to the nucleotide matrix(ces).
Sample 9 of 135, 'EN_AS_27', added to the nucleotide matrix(ces).
Sample 10 of 135, 'EN_BC_10', added to the

Sample 124 of 135, 'EN_UT_1', added to the nucleotide matrix(ces).
Sample 125 of 135, 'EN_UT_2', added to the nucleotide matrix(ces).
Sample 126 of 135, 'EN_UT_3', added to the nucleotide matrix(ces).
Sample 127 of 135, 'EN_VM_11', added to the nucleotide matrix(ces).
Sample 128 of 135, 'EN_VM_13', added to the nucleotide matrix(ces).
Sample 129 of 135, 'EN_VM_4', added to the nucleotide matrix(ces).
Sample 130 of 135, 'EN_WA_1', added to the nucleotide matrix(ces).
Sample 131 of 135, 'EN_WA_3', added to the nucleotide matrix(ces).
Sample 132 of 135, 'EN_WA_6', added to the nucleotide matrix(ces).
Sample 133 of 135, 'EN_YL_3', added to the nucleotide matrix(ces).
Sample 134 of 135, 'EN_YL_5', added to the nucleotide matrix(ces).
Sample 135 of 135, 'EN_YL_6', added to the nucleotide matrix(ces).

PHYLIP matrix saved to: ERNA_noHybrid_iqtree.recode.min4.phy

Done!



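#### NOTE: the first run stopped because the alignment contains invariant sites, which the +ASC model does not allow. Rerun with the `.varsites.phy` file (variable sites only) that IQ-TREE wrote during that run.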

In [43]:
#phy_file = os.path.join(iqtree_dir,'ERNA_noHybrid_iqtree.recode.min4.phy')
phy_file = os.path.join(iqtree_dir,'ERNA_noHybrid_iqtree.recode.min4.phy.varsites.phy')

In [44]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'iqtree' # not passed to write_iqtree_sh, which hard-codes "--job-name iqtree"
time = '4-00:00:00' # time limit: 4 days (SLURM days-hours:minutes:seconds)
cpus = 16 # used for --cpus-per-task and for iqtree -nt
mem_cpu = 9000 # value for --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

### for iqtree
model = 'GTR+ASC' # passed to iqtree -m
bb = 1000 # passed to iqtree -bb (bootstrap replicates)

In [45]:
def write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy,model,bb):
    """Write the SLURM batch script ``run_noHybrid_iqtree.sh`` to the current directory.

    The script requests a single task, changes into ``iqtree_dir`` and runs
    ``iqtree`` on the alignment ``phy``. Job name and output file name are fixed.

    account, partition, time -- values for --account, --partition and --time
    cpus       -- cores per task (converted with int); also given to iqtree -nt
    mem_cpu    -- value for --mem-per-cpu (converted with int)
    email      -- address for --mail-user (mail type is END)
    iqtree_dir -- directory the job changes into before running iqtree
    phy        -- alignment file given to iqtree -s
    model      -- model string given to iqtree -m
    bb         -- integer given to iqtree -bb
    """
    script_lines = [
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s",
        "#SBATCH --partition=%s",
        "#SBATCH --time=%s",
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d",
        "#SBATCH --mem-per-cpu=%d",
        "#SBATCH --job-name iqtree",
        "#SBATCH --output output_iqtree_noHybrid.txt",
        "#SBATCH --mail-type=END",
        "#SBATCH --mail-user=%s",
        "",
        "cd %s",
        "    ",
        "iqtree -s %s -nt %d -m %s -st DNA -bb %d ",
        "",
        "",
    ]
    template = "\n".join(script_lines)
    with open("run_noHybrid_iqtree.sh", "w") as o:
        # cpus appears twice: once for SLURM, once for iqtree's thread count
        n_threads = int(cpus)
        values = (account, partition, time, n_threads, int(mem_cpu), email,
                  iqtree_dir, phy, int(cpus), model, bb)
        o.write(template % values)
        
        

In [46]:
write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy_file,model,bb)

# Run IQtree (no hybrids / more individuals)

### for each pop_noHybrid, select 8 individuals (populations with 8 or fewer keep all of them) and run iqtree

In [65]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [66]:
%%R
library(tidyverse)

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf')

In [67]:
%%R 

Pop_ID_noHybrid <- read.csv('Pop_ID_noHybrid.csv')
table(Pop_ID_noHybrid$Pop)


AH AR AS BC BH BM BO BT BV CH CI CL CN CO CT DC DH DT ED EW FR GB GO HO JC LO 
15  5 15  7  5 14  6  6 15  6  5  6  5  5  6 14 15  6  3 13 15  1  6 14 15  5 
LR LT LV MD NI NV OT PB PL PT PW RL RO RS SC SJ SS UT VM WA YL 
 7  4 12 15  6  6  1  6 14  7  8  6  1  6 14 12 10  5 15  3  5 


In [69]:
%%R 

# Thin the no-hybrid table to at most 8 individuals per population.
# Individuals are drawn at random and no seed is set, so every run of this
# cell picks a different subset.
Pop_ID_noHybrid <- read.csv('Pop_ID_noHybrid.csv')
#table(Pop_ID_noHybrid$Pop)
Pop_ID_noHybrid$Pop <- as.character(Pop_ID_noHybrid$Pop)

pop_noHybrid <- unique(Pop_ID_noHybrid$Pop)
n_keep <- 8

# Build one data frame per population and bind them once at the end,
# instead of growing the result with rbind() inside a loop from a dummy
# first row that has to be removed afterwards.
per_pop <- lapply(pop_noHybrid, function(p) {
  pop_rows <- Pop_ID_noHybrid[which(Pop_ID_noHybrid$Pop == p), ]
  # Only populations with more than n_keep individuals (9 or more) are
  # subsampled; a population of exactly 8 is kept whole, in file order.
  if (nrow(pop_rows) > n_keep) {
    pop_rows <- pop_rows[sample(seq_len(nrow(pop_rows)), n_keep), ]
  }
  pop_rows
})

# The zero-row template keeps the column layout even when there is nothing
# to bind.
Pop_ID_noHybrid_iqtree <- do.call(rbind, c(list(Pop_ID_noHybrid[0, ]), per_pop))

In [70]:
%%R 

print(dim(Pop_ID_noHybrid_iqtree))
print(length(unique(Pop_ID_noHybrid_iqtree$Pop)))

[1] 290  12
[1] 47


In [71]:
%%R 
# Save the 8-per-population subsample twice: the sample names alone (the
# file given to vcftools --keep) and the full metadata rows for the same
# individuals.
kept_names <- as.character(Pop_ID_noHybrid_iqtree$All)
keep_noHybrid_iqtree <- data.frame(INDV = kept_names)
write.table(keep_noHybrid_iqtree, file = 'keep_noHybrid8_iqtree.txt',
            quote = FALSE, row.names = FALSE)
write.csv(Pop_ID_noHybrid_iqtree, file = 'Pop_ID_noHybrid8_iqtree.csv',
          row.names = FALSE)

#print(length(unique(Pop_ID_noHybrid_iqtree$Pop)))
#table(Pop_ID_noHybrid_iqtree$Pop)
#print(nrow(Pop_ID_noHybrid_iqtree))
#table(Pop_ID_noHybrid$Pop)

In [72]:
!$vcftools --gzvcf $vcf_file \
--recode \
--recode-INFO-all \
--keep 'keep_noHybrid8_iqtree.txt' \
--out 'ERNA_noHybrid8_iqtree'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_noHybrid8_iqtree.txt
	--recode-INFO-all
	--out ERNA_noHybrid8_iqtree
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 290 out of 588 Individuals
Outputting VCF file...
After filtering, kept 23832 out of a possible 23832 Sites
Run Time = 17.00 seconds


In [73]:
vcf_noHybrid_iqtree = 'ERNA_noHybrid8_iqtree.recode.vcf'
vcf_noHybrid_iqtree_gz = vcf_noHybrid_iqtree + ".gz"
!$bgzip -c {vcf_noHybrid_iqtree} > {vcf_noHybrid_iqtree_gz}
!$tabix {vcf_noHybrid_iqtree_gz}

In [74]:
!rm 'ERNA_noHybrid8_iqtree.log'
!rm 'ERNA_noHybrid8_iqtree.recode.vcf'
!rm 'ERNA_noHybrid8_iqtree.recode.vcf.gz.tbi'

## Run iqtree

In [75]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [76]:
!mkdir iqtree

mkdir: cannot create directory ‘iqtree’: File exists


In [77]:
iqtree_dir = os.path.join(root,'iqtree')

In [79]:
!cp 'subset_vcf/ERNA_noHybrid8_iqtree.recode.vcf.gz' $iqtree_dir

In [80]:
cd $iqtree_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree


In [81]:
!gunzip 'ERNA_noHybrid8_iqtree.recode.vcf.gz'

In [82]:
vcf2phy = '/data/gpfs/assoc/parchmanlab/tfaske/src/phylo/./vcf2phylip.py'

In [83]:
vcf_iqtree = os.path.join(iqtree_dir,'ERNA_noHybrid8_iqtree.recode.vcf')

In [84]:
!python $vcf2phy --i $vcf_iqtree


Converting file '/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree/ERNA_noHybrid8_iqtree.recode.vcf':

Number of samples in VCF: 290
Total of genotypes processed: 23832
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 23832

Sample 1 of 290, 'EN_AH_10', added to the nucleotide matrix(ces).
Sample 2 of 290, 'EN_AH_11', added to the nucleotide matrix(ces).
Sample 3 of 290, 'EN_AH_14', added to the nucleotide matrix(ces).
Sample 4 of 290, 'EN_AH_1', added to the nucleotide matrix(ces).
Sample 5 of 290, 'EN_AH_4', added to the nucleotide matrix(ces).
Sample 6 of 290, 'EN_AH_7', added to the nucleotide matrix(ces).
Sample 7 of 290, 'EN_AH_8', added to the nucleotide matrix(ces).
Sample 8 of 290, 'EN_AH_9', added to the nucleotide matrix(ces).
Sample 9 of 290, 'EN_AR_2', added to the nucleotide matrix(ces).
Sample 10 of 290, 'EN_AR_3', added to the 

Sample 121 of 290, 'EN_EW_13', added to the nucleotide matrix(ces).
Sample 122 of 290, 'EN_EW_15', added to the nucleotide matrix(ces).
Sample 123 of 290, 'EN_EW_16', added to the nucleotide matrix(ces).
Sample 124 of 290, 'EN_EW_3', added to the nucleotide matrix(ces).
Sample 125 of 290, 'EN_EW_4', added to the nucleotide matrix(ces).
Sample 126 of 290, 'EN_EW_5', added to the nucleotide matrix(ces).
Sample 127 of 290, 'EN_EW_7', added to the nucleotide matrix(ces).
Sample 128 of 290, 'EN_FR_10', added to the nucleotide matrix(ces).
Sample 129 of 290, 'EN_FR_12', added to the nucleotide matrix(ces).
Sample 130 of 290, 'EN_FR_14', added to the nucleotide matrix(ces).
Sample 131 of 290, 'EN_FR_1', added to the nucleotide matrix(ces).
Sample 132 of 290, 'EN_FR_2', added to the nucleotide matrix(ces).
Sample 133 of 290, 'EN_FR_3', added to the nucleotide matrix(ces).
Sample 134 of 290, 'EN_FR_4', added to the nucleotide matrix(ces).
Sample 135 of 290, 'EN_FR_7', added to the nucleotide ma

Sample 246 of 290, 'EN_SC_10', added to the nucleotide matrix(ces).
Sample 247 of 290, 'EN_SC_15', added to the nucleotide matrix(ces).
Sample 248 of 290, 'EN_SC_2', added to the nucleotide matrix(ces).
Sample 249 of 290, 'EN_SC_3', added to the nucleotide matrix(ces).
Sample 250 of 290, 'EN_SC_4', added to the nucleotide matrix(ces).
Sample 251 of 290, 'EN_SC_5', added to the nucleotide matrix(ces).
Sample 252 of 290, 'EN_SC_7', added to the nucleotide matrix(ces).
Sample 253 of 290, 'EN_SC_9', added to the nucleotide matrix(ces).
Sample 254 of 290, 'EN_SJ_10', added to the nucleotide matrix(ces).
Sample 255 of 290, 'EN_SJ_1', added to the nucleotide matrix(ces).
Sample 256 of 290, 'EN_SJ_2', added to the nucleotide matrix(ces).
Sample 257 of 290, 'EN_SJ_5', added to the nucleotide matrix(ces).
Sample 258 of 290, 'EN_SJ_6', added to the nucleotide matrix(ces).
Sample 259 of 290, 'EN_SJ_7', added to the nucleotide matrix(ces).
Sample 260 of 290, 'EN_SJ_8', added to the nucleotide matri

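#### NOTE: the first run stopped because the alignment contains invariant sites, which the +ASC model does not allow. Rerun with the `.varsites.phy` file (variable sites only) that IQ-TREE wrote during that run.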

In [97]:
#phy_file = os.path.join(iqtree_dir,'ERNA_noHybrid8_iqtree.recode.min4.phy')
phy_file = os.path.join(iqtree_dir,'ERNA_noHybrid8_iqtree.recode.min4.phy.varsites.phy')

In [98]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'iqtree' # not passed to write_iqtree_sh, which hard-codes "--job-name iqtree"
time = '4-00:00:00' # time limit: 4 days (SLURM days-hours:minutes:seconds)
cpus = 16 # used for --cpus-per-task and for iqtree -nt
mem_cpu = 9000 # value for --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

### for iqtree
model = 'GTR+ASC' # passed to iqtree -m
bb = 1000 # passed to iqtree -bb (bootstrap replicates)

In [99]:
def write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy,model,bb):
    """Write the SLURM batch script ``run_noHybrid8_iqtree.sh`` to the current directory.

    The script requests a single task, changes into ``iqtree_dir`` and runs
    ``iqtree`` on the alignment ``phy``. Job name and output file name are fixed.

    account, partition, time -- values for --account, --partition and --time
    cpus       -- cores per task (converted with int); also given to iqtree -nt
    mem_cpu    -- value for --mem-per-cpu (converted with int)
    email      -- address for --mail-user (mail type is END)
    iqtree_dir -- directory the job changes into before running iqtree
    phy        -- alignment file given to iqtree -s
    model      -- model string given to iqtree -m
    bb         -- integer given to iqtree -bb
    """
    script_lines = [
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s",
        "#SBATCH --partition=%s",
        "#SBATCH --time=%s",
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d",
        "#SBATCH --mem-per-cpu=%d",
        "#SBATCH --job-name iqtree",
        "#SBATCH --output output_iqtree_noHybrid8.txt",
        "#SBATCH --mail-type=END",
        "#SBATCH --mail-user=%s",
        "",
        "cd %s",
        "    ",
        "iqtree -s %s -nt %d -m %s -st DNA -bb %d ",
        "",
        "",
    ]
    template = "\n".join(script_lines)
    with open("run_noHybrid8_iqtree.sh", "w") as o:
        # cpus appears twice: once for SLURM, once for iqtree's thread count
        n_threads = int(cpus)
        values = (account, partition, time, n_threads, int(mem_cpu), email,
                  iqtree_dir, phy, int(cpus), model, bb)
        o.write(template % values)
        
        

In [100]:
write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy_file,model,bb)

# Run IQtree (no hybrids / no weird var / rarefied)

### for each pop_noHybrid, select 3 individuals and run iqtree

#### Remove: arenaria, bigelovii, graveolens, iridis, turbinata (keep nitida to see)

In [53]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [54]:
%%R
library(tidyverse)

setwd('/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf')

In [64]:
%%R 

Pop_ID_noHybrid <- read.csv('Pop_ID_noHybrid.csv')
# Number of individuals per Variety and per Ssp
print(table(Pop_ID_noHybrid$Variety))
print(table(Pop_ID_noHybrid$Ssp))

# Collapse to the distinct Variety / Ssp values within each population, so
# the two tables below count populations rather than individuals (one row
# per population as long as each population has a single Variety and Ssp).
t <- Pop_ID_noHybrid %>%
    group_by(Pop) %>%
    summarize(Variety=unique(Variety),Ssp=unique(Ssp))

# Number of populations per Variety and per Ssp
print(table(t$Variety))
print(table(t$Ssp))


   arenaria   bigelovii E.discoidea  graveolens   hololueca      iridis 
          5           9           3          11         219           6 
     nitida   oreophila salicifolia    speciosa   turbinata 
          6          64          26          25          17 

 consimilis E.discoidea    nauseosa 
         75           3         313 

   arenaria   bigelovii E.discoidea  graveolens   hololueca      iridis 
          1           2           1           2          17           1 
     nitida   oreophila salicifolia    speciosa   turbinata 
          1          11           5           4           2 

 consimilis E.discoidea    nauseosa 
         13           1          33 


In [5]:
%%R 

# Thin the no-hybrid table to at most 3 individuals per population.
# Individuals are drawn at random and no seed is set, so every run of this
# cell picks a different subset.
# NOTE(review): the heading of this section lists varieties to remove
# (arenaria, bigelovii, graveolens, iridis, turbinata), but nothing here
# filters on Variety -- confirm whether that step is still to be added.
Pop_ID_noHybrid <- read.csv('Pop_ID_noHybrid.csv')
#table(Pop_ID_noHybrid$Pop)
Pop_ID_noHybrid$Pop <- as.character(Pop_ID_noHybrid$Pop)

pop_noHybrid <- unique(Pop_ID_noHybrid$Pop)
n_keep <- 3

# Build one data frame per population and bind them once at the end,
# instead of growing the result with rbind() inside a loop from a dummy
# first row that has to be removed afterwards.
per_pop <- lapply(pop_noHybrid, function(p) {
  pop_rows <- Pop_ID_noHybrid[which(Pop_ID_noHybrid$Pop == p), ]
  if (nrow(pop_rows) >= n_keep) {
    pop_rows <- pop_rows[sample(seq_len(nrow(pop_rows)), n_keep), ]
  }
  # populations with fewer than n_keep individuals are kept whole
  pop_rows
})

# The zero-row template keeps the column layout even when there is nothing
# to bind.
Pop_ID_noHybrid_iqtree <- do.call(rbind, c(list(Pop_ID_noHybrid[0, ]), per_pop))

In [9]:
%%R 

print(dim(Pop_ID_noHybrid_iqtree))
print(length(unique(Pop_ID_noHybrid_iqtree$Pop)))

[1] 135  12
[1] 47


In [10]:
%%R 
# Save the subsample twice: the sample names alone (the file given to
# vcftools --keep) and the full metadata rows for the same individuals.
kept_names <- as.character(Pop_ID_noHybrid_iqtree$All)
keep_noHybrid_iqtree <- data.frame(INDV = kept_names)
write.table(keep_noHybrid_iqtree, file = 'keep_noHybrid_iqtree.txt',
            quote = FALSE, row.names = FALSE)
write.csv(Pop_ID_noHybrid_iqtree, file = 'Pop_ID_noHybrid_iqtree.csv',
          row.names = FALSE)

#print(length(unique(Pop_ID_noHybrid_iqtree$Pop)))
#table(Pop_ID_noHybrid_iqtree$Pop)
#print(nrow(Pop_ID_noHybrid_iqtree))
#table(Pop_ID_noHybrid$Pop)

In [27]:
!$vcftools --gzvcf $vcf_file \
--recode \
--recode-INFO-all \
--keep 'keep_noHybrid_iqtree.txt' \
--out 'ERNA_noHybrid_iqtree'


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/subset_vcf/good_snps.recode.vcf.gz
	--keep keep_noHybrid_iqtree.txt
	--recode-INFO-all
	--out ERNA_noHybrid_iqtree
	--recode

Using zlib version: 1.2.11
Keeping individuals in 'keep' list
After filtering, kept 135 out of 588 Individuals
Outputting VCF file...
After filtering, kept 23832 out of a possible 23832 Sites
Run Time = 11.00 seconds


In [28]:
vcf_noHybrid_iqtree = 'ERNA_noHybrid_iqtree.recode.vcf'
vcf_noHybrid_iqtree_gz = vcf_noHybrid_iqtree + ".gz"
!$bgzip -c {vcf_noHybrid_iqtree} > {vcf_noHybrid_iqtree_gz}
!$tabix {vcf_noHybrid_iqtree_gz}

In [29]:
!rm 'ERNA_noHybrid_iqtree.log'
!rm 'ERNA_noHybrid_iqtree.recode.vcf'
!rm 'ERNA_noHybrid_iqtree.recode.vcf.gz.tbi'

## Run iqtree

In [30]:
cd $root

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo


In [31]:
!mkdir iqtree

mkdir: cannot create directory ‘iqtree’: File exists


In [32]:
iqtree_dir = os.path.join(root,'iqtree')

In [33]:
!cp 'subset_vcf/ERNA_noHybrid_iqtree.recode.vcf.gz' $iqtree_dir

In [34]:
cd $iqtree_dir

/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree


In [35]:
!gunzip 'ERNA_noHybrid_iqtree.recode.vcf.gz'

In [36]:
vcf2phy = '/data/gpfs/assoc/parchmanlab/tfaske/src/phylo/./vcf2phylip.py'

In [37]:
vcf_iqtree = os.path.join(iqtree_dir,'ERNA_noHybrid_iqtree.recode.vcf')

In [38]:
!python $vcf2phy --i $vcf_iqtree


Converting file '/data/gpfs/assoc/denovo/tfaske/rabbit/full/phylo/iqtree/ERNA_noHybrid_iqtree.recode.vcf':

Number of samples in VCF: 135
Total of genotypes processed: 23832
Genotypes excluded because they exceeded the amount of missing data allowed: 0
Genotypes that passed missing data filter but were excluded for being MNPs: 0
SNPs that passed the filters: 23832

Sample 1 of 135, 'EN_AH_12', added to the nucleotide matrix(ces).
Sample 2 of 135, 'EN_AH_2', added to the nucleotide matrix(ces).
Sample 3 of 135, 'EN_AH_5', added to the nucleotide matrix(ces).
Sample 4 of 135, 'EN_AR_2', added to the nucleotide matrix(ces).
Sample 5 of 135, 'EN_AR_3', added to the nucleotide matrix(ces).
Sample 6 of 135, 'EN_AR_6', added to the nucleotide matrix(ces).
Sample 7 of 135, 'EN_AS_16', added to the nucleotide matrix(ces).
Sample 8 of 135, 'EN_AS_23', added to the nucleotide matrix(ces).
Sample 9 of 135, 'EN_AS_27', added to the nucleotide matrix(ces).
Sample 10 of 135, 'EN_BC_10', added to the

Sample 124 of 135, 'EN_UT_1', added to the nucleotide matrix(ces).
Sample 125 of 135, 'EN_UT_2', added to the nucleotide matrix(ces).
Sample 126 of 135, 'EN_UT_3', added to the nucleotide matrix(ces).
Sample 127 of 135, 'EN_VM_11', added to the nucleotide matrix(ces).
Sample 128 of 135, 'EN_VM_13', added to the nucleotide matrix(ces).
Sample 129 of 135, 'EN_VM_4', added to the nucleotide matrix(ces).
Sample 130 of 135, 'EN_WA_1', added to the nucleotide matrix(ces).
Sample 131 of 135, 'EN_WA_3', added to the nucleotide matrix(ces).
Sample 132 of 135, 'EN_WA_6', added to the nucleotide matrix(ces).
Sample 133 of 135, 'EN_YL_3', added to the nucleotide matrix(ces).
Sample 134 of 135, 'EN_YL_5', added to the nucleotide matrix(ces).
Sample 135 of 135, 'EN_YL_6', added to the nucleotide matrix(ces).

PHYLIP matrix saved to: ERNA_noHybrid_iqtree.recode.min4.phy

Done!



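#### NOTE: the first run stopped because the alignment contains invariant sites, which the +ASC model does not allow. Rerun with the `.varsites.phy` file (variable sites only) that IQ-TREE wrote during that run.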

In [43]:
#phy_file = os.path.join(iqtree_dir,'ERNA_noHybrid_iqtree.recode.min4.phy')
phy_file = os.path.join(iqtree_dir,'ERNA_noHybrid_iqtree.recode.min4.phy.varsites.phy')

In [44]:
### select options for slurm submission
account = 'cpu-s5-denovo-0'
partition = 'cpu-core-0'
jobname = 'iqtree' # not passed to write_iqtree_sh, which hard-codes "--job-name iqtree"
time = '4-00:00:00' # time limit: 4 days (SLURM days-hours:minutes:seconds)
cpus = 16 # used for --cpus-per-task and for iqtree -nt
mem_cpu = 9000 # value for --mem-per-cpu
email = 'tfaske@nevada.unr.edu'

### for iqtree
model = 'GTR+ASC' # passed to iqtree -m
bb = 1000 # passed to iqtree -bb (bootstrap replicates)

In [45]:
def write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy,model,bb):
    """Write the SLURM batch script ``run_noHybrid_iqtree.sh`` to the current directory.

    The script requests a single task, changes into ``iqtree_dir`` and runs
    ``iqtree`` on the alignment ``phy``. Job name and output file name are fixed.

    account, partition, time -- values for --account, --partition and --time
    cpus       -- cores per task (converted with int); also given to iqtree -nt
    mem_cpu    -- value for --mem-per-cpu (converted with int)
    email      -- address for --mail-user (mail type is END)
    iqtree_dir -- directory the job changes into before running iqtree
    phy        -- alignment file given to iqtree -s
    model      -- model string given to iqtree -m
    bb         -- integer given to iqtree -bb
    """
    script_lines = [
        "#!/usr/bin/env bash",
        "#SBATCH --account=%s",
        "#SBATCH --partition=%s",
        "#SBATCH --time=%s",
        "#SBATCH --ntasks 1",
        "#SBATCH --cpus-per-task %d",
        "#SBATCH --mem-per-cpu=%d",
        "#SBATCH --job-name iqtree",
        "#SBATCH --output output_iqtree_noHybrid.txt",
        "#SBATCH --mail-type=END",
        "#SBATCH --mail-user=%s",
        "",
        "cd %s",
        "    ",
        "iqtree -s %s -nt %d -m %s -st DNA -bb %d ",
        "",
        "",
    ]
    template = "\n".join(script_lines)
    with open("run_noHybrid_iqtree.sh", "w") as o:
        # cpus appears twice: once for SLURM, once for iqtree's thread count
        n_threads = int(cpus)
        values = (account, partition, time, n_threads, int(mem_cpu), email,
                  iqtree_dir, phy, int(cpus), model, bb)
        o.write(template % values)
        
        

In [46]:
write_iqtree_sh(account,partition,time,cpus,mem_cpu,email,iqtree_dir,phy_file,model,bb)

# Run iqtree.sh
    cd /data/gpfs/home/tfaske/d/rabbit/full/phylo/iqtree
    source activate py311
    sbatch run_noHybrid_iqtree.sh