# Calculating Beta Diversity Through Time

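This notebook organizes and exports the input files needed to calculate Beta Diversity Through Time (BDTT), following [Groussin, Mazel, *et al.* 2017](https://www.nature.com/articles/ncomms14319): host diet and patristic distance matrices, the filtered host tree, the filtered OTU tables with their matching OTU tree, and a table mapping samples to host species.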

In [1]:
import numpy as np
import pandas as pd
import skbio as sk
from qiime2 import Artifact
from biom import load_table, Table
from skbio import TreeNode
from plotnine import *
from os.path import join, abspath
from os import makedirs
from biom.util import biom_open

from scipy.spatial.distance import squareform, pdist

## Calculate host diet distances

Load the host tree.

In [2]:
# Directory containing the tree files, and the file name of the host time tree.
tree_dir = abspath('../trees/')
host_tree_fp = 'total_timetree_names.all.nwk.tre'

# Read the host tree. convert_underscores=False is passed so that tip labels
# keep their underscores (the TimeTree_returned names in the metadata, e.g.
# Accipiter_cooperii, are underscore-separated).
host_tree_path = join(tree_dir, host_tree_fp)
host_tree = sk.TreeNode.read(host_tree_path, convert_underscores=False)

In [3]:
# Names of all tips in the host tree; used later to restrict the diet table.
host_tips = [tip.name for tip in host_tree.tips()]

Load the sample metadata (mapping file).

In [4]:
# NOTE(review): unlike the other inputs, this is a hard-coded absolute path
# (abspath() leaves it unchanged) — confirm it resolves on the machine running this.
md_dir = abspath('/projects/templeton/01-metadata')
host_md_fp = '5per_10k.11.28.18.short.txt'

# The metadata file is tab-delimited.
host_md_path = join(md_dir, host_md_fp)
host_md = pd.read_csv(host_md_path, sep='\t')

In [5]:
# Dimensions of the metadata table as (rows, columns).
host_md.shape

(2259, 63)

Calculate diet distances

In [6]:
# The ten ET.Diet.* metadata columns used to build the host diet profile.
# Rows are later kept only when these columns sum to 100.
elton_cols = [
    'ET.Diet.Fruit',
    'ET.Diet.Inv',
    'ET.Diet.Nect',
    'ET.Diet.PlantO',
    'ET.Diet.Scav',
    'ET.Diet.Seed',
    'ET.Diet.Vect',
    'ET.Diet.Vend',
    'ET.Diet.Vfish',
    'ET.Diet.Vunk',
]

In [7]:
# The interactive `?` help lookup for DataFrame.groupby that used to live here
# was debugging residue: it only opened the help pager and had no effect on
# the analysis, so it is not executed on Restart & Run All.

In [8]:
# Keep only samples whose diet columns add up to exactly 100.
diet_is_complete = host_md[elton_cols].sum(axis=1) == 100
diet_columns = ['TimeTree_returned'] + elton_cols

# Collapse to one diet profile per host: drop rows with any missing value,
# then take the first entry per column within each TimeTree_returned group.
host_diet_df = (
    host_md.loc[diet_is_complete, diet_columns]
    .dropna()
    .groupby('TimeTree_returned')
    .first()
)

In [9]:
# Preview the first rows of the per-host diet table.
host_diet_df.head()

Unnamed: 0_level_0,ET.Diet.Fruit,ET.Diet.Inv,ET.Diet.Nect,ET.Diet.PlantO,ET.Diet.Scav,ET.Diet.Seed,ET.Diet.Vect,ET.Diet.Vend,ET.Diet.Vfish,ET.Diet.Vunk
TimeTree_returned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accipiter_cooperii,0.0,0.0,0.0,0.0,0.0,0.0,10.0,90.0,0.0,0.0
Accipiter_striatus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
Accipiter_tachiro,0.0,10.0,0.0,0.0,0.0,0.0,20.0,70.0,0.0,0.0
Accipiter_trivirgatus,0.0,20.0,0.0,0.0,0.0,0.0,20.0,60.0,0.0,0.0
Aceros_corrugatus,90.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Filter down to hosts in our time tree

In [10]:
# Boolean mask over the diet table: True where the host name is a tip in the host tree.
in_host_tree = host_diet_df.index.isin(host_tips)
host_diet_df = host_diet_df.loc[in_host_tree]

In [11]:
# Pairwise Bray-Curtis distances between host diet profiles. pdist returns the
# condensed form, which squareform expands to the full square matrix.
diet_condensed = pdist(host_diet_df.values, metric='braycurtis')

# Label the matrix with the host names taken from the diet table index.
diet_dm = sk.DistanceMatrix(squareform(diet_condensed), ids=host_diet_df.index)

In [12]:
# Size of the diet distance matrix (one row and column per retained host).
diet_dm.shape

(788, 788)

## Calculate host patristic distances

Filter the host tree down to the hosts we have diet data for.

In [13]:
# List the columns available in the metadata table.
host_md.columns

Index(['SampleID', 'deblurred_seqs', 'pd_5k', 'shannon_5k', 'sample_type',
       'preservative', 'sex', 'healthy', 'captive_wild', 'LIFE_STAGE',
       'dataset', 'collector', 'country', 'Common_name', 'Species_name',
       'date_collected', 'individual_ID', 'animal_ID', 'group_individual_ID',
       'group_ID', 'Corrected_Species_name', 'flight_status', 'flight_type',
       'query_name', 'ET.BodyMass.Value', 'ET.Diet.5Cat', 'ET.Diet.Fruit',
       'ET.Diet.Inv', 'ET.Diet.Nect', 'ET.Diet.PlantO', 'ET.Diet.Scav',
       'ET.Diet.Seed', 'ET.Diet.Vect', 'ET.Diet.Vend', 'ET.Diet.Vfish',
       'ET.Diet.Vunk', 'Taxonomy_Class', 'Taxonomy_Family', 'Taxonomy_Genus',
       'Taxonomy_Kingdom', 'Taxonomy_Order', 'Taxonomy_Phylum',
       'Taxonomy_Species', 'Taxonomy_Subspecies', 'TimeTree_name',
       'eltontraits_name', 'TimeTree_returned', 'LAV.ln_mass.estimate',
       'LAV.ln_silen.estimate', 'LAV.ln_lilen.estimate',
       'LAV.ln_stomas.estimate', 'LAV.ln_caelen.estimate',
       'LA

In [14]:
# Prune the host tree to the tips named in diet_dm.ids, i.e. the hosts with diet data.
hosts_diet_tree = host_tree.shear(diet_dm.ids)

In [15]:
# Tip-to-tip distances over the sheared host tree, used as the host patristic distance matrix.
# NOTE(review): the IDs presumably follow the tree's tip order, which may differ from the
# row order of diet_dm — confirm that downstream code aligns the two matrices by ID.
patristic_dm = hosts_diet_tree.tip_tip_distances()

In [16]:
# Expected to match diet_dm.shape, since the tree was sheared to diet_dm.ids.
patristic_dm.shape

(788, 788)

## Filter OTU tables down to just these hosts

In [17]:
# Paths to the filtered OTU table artifacts (.qza), 5k and 10k versions.
otu_filt_5k_fp = abspath('../tables/filtered/merged-table.in-map.nomito-nochloro.5k.qza')
otu_filt_10k_fp = abspath('../tables/filtered/merged-table.in-map.nomito-nochloro.10k.qza')

In [18]:
# Load each artifact and view it as a biom Table (5k first, then 10k).
otu_filt_5k, otu_filt_10k = (
    Artifact.load(table_fp).view(Table)
    for table_fp in (otu_filt_5k_fp, otu_filt_10k_fp)
)

In [19]:
# SampleIDs of metadata rows whose host species appears in the diet distance matrix.
bdtt_samples_5k = host_md.loc[host_md['TimeTree_returned'].isin(diet_dm.ids), 'SampleID']

# Filter a copy of the 5k table to those samples (the default axis), then drop
# observations left with no counts.
bdtt_filt_5k = otu_filt_5k.filter(bdtt_samples_5k, inplace=False)
bdtt_filt_5k = bdtt_filt_5k.remove_empty(axis='observation')
bdtt_filt_5k

119769 x 1979 <class 'biom.table.Table'> with 397460 nonzero entries (0% dense)

In [20]:
# SampleIDs of metadata rows whose host species appears in the diet distance matrix.
bdtt_samples_10k = host_md.loc[host_md['TimeTree_returned'].isin(diet_dm.ids), 'SampleID']

# Filter a copy of the 10k table to those samples (the default axis), then drop
# observations left with no counts.
bdtt_filt_10k = otu_filt_10k.filter(bdtt_samples_10k, inplace=False)
bdtt_filt_10k = bdtt_filt_10k.remove_empty(axis='observation')
bdtt_filt_10k

148522 x 1979 <class 'biom.table.Table'> with 496939 nonzero entries (0% dense)

## Save files

In [22]:
# Output directory for every file exported for the BDTT analysis.
bdtt_dir = abspath('../bdtt')

# Create the directory; if it already exists (e.g. on a re-run), just report that and carry on.
try:
    makedirs(bdtt_dir)
except FileExistsError:
    print('Dir exists!')

Dir exists!


### Save filtered tree

In [23]:
# Destination for the host tree sheared to hosts with diet data.
bdtt_host_tree_fp = join(bdtt_dir, 'total_timetree_names.bdtt-filt.nwk.tre')

# Write the tree; as the cell's last expression, the call's return value is displayed.
hosts_diet_tree.write(bdtt_host_tree_fp)

'/home/jgsanders/projects/templeton/201811/bdtt/total_timetree_names.bdtt-filt.nwk.tre'

### Save diet dm

In [31]:
# Write the host diet (Bray-Curtis) distance matrix to the BDTT output directory.
diet_dm_fp = join(bdtt_dir, 'diet_distances.bdtt.txt')
diet_dm.write(diet_dm_fp)

'/home/jgsanders/projects/templeton/201811/bdtt/diet_distances.bdtt.txt'

### Save patristic dm

In [32]:
# Write the host patristic (tip-to-tip) distance matrix to the BDTT output directory.
patristic_dm_fp = join(bdtt_dir, 'patristic_distances.bdtt.txt')
patristic_dm.write(patristic_dm_fp)

'/home/jgsanders/projects/templeton/201811/bdtt/patristic_distances.bdtt.txt'

### Save filtered OTU tables

In [27]:
# Destination for the BDTT-filtered 5k table, stored as an HDF5 BIOM file.
bdtt_filt_5k_fp = join(bdtt_dir, 'merged-table.in-map.nomito-nochloro.5k.bdtt.hdf5.biom')

# biom_open provides the writable handle; the string is recorded as the
# table's "generated by" description.
with biom_open(bdtt_filt_5k_fp, 'w') as biom_out:
    bdtt_filt_5k.to_hdf5(biom_out, generated_by="5k rarefied biom filtered for bdtt")

In [29]:
# Destination for the BDTT-filtered 10k table, stored as an HDF5 BIOM file.
bdtt_filt_10k_fp = join(bdtt_dir, 'merged-table.in-map.nomito-nochloro.10k.bdtt.hdf5.biom')

# biom_open provides the writable handle; the string is recorded as the
# table's "generated by" description.
with biom_open(bdtt_filt_10k_fp, 'w') as biom_out:
    bdtt_filt_10k.to_hdf5(biom_out, generated_by="10k rarefied biom filtered for bdtt")

### Filter and save OTU tree

In [38]:
# Every OTU (observation ID) present in either the 5k or the 10k filtered table.
otus_5k = set(bdtt_filt_5k.ids(axis='observation'))
otus_10k = set(bdtt_filt_10k.ids(axis='observation'))
otus = otus_5k | otus_10k

In [39]:
# Number of distinct OTUs in the 10k table alone, for comparison with the size of the union.
len(set(bdtt_filt_10k.ids(axis='observation')))

148522

In [40]:
# Number of distinct OTUs across the 5k and 10k tables combined.
len(otus)

160788

In [41]:
# Path to the OTU insertion tree artifact (presumably the SEPP insertion tree, going by the variable name).
sepp_tree_fp = abspath('../trees/insertion-tree.qza')

# Load the artifact and view it as a skbio TreeNode.
sepp_tree = Artifact.load(sepp_tree_fp).view(TreeNode)

In [42]:
# Prune the insertion tree to the OTUs present in either filtered table.
sepp_tree_bdtt = sepp_tree.shear(otus)

In [43]:
# Write the pruned OTU tree to the BDTT output directory.
sepp_tree_bdtt_fp = join(bdtt_dir, 'otu.insertion-tree.bdtt.nwk.tre')
sepp_tree_bdtt.write(sepp_tree_bdtt_fp)

'/home/jgsanders/projects/templeton/201811/bdtt/otu.insertion-tree.bdtt.nwk.tre'

### Export table mapping samples to species

In [50]:
# Despite the name, `hosts` holds sample IDs: every sample present in either filtered table.
hosts = set(bdtt_filt_10k.ids(axis='sample')) | set(bdtt_filt_5k.ids(axis='sample'))

# Two-column lookup from each retained SampleID to its host species name.
sample_is_retained = host_md['SampleID'].isin(hosts)
host_table = host_md.loc[sample_is_retained, ['SampleID', 'TimeTree_returned']]

In [51]:
# Destination for the sample-to-host lookup table.
host_table_fp = join(bdtt_dir, 'SampleID_to_host.txt')

# Written without the index, using to_csv's default separator (a comma) despite the .txt extension.
# NOTE(review): confirm the downstream BDTT code expects comma-separated input here.
host_table.to_csv(host_table_fp, index=False)

### Tar files for sending

In [52]:
!tar -czvf ../bdtt_files.tar.gz {bdtt_dir}

tar: Removing leading `/' from member names
/home/jgsanders/projects/templeton/201811/bdtt/
/home/jgsanders/projects/templeton/201811/bdtt/merged-table.in-map.nomito-nochloro.5k.bdtt.hdf5.biom
/home/jgsanders/projects/templeton/201811/bdtt/total_timetree_names.bdtt-filt.nwk.tre
/home/jgsanders/projects/templeton/201811/bdtt/SampleID_to_host.txt
/home/jgsanders/projects/templeton/201811/bdtt/patristic_distances.bdtt.txt
/home/jgsanders/projects/templeton/201811/bdtt/diet_distances.bdtt.txt
/home/jgsanders/projects/templeton/201811/bdtt/otu.insertion-tree.bdtt.nwk.tre
/home/jgsanders/projects/templeton/201811/bdtt/merged-table.in-map.nomito-nochloro.10k.bdtt.hdf5.biom


In [57]:
# Show the absolute location of the archive.
abspath('../bdtt_files.tar.gz')

'/home/jgsanders/projects/templeton/201811/bdtt_files.tar.gz'

In [53]:
# List the parent directory, e.g. to confirm the archive was written.
!ls -l ../

total 5012
drwxr-xr-x 3 jgsanders knightlab        7 Dec  5 09:03 adiv
drwxr-xr-x 4 jgsanders knightlab       10 Dec  7 07:32 bdiv
drwxr-xr-x 2 jgsanders knightlab        9 Dec 11 15:15 bdtt
-rw-r--r-- 1 jgsanders knightlab 13532329 Dec 11 15:15 bdtt_files.tar.gz
drwxr-xr-x 4 jgsanders knightlab       14 Dec 11 15:14 notebooks
drwxr-xr-x 4 jgsanders knightlab        5 Dec  7 09:32 phylosymbiosis
drwxr-xr-x 2 jgsanders knightlab        7 Dec  5 09:04 picrust
drwxr-xr-x 4 jgsanders knightlab        4 Dec  5 09:04 tables
drwxr-xr-x 2 jgsanders knightlab        7 Dec  5 09:04 taxonomy
drwxr-xr-x 2 jgsanders knightlab        7 Dec  5 09:04 trees


In [54]:
# List the exported files in the bdtt output directory.
!ls -l ../bdtt/

total 29377
-rw-r--r-- 1 jgsanders knightlab  2513523 Dec 11 15:06 diet_distances.bdtt.txt
-rw-r--r-- 1 jgsanders knightlab 21278796 Dec 11 15:05 merged-table.in-map.nomito-nochloro.10k.bdtt.hdf5.biom
-rw-r--r-- 1 jgsanders knightlab 17113813 Dec 11 15:05 merged-table.in-map.nomito-nochloro.5k.bdtt.hdf5.biom
-rw-r--r-- 1 jgsanders knightlab 20000197 Dec 11 15:12 otu.insertion-tree.bdtt.nwk.tre
-rw-r--r-- 1 jgsanders knightlab  8735835 Dec 11 15:06 patristic_distances.bdtt.txt
-rw-r--r-- 1 jgsanders knightlab    76489 Dec 11 15:15 SampleID_to_host.txt
-rw-r--r-- 1 jgsanders knightlab    36256 Dec 11 15:04 total_timetree_names.bdtt-filt.nwk.tre
