# Host Tree Subsetting

This notebook generates random subsets of the OTU table samples in which each host species is represented by exactly one sample.

### Imports

In [58]:
from os import makedirs
from os.path import abspath, join

import numpy as np
import pandas as pd
import skbio as skb
from biom import Table
from qiime2 import Artifact
from skbio import TreeNode

## Load data

### Host tree

In [6]:
# Locate the Newick file holding the host tree.
tree_dir = abspath('../trees')
host_tree_fp = join(tree_dir, 'total_timetree_names.all.nwk.tre')

# Parse the tree; underscores in node names are kept verbatim rather than
# being converted to spaces.
host_tree = TreeNode.read(host_tree_fp,
                          format='newick',
                          convert_underscores=False)

# Names of all tips (leaf nodes) of the host tree.
host_tips = [tip.name for tip in host_tree.tips()]

### Metadata

In [13]:
# Directory holding the host metadata tables.
md_dir = '/projects/templeton/01-metadata'

# Join the directory with the bare file name. Passing an absolute path as the
# second argument makes os.path.join discard `md_dir` entirely, which left the
# directory hard-coded in two places; the resolved path is unchanged.
host_md_fp = join(md_dir, 'eco_md-qiime_host_species_eco_metadata_by_SampleID_gut_11.28.18.txt')

# Tab-separated metadata table, one row per sample.
# NOTE(review): the saved output under this cell looks like the tail of a
# pandas DtypeWarning (mixed-type columns) -- if so, consider passing an
# explicit `dtype=` to read_csv; confirm before changing.
host_md = pd.read_csv(host_md_fp, sep='\t')

# Keep only the rows whose `TimeTree_returned` value is a tip name in the
# host tree.
host_md = host_md.loc[host_md['TimeTree_returned'].isin(host_tips)]

  interactivity=interactivity, compiler=compiler, result=result)


### OTU table

In [12]:
# QIIME 2 artifact holding the filtered, merged OTU table.
table_fp = '../tables/filtered/merged-table.in-map.nomito-nochloro.10k.qza'

# Load the artifact, then view its contents as a biom Table.
table_art = Artifact.load(
    table_fp
)
table = table_art.view(
    Table
)

### Output folder


In [59]:
# Folder that receives the host-subset outputs.
out_dir = '../tables/host_subsets'

# Create the folder (and any missing parents); an existing folder is not an
# error.
makedirs(
    out_dir,
    exist_ok=True,
)

## Draw random subsets

In [61]:
# Number of independent random draws. Each draw picks one SampleID per
# `TimeTree_returned` group.
samples = 100

# Fixed seed so that a fresh "Restart & Run All" regenerates exactly the same
# draws. The value itself is arbitrary.
seed = 42
rng = np.random.RandomState(seed)

# Collect each draw as a Series and concatenate once at the end, instead of
# growing a DataFrame with pd.concat on every iteration (quadratic copying).
draws = []

for i in range(samples):

    # Pick one SampleID uniformly at random within each group. The shared
    # RandomState advances on every call, so successive groups and draws do
    # not reuse the same random numbers. Selecting the column before `apply`
    # avoids relying on how DataFrameGroupBy.apply treats the grouping column.
    per = (host_md
           .groupby('TimeTree_returned')['SampleID']
           .apply(lambda ids: ids.sample(1, random_state=rng).iloc[0])
           .rename(i))

    draws.append(per)

# Rows: `TimeTree_returned` values; columns: draw number (0 .. samples - 1);
# values: the SampleID chosen in that draw. With zero draws, fall back to an
# empty frame because pd.concat rejects an empty list.
per_list = pd.concat(draws, axis=1) if draws else pd.DataFrame()

In [62]:
# Write the table of per-draw SampleIDs as a tab-separated file in the
# output folder.
per_list_fp = join(out_dir, 'host_sp.1per.txt')
per_list.to_csv(
    per_list_fp,
    sep='\t',
)