#### This script:
     - creates a dictionary that translates each OTU to its taxonomy
     - compares the distribution of taxa at each taxonomic level between the male and the female subject
     - visualises the distribution of taxa at each taxonomic level over time, separately for the male and the female subject
     - saves the feature table collapsed to each taxonomic level

In [1]:
import pandas as pd
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn theme for all figures: "ticks" axes style with the "talk" plotting context.
sns.set(style="ticks", context="talk")
# Applied after sns.set, so the matplotlib "dark_background" style sheet is layered on top of it.
plt.style.use("dark_background")

In [2]:
%cd /klaster/scratch/zuzannakarwowska/SONATA_DATASETS/550_dataset/QIITA_analysis

/klaster/scratch/zuzannakarwowska/SONATA_DATASETS/550_dataset/QIITA_analysis


In [3]:
# Input files (relative paths, resolved against the current working directory).
taxonomy_file = 'taxonomy/taxonomy.tsv'
rarefied_file = 'data/feces_rarefied.tsv'
metadata_file = 'data/feces_metadata.tsv'

# Taxonomy table (tab-separated); later cells read its 'Feature ID' and
# 'Taxon' columns.
taxonomy_df = pd.read_csv(taxonomy_file, sep='\t')

# Rarefied feature table: the first line of the file is skipped so that the
# following line is used as the header, then '#OTU ID' is renamed to 'OTU'.
rarefied_df = pd.read_csv(rarefied_file, sep='\t', skiprows=[0]).rename(
    columns={'#OTU ID': 'OTU'}
)

# Sample metadata, restricted to feces samples that are not flagged as
# mislabeled.
metadata = pd.read_csv(metadata_file, sep='\t')
metadata = metadata[
    metadata['sample_type'].eq('feces') & metadata['mislabeled'].eq(False)
]

In [4]:
# Lookup table: feature (OTU) id -> taxonomy string, taken from the
# 'Feature ID' and 'Taxon' columns of the taxonomy table.
dictionary = taxonomy_df.set_index('Feature ID')['Taxon'].to_dict()

In [5]:
# Label every sample as '<sex>_<days_since_experiment_start>' and build the
# lookup sample_name -> label.
metadata['timepoint'] = (
    metadata['sex'] + '_' + metadata['days_since_experiment_start'].astype(str)
)

keys = metadata['sample_name'].tolist()
values = metadata['timepoint']

tp_dict = {sample_name: label for sample_name, label in zip(keys, values)}

In [6]:
# Relabel the sample columns of the feature table from sample id to the
# '<sex>_<day>' label stored in tp_dict, so later cells can pick a subject by
# column-name prefix. Columns without an entry in tp_dict (such as 'OTU')
# keep their name.
# A direct rename yields the same labels as transposing, replacing and
# transposing back, but without copying the table twice and without turning
# every count into an object-dtype value.
rarefied_df = rarefied_df.rename(columns=tp_dict)

In [7]:
# Attach the taxonomy string of every OTU to the feature table, then split it
# on whitespace into one column per rank code (k, p, c, o, f, g, s).
# Assigning the seven names fails if the split yields any other column count.
rarefied_df['taxonomy'] = rarefied_df['OTU'].map(dictionary)
taxonomy = rarefied_df['taxonomy'].str.split(expand=True)
taxonomy.columns = list('kpcofgs')

In [8]:
def translate_taxonomy(taxo_df, taxonomy_map=None):
    """Return the feature table with one cleaned column per taxonomic rank.

    The taxonomy string of every OTU is looked up, split on whitespace and
    stored in the columns 'k', 'p', 'c', 'o', 'f', 'g', 's'. Rank prefixes
    ('k__', 'p__', ...) and ';' separators are stripped from those columns.
    The remaining columns of ``taxo_df`` (everything except 'OTU' and a
    pre-existing 'taxonomy' column) are appended unchanged.

    Parameters
    ----------
    taxo_df : pandas.DataFrame
        Feature table with an 'OTU' column. It is not modified.
    taxonomy_map : mapping, optional
        OTU id -> taxonomy string. Defaults to the notebook-level
        ``dictionary``.

    Returns
    -------
    pandas.DataFrame
        Rank columns followed by the non-taxonomy columns of ``taxo_df``,
        on the same row index.

    Notes
    -----
    Only the first seven whitespace-separated tokens are kept; lineages
    with fewer tokens get missing values in the trailing rank columns.
    """
    if taxonomy_map is None:
        # Fall back to the lookup table built earlier in the notebook.
        taxonomy_map = dictionary

    ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']

    # Work on a derived Series so the caller's frame is left untouched.
    lineage = taxo_df['OTU'].map(taxonomy_map)
    taxonomy = lineage.str.split(expand=True)

    # Keep exactly one column per rank whatever the token count: surplus
    # tokens are dropped and missing ranks are padded. (Dropping a fixed list
    # of labels raises as soon as one of them is absent.)
    taxonomy = taxonomy.reindex(columns=list(range(len(ranks))))
    taxonomy.columns = ranks

    # Strip ';' separators and the rank prefixes in a single pass, and only
    # in the taxonomy columns rather than across every sample column.
    taxonomy = taxonomy.replace({r';|[kpcofgs]__': ''}, regex=True)

    counts = taxo_df.drop(columns=['OTU', 'taxonomy'], errors='ignore')
    return taxonomy.join(counts)

In [9]:
# Build the taxonomy-annotated feature table: rank columns k..s followed by the sample columns.
tax_df = translate_taxonomy(rarefied_df)

In [13]:
# Write the taxonomy-annotated feature table to disk without the row index.
# NOTE(review): to_csv is called without `sep`, so the output is comma-separated although the file is named .tsv — confirm.
# NOTE(review): the path has no '/' between 'taxonomy' and 'feces_rarefied_taxonomy.tsv' — confirm the file is not meant to go into the taxonomy/ directory.
tax_df.to_csv('/klaster/scratch/zuzannakarwowska/SONATA_DATASETS/550_dataset/QIITA_analysis/taxonomyfeces_rarefied_taxonomy.tsv', index = False)

# compare and visualize taxonomy

In [None]:
#shorten the subject prefixes to make plotting easier:
#'female_<day>' -> 'f_<day>' and 'male_<day>' -> 'm_<day>'
#('female' has to be replaced first because it contains 'male')
tax_df.columns = [i.replace('female', 'f') for i in tax_df.columns]
tax_df.columns = [i.replace('male', 'm') for i in tax_df.columns]
#the rank column 'f' gets its own name 'family'
tax_df = tax_df.rename(columns = {'f': 'family'})

#timesteps available for each subject
f_cols = [i.replace('f_', '') for i in tax_df.filter(like = 'f_').columns]
m_cols = [i.replace('m_', '') for i in tax_df.filter(like = 'm_').columns]

#timesteps present for both subjects: de-duplicated and kept in the order of
#the male columns, so the column order of the tables below is the same on
#every run (converting a set to a list gives no such guarantee)
f_timesteps = set(f_cols)
common_col = [i for i in dict.fromkeys(m_cols) if i in f_timesteps]

m_common = ['m_' + i for i in common_col]
f_common = ['f_' + i for i in common_col]

common_cols = m_common+f_common

#sample columns of the shared timesteps: male columns first, then female
filtered_df = tax_df[common_cols]

#rank columns followed by the shared-timestep sample columns
mrg_tax = tax_df[['k', 'p', 'c', 'o', 'family', 'g', 's']].join(filtered_df)

## Boxplots

In [None]:
def label_race(row):
    """Return the subject code found in ``row['index']``.

    'f' is returned when the label contains the letter 'f'; otherwise 'm'
    is returned when it contains the letter 'm'; otherwise ``None``.
    """
    sample_label = row['index']
    # 'f' is tested before 'm', so a label containing both letters yields 'f'.
    for code in ('f', 'm'):
        if code in sample_label:
            return code
    return None
    
def plot_taxonomy(df, var, title):
    """Show boxplots of the summed counts per taxon, split by subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the grouping column ``var`` and sample columns whose
        names start with 'f_' or 'm_'.
    var : str
        Name of the taxonomy column to group by.
    title : str
        Rank name inserted into the figure title.

    The counts of all rows sharing a value of ``var`` are summed per sample
    column; every (sample, taxon) sum becomes one observation of the
    boxplot, with the hue given by ``label_race``. The figure is shown and
    nothing is returned.
    """
    # Take the grouping column by exact name and the sample columns by
    # prefix. An unanchored pattern such as 'f_|m|<var>' also pulls in any
    # column that merely contains the letter 'm' (for instance 'family').
    wanted = [c == var or str(c).startswith(('f_', 'm_')) for c in df.columns]
    level_df = df.loc[:, wanted].groupby(var).agg('sum')

    # Long format: one row per (sample column, taxon) pair.
    a = level_df.T.stack().reset_index()
    a.columns = ['index', var, 'value']

    # Subject is derived once, after stacking, so no string column gets
    # mixed into the stacked values.
    a['subject'] = a.apply(label_race, axis=1)

    # Make sure the plotted values are numeric.
    a['value'] = pd.to_numeric(a['value'])

    plt.figure(figsize=(35, 15))

    sns.boxplot(y = 'value', x = var, hue = 'subject', data = a)
    plt.title('{} distribution among to subjects - common timesteps'.format(title), fontsize = 24)
    plt.xticks(rotation=90, fontsize = 20)

    plt.show()

In [None]:
#rank columns of mrg_tax and the rank names shown in the plot titles
taxa = ['k', 'p', 'c', 'o', 'family', 'g', 's']
#column 's' holds the s__ rank of the taxonomy strings, i.e. the species
#level, so it is titled 'Species'
names = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

#one boxplot figure per taxonomic rank
for t, n in zip(taxa, names):
    plot_taxonomy(mrg_tax, t, n)

## Lineplots

In [None]:
#lineplot of every taxon through time for female subject
def lineplot_f_taxonomy(df, var, title):
    """Show one line per taxon over time for the female subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the grouping column ``var`` and sample columns named
        'f_<timestep>'.
    var : str
        Taxonomy column to group by; it is also used as part of the regex
        that selects columns.
    title : str
        Rank name inserted into the figure title.

    Returns
    -------
    pandas.DataFrame
        Counts summed per value of ``var``, one column per sample, with the
        columns renamed to 'female_<timestep>'.

    Raises
    ------
    ValueError
        If a selected column name is not an integer once 'f_' is removed.
    """
    # Columns whose name matches 'f_' or the grouping column, summed per taxon.
    pattern = 'f_|{}'.format(var)
    per_taxon = df.filter(regex=pattern).groupby(var).agg('sum')

    # Column names become integer timesteps so they sort numerically.
    days = [int(label.replace('f_', '')) for label in per_taxon.columns]
    per_taxon.columns = days

    # Timesteps as the (sorted) index, taxa as columns: one line per taxon.
    over_time = per_taxon.T.sort_index(axis=0)

    plt.figure(figsize=(35, 20))
    sns.lineplot(data=over_time)
    plt.xticks(rotation=90, fontsize=20)
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=14)
    plt.title('{} lineplot on female data'.format(title), fontsize=20)
    plt.show()

    # The returned table carries the long subject prefix again.
    per_taxon.columns = ['female_' + str(day) for day in days]
    return per_taxon


In [None]:
#lineplot of every taxon through time for male subject
def lineplot_m_taxonomy(df, var, title):
    """Show one line per taxon over time for the male subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the grouping column ``var`` and sample columns named
        'm_<timestep>'.
    var : str
        Taxonomy column to group by; it is also used as part of the regex
        that selects columns.
    title : str
        Rank name inserted into the figure title.

    Returns
    -------
    pandas.DataFrame
        Counts summed per value of ``var``, one column per sample, with the
        columns renamed to 'male_<timestep>'.

    Raises
    ------
    ValueError
        If a selected column name is not an integer once 'm_' is removed.
    """
    m_tax = df.filter(regex = ('m_|{}'.format(var))) #choose taxonomy level
    phylum_df = m_tax.groupby(var).agg('sum') #sum all features within taxonomy level

    #remove m_ in colnames and use integer timesteps so they sort numerically
    phylum_df.columns = [int(i.replace('m_', '')) for i in phylum_df.columns]

    linedf = phylum_df.T.sort_index(axis = 0) #transpose so timesteps will be the index

    plt.figure(figsize=(35, 20))

    sns.lineplot(data = linedf)
    plt.xticks(rotation=90, fontsize = 20)
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=14)

    plt.title('{} lineplot on male data'.format(title), fontsize = 20)
    plt.show()

    #label the returned columns as male data; the female counterpart of this
    #function uses 'female_', and an 'f_' prefix would mark them as female
    phylum_df.columns = ['male_' + str(i) for i in phylum_df.columns]

    return phylum_df


In [None]:
#draw the male-subject lineplot for every taxonomic level in taxa/names;
#df is overwritten on each pass and ends up holding the table returned by the last call
for t, n in zip(taxa, names):
    df = lineplot_m_taxonomy(tax_df, t, n)
    

## save taxonomy

In [None]:
#taxonomy columns to collapse and export (the kingdom column 'k' is not listed)
#and, in the same order, the rank names paired with them
taxa = ['p', 'c', 'o', 'family', 'g', 's']
names = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Strain']

In [None]:
#save for each taxonomic level
for t, n in zip(taxa, names):
    
    name = 'f_' + n
    
    phylum_df = tax_df.filter(regex = 'f_|{}'.format(t))
    phylum_df = phylum_df.groupby(t).agg('sum')
    phylum_df = phylum_df[phylum_df.astype('bool').mean(axis=1)>=0.25]

    phylum_df.iloc[1:]
    
    phylum_df.to_csv('{}.csv'.format(name), sep = '\t')
