In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import biom
import qiime2

from scipy import stats
import warnings
import skbio
from qiime2.plugins.feature_classifier.methods import classify_sklearn
from itertools import cycle, islice

In [2]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Apply seaborn's "darkgrid" style to subsequent plots.
sns.set(style="darkgrid")

In [3]:
# Switch the working directory; every relative path used in the cells below
# is resolved against this folder.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
%cd /storage/zkarwowska/zkarwowska/microbiome-interactions/datasets/processed/

/storage/zkarwowska/zkarwowska/microbiome-interactions/datasets/processed


In [4]:
# Read the four feature tables from CSV, using the first CSV column as the index.
q2202_s1_feature_table = pd.read_csv(
    'qiita_2202/data_processing/interpolation_rarefaction/donorA_rarefied_interpolated_feces.csv',
    index_col=[0],
)
q2202_s2_feature_table = pd.read_csv(
    'qiita_2202/data_processing/interpolation_rarefaction/donorB_rarefied_interpolated_feces.csv',
    index_col=[0],
)

# The two qiita_550 tables are additionally sorted by their index after loading.
q550_m_feature_table = pd.read_csv(
    'qiita_550//data_processing/interpolation_rarefaction/male_rarefied_interpolated_feces.csv',
    index_col=[0],
).sort_index()
q550_f_feature_table = pd.read_csv(
    'qiita_550//data_processing/interpolation_rarefaction/female_rarefied_interpolated_feces.csv',
    index_col=[0],
).sort_index()

In [5]:
# Load the filtered-sequence QIIME 2 artifacts (.qza), one per feature table
# loaded above; these are the sequences later passed to assign_taxonomy.
q2202_s1_filtered_sequences = qiime2.Artifact.load('alpha_diversity_analysis_4_datasets/q2202_s1_filtered_sequences.qza')
q2202_s2_filtered_sequences = qiime2.Artifact.load('alpha_diversity_analysis_4_datasets/q2202_s2_filtered_sequences.qza')
q550_m_filtered_sequences= qiime2.Artifact.load('alpha_diversity_analysis_4_datasets/q550_m_filtered_sequences.qza')
q550_f_filtered_sequences = qiime2.Artifact.load('alpha_diversity_analysis_4_datasets/q550_f_filtered_sequences.qza')

In [None]:
# Load the classifier artifact from the working directory.
# assign_taxonomy (defined below) looks up this notebook-level `classifier`
# name when it runs, so this cell must be executed before any call to it.
# NOTE(review): the saved notebook shows no execution count for this cell —
# confirm it is run on a fresh kernel (Restart & Run All).
classifier = qiime2.Artifact.load('gg-13-8-99-515-806-nb-classifier.qza')

In [8]:
def assign_taxonomy(sequences_artifact, df, taxonomy_classifier=None):
    """Classify sequences and attach taxonomic rank columns to a feature table.

    The feature table is transposed so that each row is one feature (one
    column of ``df``). Each feature ID is replaced by the taxon string the
    classifier assigned to it, and that string is split on ``';'`` into the
    seven rank columns ``k, p, c, o, f, g, s``.

    Parameters
    ----------
    sequences_artifact : qiime2.Artifact
        Sequences handed to ``classify_sklearn``.
    df : pandas.DataFrame
        Feature table whose column labels are feature IDs.
    taxonomy_classifier : qiime2.Artifact, optional
        Classifier passed to ``classify_sklearn``. When omitted, the
        notebook-level ``classifier`` variable is used (original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per feature with a fresh integer index. The columns are the
        row labels of ``df`` followed by ``k, p, c, o, f, g, s``.

    Notes
    -----
    * Feature IDs with no classification keep their ID as the ``k`` value.
    * Taxon strings with fewer than seven parts leave the remaining rank
      columns empty; any parts beyond the seventh stay inside ``s``. The
      previous implementation raised ``ValueError`` whenever the deepest
      taxon string did not have exactly seven parts.
    * The split is on ``';'`` only, so whitespace that follows a separator
      in the taxon string is preserved in the rank values.
    """
    rank_columns = ['k', 'p', 'c', 'o', 'f', 'g', 's']

    def prepare_taxonomy_dict(sequences_artifact):
        """Run the classifier and map each 'Feature ID' to its 'Taxon' string."""
        # Fall back to the notebook-level `classifier` when none is passed in.
        active_classifier = classifier if taxonomy_classifier is None else taxonomy_classifier
        classification = classify_sklearn(sequences_artifact, active_classifier)
        classification_df = classification.classification.view(pd.DataFrame).reset_index()
        return dict(zip(classification_df['Feature ID'], classification_df['Taxon']))

    taxonomy_dictionary = prepare_taxonomy_dict(sequences_artifact)

    # One row per feature; the feature-ID index is dropped because the IDs are
    # read straight from df.columns below (no reliance on reset_index naming
    # the column 'index', which breaks when the column axis has a name).
    feature_rows = df.T.reset_index(drop=True)

    # Direct dict lookup: unmapped IDs are kept unchanged, like DataFrame.replace.
    taxon_labels = pd.Series(
        [taxonomy_dictionary.get(feature_id, feature_id) for feature_id in df.columns],
        dtype=object,
    )

    # Cap the split at seven parts and pad to exactly seven columns so the
    # result always lines up with rank_columns.
    rank_frame = (
        taxon_labels.str.split(';', n=len(rank_columns) - 1, expand=True)
        .reindex(columns=range(len(rank_columns)))
    )
    rank_frame.columns = rank_columns

    return pd.concat([feature_rows, rank_frame], axis=1)

In [9]:
# Build one taxonomy-annotated table per dataset by pairing each
# filtered-sequence artifact with its feature table.
# Each call invokes classify_sklearn, so this cell may take a while to run.
q2202_s1_taxonomy = assign_taxonomy(q2202_s1_filtered_sequences, q2202_s1_feature_table)
q2202_s2_taxonomy = assign_taxonomy(q2202_s2_filtered_sequences, q2202_s2_feature_table)

q550_m_taxonomy = assign_taxonomy(q550_m_filtered_sequences, q550_m_feature_table)
q550_f_taxonomy = assign_taxonomy(q550_f_filtered_sequences, q550_f_feature_table)

In [10]:
# Write each taxonomy-annotated table to CSV under taxonomy/ (relative to the
# working directory).
# NOTE(review): the taxonomy/ directory must already exist; to_csv does not
# create missing directories.
q2202_s1_taxonomy.to_csv('taxonomy/q2202_s1_taxonomy.csv')
q2202_s2_taxonomy.to_csv('taxonomy/q2202_s2_taxonomy.csv')
q550_m_taxonomy.to_csv('taxonomy/q550_m_taxonomy.csv')
q550_f_taxonomy.to_csv('taxonomy/q550_f_taxonomy.csv')