In [66]:
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import plotly.graph_objects as go
from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from collections import defaultdict
from plotly.offline import download_plotlyjs, plot,iplot
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 320)

SAMPLE_TIME_SORTED = ['initial','twoweek','onemonth','twomonth','fourmonth','sixmonth','ninemonth','oneyear','sick']
def get_heatmap_cmap():
    """ This color map is the seismic_r cmap only blue was replace with green using matplotlib.cm.get_cmap('seismic_r',lut=100)._segmentdata

    :return:
    """
    seismic_segmentdata = {
        'red': ((0.0, 0.5, 0.5),
                (0.25, 1.0, 1.0),
                (0.5, 1.0, 1.0),
                (0.75, 0.0, 0.0),
                (1.0, 0.0, 0.0)),

        'green': ((0.0, 0.0, 0.0),
                  (0.25, 0.0, 0.0),
                  (0.5, 1.0, 1.0),
                  (0.75, 1.0, 1.0),
                  (1.0, 0.3, 0.3)),

        'blue': ((0.0, 0.0, 0.0),
                 (0.25, 0.0, 0.0),
                 (0.5, 1.0, 1.0),
                 (0.75, 0.0, 0.0),
                 (1.0, 0.0, 0.0))
    }
    sismic_green = LinearSegmentedColormap('seismic_green', seismic_segmentdata)
    return sismic_green

In [67]:

def build_trainig_data(merge_df,meta_idx,test_size=0.25,split_by_control=False):
    if split_by_control:
        # Don't know where na symtpoms should go. Drop them
        na_symptoms = merge_df[pd.isna(merge_df.symptoms)]
        merge_df = merge_df.drop(na_symptoms.index)
        control_data = merge_df[merge_df.symptoms == "Control"]
        ap_data = merge_df.drop(control_data.index)  # Everyone that is not control group

        X_train = ap_data.iloc[:, :meta_idx].values
        X_test = control_data.iloc[:, :meta_idx].values

        meta_train = ap_data.iloc[:, meta_idx:]
        y_train = meta_train.loc[:, 'visit_age_mo'].values

        meta_test = control_data.iloc[:, meta_idx:]
        y_test = meta_test.loc[:, 'visit_age_mo'].values

        control_data = control_data.groupby(['record_id']).apply(lambda x: split_is_train(x, 1 - test_size))
        X_control_train = control_data.loc[control_data.is_train].iloc[:, :meta_idx]
        y_control_train = control_data.loc[control_data.is_train].iloc[:, meta_idx:].loc[:,'visit_age_mo'].values

        X_control_test = control_data.drop(X_control_train.index).iloc[:,:meta_idx]
        y_control_test = control_data.drop(X_control_train.index).iloc[:, meta_idx:].loc[:,'visit_age_mo'].values
        
        return {"X_train": X_train, "X_test": X_test, "y_train": y_train, 'y_test':y_test,
                "X_control_train": X_control_train,
                "y_control_train": y_control_train,
                "X_control_test": X_control_test,
                "y_control_test": y_control_test,
                "control_data" : control_data,
                'ap_data'      : ap_data
                }
    else:
        ap_data = merge_df
        X = merge_df.iloc[:, :meta_idx].values
        meta = merge_df.iloc[:, meta_idx:]
        y = meta.loc[:, 'visit_age_mo'].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=666)

        return {"X_train": X_train, "X_test": X_test, "y_train": y_train, 'y_test':y_test}
    
def quantaize_age_mo(metadata:pd.DataFrame, r_factor:int = 5, print_hist=False,inplace=False) -> pd.DataFrame:
    """ Given a rounding factor, will round the decimal point of the age in month to the decimal rounoding of that number. 
    for example for r_factor 5 (default) and numbers 0.1,0.2,0.3,0.4,0.5,0.8 will return the list of 0.0,0.5,0.5,0.5,0.5,1.0,
    if print_hist is True, will print histogram of new df visit_age_mo
    """ 
    
    meta_copy = metadata.copy() if not inplace else metadata
    meta_copy.visit_age_mo = ((meta_copy.visit_age_mo/r_factor).round(1)*r_factor).round(1)
    if print_hist:
        unique_visit_mo = meta_copy.groupby('visit_age_mo')['sampleID'].nunique()
        print(unique_visit_mo)
        unique_visit_mo.hist()
    
    return meta_copy

def evaluate(model, features, labels,err_margin=0.3,verbose=True):
    predictions = model.predict(features)
    loss = abs(predictions - labels)
    succ = np.count_nonzero(loss < err_margin) / len(loss)
    
    if verbose:
        print('Model Performance')
        print('Average Error: {:0.4f} .'.format(np.mean(loss)))
        print('Accuracy = {:0.2f}%.'.format(succ))
    
    return succ

def get_random_search_parameters():
    
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = np.linspace(10, 100, num = 5,dtype=int).tolist() + [None]
    # Minimum number of samples required to split a node
    min_samples_split = [5, 10, 20, 40]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [2, 4, 6,10]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    
    
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap,
                   'random_state': [666]}
    return random_grid

def get_linear_regressor(merged_training_df,test_map,pred_name,return_data=False):
    #     test_map = merged_training_df.tt == 'test'
    training_data = merged_training_df.iloc[:,:meta_idx].loc[test_map]
    assert len(training_data) > 0, "test map didn't return any data point"
    predict = merged_training_df[pred_name]
#     predict = best_model.predict(merged_training_df.iloc[:,:meta_idx].loc[test_map].values)
    label = merged_training_df[test_map].visit_age_mo.values
    # X = np.stack([label,predict]).T
    X = label.reshape(-1,1)
    # sort_idxs= np.argsort(label)
    y = predict[test_map]
    # Create linear regression object
    
    regr = linear_model.LinearRegression()
    regr.fit(X,y)
    if return_data:
        return regr,X
    
    return regr
    
def draw_regresssion(merged_training_df,test_map,fig,name='regression',pred_name = 'predict', color=None):
    
    regr,X = get_linear_regressor(merged_training_df,test_map,pred_name,return_data=True)
    reg_predict = regr.predict(X)
    fig.add_trace((go.Scatter(x=X[:,0], y=reg_predict,
                        mode='lines',
                        name=name,
                              marker={"color":color},
                        line=dict(width=3))))
    return fig

def draw_xy_line(fig,max_x=14,line_width=3):
    """
    Draw x=y line on figure frin x=0 to x=max_x
    """
    x = list(np.arange(0,max_x))
    fig.add_trace((go.Scatter(x=x, y=x,
                    mode='lines',
                    name='label',
                    line=dict(width=line_width))))
    return fig

def calc_regressor_control(merge_df,params,n_samples=20,err_margin=1.0,
                           print_accuracy = True,split_control=True,train_ratio=0.8):
    if split_control:
        loss_lst = [list(),list(),list()]
        acc_lst = [list(),list(),list()]
    else:
        loss_lst = [list(),list()]
        acc_lst = [list(),list()]
        
    best_features = list()
    for i in tqdm(range(n_samples)):
        data = get_tt_data(merge_df,split_control=split_control,train_ratio=train_ratio)
    #     control_train_df,control_test_df,test_df = get_tt_data()
        X_train = data[0].iloc[:,:meta_idx].values
        y_train = data[0].visit_age_mo
        best_model=  RandomForestRegressor(**params)
        best_model.fit(X_train,y_train)
            
        for i in range(len(acc_lst)):    
            X = data[i].iloc[:,:meta_idx].values
            y = data[i].visit_age_mo
            predict = best_model.predict(X)

            label = y
            loss_arr = abs(predict-label)
            succ = np.count_nonzero(loss_arr < err_margin) / len(loss_arr)
            loss =  np.mean(loss_arr)

            loss_lst[i].append(loss)
            acc_lst[i].append(succ)
        
        importances = best_model.feature_importances_
        regress_indices = np.argsort(importances)[::-1]
        best_features.append(regress_indices)
        
    if print_accuracy:
        regr_print_avg_acc(loss_lst,acc_lst,split_control)
            
    return loss_lst,acc_lst,best_features

def regr_print_avg_acc(loss_lst,acc_lst,split_control ):
    if len(loss_lst) == 3:
        df_enum = ["control_train_df","control_test_df","test_df"]
    elif len(loss_lst) == 2:
        df_enum = ["train_df","test_df"]
    for i in range(len(loss_lst)):
        print(df_enum[i])
        print(f"AVG loss = {(np.sum(loss_lst[i])/len(acc_lst[i])).round(2)}")
        print(f"AVG acc = {(np.sum(acc_lst[i])/len(acc_lst[i])).round(2)}")
        print(f"MIN acc = {np.min(acc_lst[i]).round(2)}")
        print(f"MAX acc = {np.max(acc_lst[i]).round(2)}")
        print()
        
def prepare_features_pos_score(best_features):
    features_scores_d = defaultdict(lambda : 0)
    for arr in best_features:
        for i,num in enumerate(arr):
            features_scores_d[num] += i
    return features_scores_d

# Data Preparation

We will first load the data from the csv, and merge it with the metadata to have a data frame with first `meta_idx` columns are the features columns, rest of the columns are the metadata.

In [68]:
norm_l7_path = "./data/feature-table-norm-l7.csv"
l7_path = "./data/feature-table-l7.csv"
norm_l6_path = "./data/feature-table-norm-l6.csv"
l6_path = "./data/feature-table-l6.csv"

def get_gmap_data(data_path:str):
    """
    return merge_df,meta_idx
    """
    otu_data = pd.read_csv(data_path,sep='\t',index_col=['OTU ID'])
    examples_data = otu_data.T
    metadata = pd.read_csv("./data/edited_metadata.csv",sep='\t')
    merge_df = examples_data.merge(metadata,left_index=True,right_on=['sampleID'])
    merge_df = merge_df.assign(is_control = (merge_df.symptoms == 'Control'))
    meta_idx = examples_data.shape[1]
    return merge_df,meta_idx
    

In [132]:
merge_df.shape

(988, 514)

In [69]:
merge_df,meta_idx = get_gmap_data(norm_l7_path)
merge_df.head(2)

Unnamed: 0,Unassigned;__;__;__;__;__;__,k__Archaea;__;__;__;__;__;__,k__Archaea;p__Euryarchaeota;__;__;__;__;__,k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__,k__Bacteria;__;__;__;__;__;__,k__Bacteria;p__;c__;o__;f__;g__;s__,k__Bacteria;p__AD3;c__;o__;f__;g__;s__,k__Bacteria;p__Acidobacteria;c__Acidobacteriia;o__Acidobacteriales;f__Acidobacteriaceae;g__Terriglobus;s__,k__Bacteria;p__Acidobacteria;c__Solibacteres;o__Solibacterales;f__;g__;s__,k__Bacteria;p__Actinobacteria;c__Acidimicrobiia;o__Acidimicrobiales;f__;g__;s__,k__Bacteria;p__Actinobacteria;c__Acidimicrobiia;o__Acidimicrobiales;f__AKIW874;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;__;__;__;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;__;__;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinobaculum;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces;s__europaeus,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Arcanobacterium;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Mobiluncus;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__N09;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Trueperella;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Varibaculum;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Brevibacteriaceae;g__Brevibacterium;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Brevibacteriaceae;g__Brevibacterium;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Brevibacteriaceae;g__Brevibacterium;s__paucivorans,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__durum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__kroppenstedtii,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__renale,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__simulans,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__stationis,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__variabile,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dermabacteraceae;g__Brachybacterium;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dermabacteraceae;g__Dermabacter;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dermacoccaceae;g__Dermacoccus;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dietziaceae;g__Dietzia;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Geodermatophilaceae;g__Modestobacter;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Gordoniaceae;g__Gordonia;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Intrasporangiaceae;__;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Intrasporangiaceae;g__Serinicoccus;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Kineosporiaceae;g__Kineococcus;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Microbacteriaceae;g__Leucobacter;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium;s__lacticum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Microbacteriaceae;g__Yonghaparkia;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;__;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Citricoccus;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Kocuria;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Micrococcus;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Nesterenkonia;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Rothia;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Rothia;s__dentocariosa,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Rothia;s__mucilaginosa,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Mycobacteriaceae;g__Mycobacterium;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardiaceae;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardiaceae;g__Rhodococcus;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardioidaceae;__;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium;s__acnes,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium;s__granulosum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Pseudonocardiaceae;g__Actinomycetospora;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Pseudonocardiaceae;g__Pseudonocardia;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Rarobacteraceae;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Yaniellaceae;g__;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Alloscardovia;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__bifidum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__longum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__pseudolongum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__thermacidophilum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Gardnerella;s__,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Scardovia;s__,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__;s__,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Adlercreutzia;s__,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Atopobium;s__,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Atopobium;s__rimae,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__aerofaciens,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella;s__stercoris,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Eggerthella;s__,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Eggerthella;s__lenta,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Slackia;s__,k__Bacteria;p__Actinobacteria;c__Thermoleophilia;o__Gaiellales;f__Gaiellaceae;g__;s__,k__Bacteria;p__Bacteroidetes;__;__;__;__;__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;__;__;__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__;g__;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;__;__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__acidifaciens,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__caccae,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__coprophilus,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__eggerthii,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__fragilis,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__ovatus,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__plebeius,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Candidatus Azobacteroides;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Dysgonomonas;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Dysgonomonas;s__gadei,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides;s__distasonis,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides;s__gordonii,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Porphyromonas;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__copri,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__nanceiensis,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__pallens,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__stercorea,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__AF12;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__PW3;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__S24-7;g__;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Barnesiellaceae];g__;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Odoribacteraceae];g__Butyricimonas;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Odoribacteraceae];g__Odoribacter;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Paraprevotellaceae];g__Paraprevotella;s__,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Paraprevotellaceae];g__[Prevotella];s__,k__Bacteria;p__Bacteroidetes;c__Cytophagia;o__Cytophagales;f__Cytophagaceae;g__;s__,k__Bacteria;p__Bacteroidetes;c__Cytophagia;o__Cytophagales;f__Cytophagaceae;g__Hymenobacter;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;__;__;__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Cryomorphaceae;g__;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Capnocytophaga;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Flavobacterium;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Flavobacterium;s__succinicans,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__[Weeksellaceae];g__;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__[Weeksellaceae];g__Chryseobacterium;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__[Weeksellaceae];g__Cloacibacterium;s__,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__[Weeksellaceae];g__Elizabethkingia;s__,k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia;o__Sphingobacteriales;f__Sphingobacteriaceae;__;__,k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia;o__Sphingobacteriales;f__Sphingobacteriaceae;g__Pedobacter;s__,k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia;o__Sphingobacteriales;f__Sphingobacteriaceae;g__Pedobacter;s__cryoconitis,k__Bacteria;p__Bacteroidetes;c__[Saprospirae];o__[Saprospirales];f__Chitinophagaceae;g__;s__,k__Bacteria;p__Bacteroidetes;c__[Saprospirae];o__[Saprospirales];f__Chitinophagaceae;g__Flavisolibacter;s__,k__Bacteria;p__Bacteroidetes;c__[Saprospirae];o__[Saprospirales];f__Chitinophagaceae;g__Sediminibacterium;s__,k__Bacteria;p__Cyanobacteria;__;__;__;__;__,k__Bacteria;p__Cyanobacteria;c__4C0d-2;o__;f__;g__;s__,k__Bacteria;p__Cyanobacteria;c__4C0d-2;o__MLE1-12;f__;g__;s__,k__Bacteria;p__Cyanobacteria;c__4C0d-2;o__YS2;f__;g__;s__,k__Bacteria;p__Cyanobacteria;c__Chloroplast;o__Stramenopiles;f__;g__;s__,k__Bacteria;p__Cyanobacteria;c__Chloroplast;o__Streptophyta;f__;g__;s__,k__Bacteria;p__Cyanobacteria;c__Nostocophycideae;o__;f__;g__;s__,k__Bacteria;p__Cyanobacteria;c__Nostocophycideae;o__Nostocales;f__Scytonemataceae;g__;s__,k__Bacteria;p__Cyanobacteria;c__Oscillatoriophycideae;o__Chroococcales;f__Xenococcaceae;g__;s__,k__Bacteria;p__Cyanobacteria;c__Oscillatoriophycideae;o__Chroococcales;f__Xenococcaceae;g__Chroococcidiopsis;s__,k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacteraceae;g__Mucispirillum;s__schaedleri,k__Bacteria;p__FBP;c__;o__;f__;g__;s__,k__Bacteria;p__Firmicutes;__;__;__;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;__;__;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;__;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Bacillus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Bacillus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Bacillus;s__horikoshii,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Geobacillus;s__vulcani,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Listeriaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Paenibacillaceae;g__Paenibacillus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Paenibacillaceae;g__Paenibacillus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Paenibacillaceae;g__Paenibacillus;s__barengoltzii,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Paenibacillaceae;g__Paenibacillus;s__macerans,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Planococcaceae;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Macrococcus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;s__lugdunensis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Thermoactinomycetaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__[Exiguobacteraceae];g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__[Thermicanaceae];g__Thermicanus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Gemellales;f__Gemellaceae;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Gemellales;f__Gemellaceae;g__Gemella;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;__;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__;g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Aerococcaceae;g__Alloiococcus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Aerococcaceae;g__Facklamia;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Aerococcaceae;g__Marinilactibacillus;s__psychrotolerans,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Carnobacteriaceae;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Carnobacteriaceae;g__Granulicatella;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Enterococcus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Enterococcus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Vagococcus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;__;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__acidipiscis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__agilis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__brevis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__coleohominis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__delbrueckii,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__helveticus,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__iners,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__manihotivorans,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__mucosae,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__paralimentarius,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__reuteri,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__ruminis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__salivarius,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__vaginalis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__zeae,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Pediococcus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Leuconostocaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Leuconostocaceae;g__Leuconostoc;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Leuconostocaceae;g__Weissella;s__viridescens,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Lactococcus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Lactococcus;s__garvieae,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__agalactiae,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__alactolyticus,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__anginosus,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__infantis,k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__luteciae,k__Bacteria;p__Firmicutes;c__Bacilli;o__Turicibacterales;f__Turicibacteraceae;g__Turicibacter;s__,k__Bacteria;p__Firmicutes;c__Clostridia;__;__;__;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;__;__;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Christensenellaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Christensenellaceae;g__Christensenella;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;__;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__02d06;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__butyricum,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__hiranonis,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__intestinale,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__neonatale,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__perfringens,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__subterminale,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__SMB53;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Sarcina;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Eubacteriaceae;g__Anaerofustis;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Eubacteriaceae;g__Pseudoramibacter_Eubacterium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;__;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Anaerostipes;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Blautia;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Blautia;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Blautia;s__obeum,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Blautia;s__producta,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus;s__catus,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus;s__eutactus,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Dorea;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Dorea;s__formicigenerans,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Epulopiscium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Lachnobacterium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Lachnospira;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Moryella;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Moryella;s__indoligenes,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Oribacterium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Roseburia;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Roseburia;s__faecis,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__gnavus,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__torques,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptococcaceae;g__Peptococcus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptococcaceae;g__rc4-4;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae;g__Filifactor;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae;g__Peptostreptococcus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae;g__Peptostreptococcus;s__anaerobius,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;__;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Anaerofilum;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Anaerotruncus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium;s__prausnitzii,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus;s__bromii,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus;s__callidus,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus;s__flavefaciens,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;__;__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Acidaminococcus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Dialister;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Megamonas;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Megasphaera;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Mitsuokella;s__multacida,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Phascolarctobacterium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Selenomonas;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Selenomonas;s__noxia,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Veillonella;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Veillonella;s__dispar,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Veillonella;s__parvula,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Mogibacteriaceae];g__;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Mogibacteriaceae];g__Anaerovorax;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Mogibacteriaceae];g__Mogibacterium;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__1-68;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Anaerococcus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Finegoldia;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Gallicola;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Parvimonas;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Peptoniphilus;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__WAL_1855D;s__,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__ph2;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Allobaculum;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Bulleidia;s__moorei,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Bulleidia;s__p-1630-c5,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Catenibacterium;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Coprobacillus;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Coprobacillus;s__cateniformis,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Holdemania;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Eubacterium];s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Eubacterium];s__biforme,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Eubacterium];s__dolichum,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__cc_115;s__,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__p-75-a5;s__,k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;__;__,k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__;s__,k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Cetobacterium;s__somerae,k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium;s__,k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Leptotrichiaceae;g__Leptotrichia;s__,k__Bacteria;p__OD1;c__;o__;f__;g__;s__,k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Gemmatales;f__Isosphaeraceae;g__;s__,k__Bacteria;p__Proteobacteria;__;__;__;__;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae;__;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae;g__Brevundimonas;s__diminuta,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae;g__Caulobacter;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae;g__Mycoplana;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae;g__Phenylobacterium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__RF32;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Aurantimonadaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Beijerinckiaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;__;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Balneimonas;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Bosea;s__genosp.,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Bradyrhizobium;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Bradyrhizobium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Brucellaceae;g__Ochrobactrum;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Hyphomicrobiaceae;g__Hyphomicrobium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylobacteriaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylobacteriaceae;g__Methylobacterium;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylobacteriaceae;g__Methylobacterium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylobacteriaceae;g__Methylobacterium;s__adhaesivum,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylobacteriaceae;g__Methylobacterium;s__komagatae,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylocystaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__Aminobacter;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__Defluvibacter;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Rhizobiaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Rhizobiaceae;g__Agrobacterium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Amaricoccus;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Paracoccus;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Paracoccus;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Paracoccus;s__aminovorans,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Rhodobacter;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Rubellimicrobium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Roseomonas;s__mucosa,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__mitochondria;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;__;__;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;__;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Novosphingobium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingobium;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingobium;s__yanoikuyae,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas;__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas;s__asaccharolytica,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas;s__echinoides,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas;s__wittichii,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingopyxis;s__,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingopyxis;s__alaskensis,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Alcaligenaceae;__;__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Alcaligenaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Alcaligenaceae;g__Sutterella;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Burkholderia;s__andropogonis,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Lautropia;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;__;__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Aquabacterium;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Comamonas;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Curvibacter;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Polaromonas;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Cupriavidus;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Janthinobacterium;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Janthinobacterium;s__lividum,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Ralstonia;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Eikenella;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Kingella;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Neisseria;__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Neisseria;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Neisseria;s__cinerea,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Neisseria;s__subflava,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;__;__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__Methyloversatilis;s__,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__Zoogloea;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Bdellovibrionales;f__Bacteriovoracaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Bdellovibrionales;f__Bdellovibrionaceae;g__Bdellovibrio;s__bacteriovorus,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__Bilophila;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__Desulfovibrio;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Myxococcales;f__0319-6G20;g__;s__,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Myxococcales;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Campylobacter;__,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Campylobacter;s__,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Campylobacter;s__ureolyticus,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Flexispira;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;__;__;__;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Aeromonadales;f__Aeromonadaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Aeromonadales;f__Succinivibrionaceae;g__Succinivibrio;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Shewanellaceae;g__Shewanella;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__[Chromatiaceae];g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Ectothiorhodospiraceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;__;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Brenneria;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Citrobacter;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Dickeya;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Enterobacter;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Erwinia;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Erwinia;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Erwinia;s__oleae,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Klebsiella;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Klebsiella;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Klebsiella;s__oxytoca,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Morganella;s__morganii,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Plesiomonas;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Proteus;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Providencia;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Salmonella;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Salmonella;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Serratia;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Serratia;s__marcescens,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Trabulsiella;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Trabulsiella;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Legionellales;f__Legionellaceae;g__Legionella;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;g__Halomonas;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;__;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Actinobacillus;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Actinobacillus;s__parahaemolyticus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Actinobacillus;s__porcinus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Aggregatibacter;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Aggregatibacter;s__segnis,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Bibersteinia;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Haemophilus;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Haemophilus;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Haemophilus;s__influenzae,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Haemophilus;s__parainfluenzae,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__guillouiae,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__johnsonii,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__lwoffii,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__rhizosphaerae,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Enhydrobacter;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Moraxella;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Psychrobacter;s__pulmonis,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;__;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__fragi,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__veronii,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__viridiflava,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Vibrio;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Sinobacteraceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__;s__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Stenotrophomonas;__,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Stenotrophomonas;s__acidaminiphila,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Stenotrophomonas;s__geniculata,k__Bacteria;p__SAR406;c__AB16;o__SSW63Au;f__;g__;s__,k__Bacteria;p__Spirochaetes;c__Spirochaetes;o__Spirochaetales;f__Spirochaetaceae;g__Treponema;s__,k__Bacteria;p__Synergistetes;c__Synergistia;o__Synergistales;f__Dethiosulfovibrionaceae;g__Pyramidobacter;s__piscolens,k__Bacteria;p__Synergistetes;c__Synergistia;o__Synergistales;f__Synergistaceae;g__Cloacibacillus;s__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__;f__;g__;s__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Acholeplasmatales;f__Acholeplasmataceae;g__Candidatus Phytoplasma;s__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Anaeroplasmatales;f__Anaeroplasmataceae;g__Anaeroplasma;s__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Mycoplasmatales;f__Mycoplasmataceae;g__Ureaplasma;s__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__RF39;f__;g__;s__,k__Bacteria;p__Tenericutes;c__Mollicutes;o__RsaHF231;f__;g__;s__,k__Bacteria;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Verrucomicrobiaceae;g__Akkermansia;s__muciniphila,k__Bacteria;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Verrucomicrobiaceae;g__Luteolibacter;s__,k__Bacteria;p__Verrucomicrobia;c__[Methylacidiphilae];o__;f__;g__;s__,k__Bacteria;p__Verrucomicrobia;c__[Spartobacteria];o__[Chthoniobacterales];f__[Chthoniobacteraceae];g__;s__,k__Bacteria;p__WPS-2;c__;o__;f__;g__;s__,sampleID,sample_time,record_id,visit_age_mo,diet,diet0mo,symptoms,mode_of_delivery,gender,race_final,probiotics_firstyr,antacid_ppi_firstyr,antacid_h2_firstyr,antacid_firstyr,age_ap_resolution_day,age_diag_ap_day,age_sx_onset_day,case_id,ever_formula,gestational_age,antibiotics_during_delivery,is_control
26,5.9e-05,0.0,0.0,0.0,0.000158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000158,0.0,0.0,0.0,0.0,0.000118,0.0,3.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000947,0.324323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9e-05,0.0,0.0,0.0,0.000434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005209,0.000868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.000257,0.0,0.0,0.0,0.0,0.0,0.001302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sub.110.0.initial,initial,110,0.2,Exclusively BF,Exclusively BF,Control,Vaginal,Female,Multiple Race,0.0,0,0,0.0,,,,No AP,2.0,>37WGA,Yes,True
47,3.1e-05,0.0,0.0,0.0,0.000817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000534,0.0,0.0,0.0,0.0,0.000566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.1e-05,0.0,0.00011,0.139601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000566,0.00044,0.0,0.0,0.000204,0.0,0.0,0.0,0.0,0.0,0.0,0.000471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041616,0.000864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016197,0.001587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.716117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sub.110.1.two.week,twoweek,110,0.5,Exclusively BF,Exclusively BF,Control,Vaginal,Female,Multiple Race,0.0,0,0,0.0,,,,No AP,2.0,>37WGA,Yes,True


In [64]:
# Get the number of records in the data
records_m = merge_df.groupby('record_id').agg('first')
print(records_m.shape)

(164, 513)


In [35]:
# Get a count of the sample_time  categories
st_g = merge_df.groupby(['sample_time']).agg({'sample_time':['first',"count"]}).reset_index(drop=True)
st_g.columns = st_g.columns.droplevel(0)

# Sort the sample_time so the histogram would be sorted 
st_g['time_id'] = st_g['first'].map(lambda x: SAMPLE_TIME_SORTED.index(x))
st_g = st_g.sort_values(['time_id']).reset_index(drop=True)

counts = st_g['count']
bins = st_g['time_id']
fig = go.Figure(go.Bar(x=bins, y=counts))
fig.data[0].text = counts
fig.update_traces(textposition='inside', textfont_size=12)
fig.update_layout(bargap=0)


fig.update_traces(marker_color='blue', marker_line_color='blue',
                  marker_line_width=1, opacity=0.4)

fig.show()

In [48]:
# data_df = merge_df.iloc[:,:meta_idx]
# tmp_df = data_df.iloc[26:29]
# t = tmp_df[tmp_df > 0]
# t.stack().unstack().fillna(0).T
# data_df[:,(data_df.iloc[26] < 0).values]
# merge_df[:,(merge_df.iloc[26,:meta_idx]>0).values]


Unnamed: 0,60,398,442
k__Bacteria;__;__;__;__;__;__,0.000137,0.001699,0.000382
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces;s__,0.016953,0.019638,0.000989
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Micrococcaceae;g__Rothia;s__mucilaginosa,0.000195,0.00108,0.000562
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;__,0.00041,0.217324,0.358017
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__,0.000527,0.001969,0.001371
k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;__,0.001074,0.003477,0.00519
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Enterococcus;__,0.001348,0.000333,0.000831
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;__,0.000742,0.000603,0.001281
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,0.002305,0.01043,0.004201
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__alactolyticus,0.081895,0.306609,0.192411


<b> splitting to train test </b>
<br></br>
In order to get a better understading on our data, we will first try to split the data to train/test on the **control sample data** only. The rest will be used for testing.

In order to not contaminant the training data, we will split to train/test after **groupping by the record_id**

In [70]:

def split_train_test(x,split_control=True,train_ratio=0.75):
#     import ipdb;ipdb.set_trace()
    rand_val = np.random.rand()
    is_train = rand_val < train_ratio
    if not split_control:
        tt = 'train' if is_train else 'test'
        
    else:
        if not any(x.is_control):
            tt='test'
        else:
            tt = 'control_train' if is_train else 'control_test'
        
    x = x.assign(tt=tt)
    return x

def get_merged_tt_df(merge_df,split_control=True,train_ratio=0.85,inplace=False):
    """
    Add tt division for the given dataframe with ratio. group by record_id to make sure same baby won't be both in train and test.
    """
    
    tt_df =  merge_df.groupby(['is_control','record_id']).apply(lambda x: split_train_test(x,split_control,train_ratio)).reset_index(drop=True)
    
    if inplace:
        
        merge_df['tt'] = tt_df.tt.reset_index(drop=True)
    else:
        return tt_df
    
def get_tt_data(merge_df,split_control=True,train_ratio=0.85):
    """
    Return a tuple of control_train_df,control_test_df,test_df
    """
    merged_training_df = get_merged_tt_df(merge_df,split_control,train_ratio)
    test_df = merged_training_df[merged_training_df.tt == 'test']
    if split_control:
        control_train_df = merged_training_df[merged_training_df.tt == 'control_train']
        control_test_df = merged_training_df[merged_training_df.tt == 'control_test']
        return control_train_df,control_test_df,test_df
    else:
        train_df = merged_training_df[merged_training_df.tt == 'train']
        return train_df,test_df

def get_regression_predict(merge_df,best_regress_params,split_control=True,train_ratio=0.85):
    """ Train and predict on a regression model with the regress_params. 
    Add the columns predict,label,loss to the dataframe and return it.
    """
    regression_model=  RandomForestRegressor(**best_regress_params)

    data = get_tt_data(merge_df,split_control=split_control,train_ratio=0.85)
    X = data[0].iloc[:,:meta_idx].values
    y = data[0].visit_age_mo
    regression_model.fit(X,y)

    merged_training_df = pd.concat(data)
    cols = merged_training_df.columns.tolist()

    predict = regression_model.predict(merged_training_df.iloc[:,:meta_idx].values)
    label = merged_training_df.visit_age_mo
    loss_arr = abs(predict-label)
    merged_training_df = merged_training_df.assign(predict=predict, label=label, loss=loss_arr)
    return merged_training_df


# Extracting best features for age prediction model

## Regression

We will First try to train a random forest on a regression task.

**task**:
Given the bacteria samples as data, `visit_age_mo` (time in month when the sample was taken) we will try to create a regression model to predict the correct age of the sample.

<br></br>
**Model Params**:

We got the model params after running a cross validation with random search over the possible parameters.
```json
{
            'random_state': 666,
             'n_estimators': 1800,
             'min_samples_split': 5,
             'min_samples_leaf': 2,
             'max_features': 'sqrt',
             'max_depth': 100,
             'bootstrap': False}
```

<br></br>
Evaluating model - Predicting exact age in mo was too hard of a task. In order to smooth the results I added a `margin` option. The margin is the error that will be acceptable when evaluating the model (After the model was built and trained). 
For example `age_in_mo`= 1.3, `prediction=1.9`and `margin=1.0` the evaluation will be that this was a correct prediction.

In [71]:
def get_random_function(merged_training_df,n_bins=50):
    
    X = merged_training_df.iloc[:,:meta_idx].values
    label = merged_training_df.visit_age_mo

    values, bins = np.histogram(label, bins=n_bins)
    prob = values/np.sum(values)
    return lambda size: np.random.choice(bins[1:],size=size,p=prob)

best_regress_params = {
            'random_state': 666,
             'n_estimators': 1800,
             'min_samples_split': 5,
             'min_samples_leaf': 2,
             'max_features': 'sqrt',
             'max_depth': 100,
             'bootstrap': False}

### Comparing different margins

As we mentioned before, the margin is the error we we let our model to do and still count it as correct. 
Below we will test different margins and see the reslts compared to a random choice with the same margin

In [83]:

acc_test_lst = list()
acc_random_lst = list()
margin_lst = list()
merged_training_df = get_regression_predict(merge_df,best_regress_params,False,0.85)
label = merged_training_df.visit_age_mo
predict = merged_training_df.predict

test_trained_df = merged_training_df[merged_training_df.tt == 'test']
random_func = get_random_function(test_trained_df)
for i in tqdm(range(21)):
    margin = 0.25*i
    margin_lst.append(margin)
    
    label = test_trained_df.visit_age_mo
    predict = test_trained_df.predict

    #Calculate model loss and accuracy
    loss = abs(predict-label)
    test_acc = np.count_nonzero(loss < margin) / len(loss)
#     acc_control.append(train_acc[0])
    acc_test_lst.append(test_acc)
    

    choice_random =random_func(size=len(predict))
    #Calculate random loss and accuracy
    loss = abs(choice_random-label)
    acc_random = np.count_nonzero(loss < margin) / len(loss)
    acc_random_lst.append(acc_random)


acc_test_arr = np.array(acc_test_lst).round(3)
acc_random_arr = np.array(acc_random_lst).round(3)

100%|██████████| 21/21 [00:00<00:00, 384.29it/s]


In [84]:
acc_test_arr_bckp = acc_test_arr.copy()
acc_random_arr_bckp = acc_random_arr.copy()
margin_lst_bckp = margin_lst.copy()

acc_test_arr = acc_test_arr[::2]
acc_random_arr = acc_random_arr[::2]
margin_lst = margin_lst[::2]

In [95]:
acc_test_df = pd.DataFrame({"accuracy":acc_test_arr,"pred_type":'RegressionForst','margin':margin_lst})
acc_rand_df = pd.DataFrame({"accuracy":acc_random_arr,"pred_type":'random','margin':margin_lst})
acc_df = pd.concat([acc_test_df,acc_rand_df])
# acc_df['margin'] = margin_lst
# px.line(acc_df,x='margin',y='accuracy',color='pred_type')
layout = dict(plot_bgcolor='white',
              xaxis=dict(title='$\lambda\ (margin)$',
                         showgrid=True),
              yaxis=dict(title='Accuracy',
                         showgrid=True))


fig = go.Figure(data=go.Scatter(x=acc_test_df['margin'], y=acc_test_df['accuracy']
                               , text=acc_test_df['accuracy']
                               , textposition='top right'
                               , mode='lines+markers+text'
                               , name ='Regression test acc'),
                layout=layout)

fig.add_trace(go.Scatter(x=acc_rand_df['margin'], y=acc_rand_df['accuracy']
                         , text=acc_rand_df['accuracy']
                         , textposition='top right'
                         , mode='lines+markers+text'
                         , name ='Random test acc'))
# fig.data[0].text = 



#line(acc_rand_df,x='margin',y='accuracy',color='pred_type')

### Training on different data resources
We tried to train on different data sources: with/without normalization and on l6/l7

In [11]:
print("Normed l6 results")
merge_df, meta_idx = get_gmap_data(norm_l6_path)
_ = calc_regressor_control(merge_df,best_regress_params,split_control=False,train_ratio=0.8,n_samples=20)

print("Not-Normed l6 results")
merge_df, meta_idx = get_gmap_data(l6_path)
_ = calc_regressor_control(merge_df,best_regress_params,split_control=False,train_ratio=0.8,n_samples=20)

print("Normed l7 results")
merge_df, meta_idx = get_gmap_data(norm_l7_path)
_ = calc_regressor_control(merge_df,best_regress_params,split_control=False,train_ratio=0.8,n_samples=20)

print("Not-Normed l7 results")
merge_df, meta_idx = get_gmap_data(l7_path)
_ = calc_regressor_control(merge_df,best_regress_params,split_control=False,train_ratio=0.8,n_samples=20)

  0%|          | 0/20 [00:00<?, ?it/s]

Normed l6 results


100%|██████████| 20/20 [03:12<00:00,  9.65s/it]


train_df
AVG loss = 0.48
AVG acc = 0.92
MIN acc = 0.91
MAX acc = 0.92

test_df
AVG loss = 1.67
AVG acc = 0.39
MIN acc = 0.31
MAX acc = 0.44

Not-Normed l6 results


100%|██████████| 20/20 [02:30<00:00,  7.52s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

train_df
AVG loss = 0.49
AVG acc = 0.9
MIN acc = 0.89
MAX acc = 0.91

test_df
AVG loss = 1.72
AVG acc = 0.36
MIN acc = 0.32
MAX acc = 0.4

Normed l7 results


100%|██████████| 20/20 [03:05<00:00,  9.28s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

train_df
AVG loss = 0.45
AVG acc = 0.94
MIN acc = 0.93
MAX acc = 0.94

test_df
AVG loss = 1.71
AVG acc = 0.38
MIN acc = 0.32
MAX acc = 0.44

Not-Normed l7 results


100%|██████████| 20/20 [02:39<00:00,  8.00s/it]

train_df
AVG loss = 0.46
AVG acc = 0.93
MIN acc = 0.92
MAX acc = 0.94

test_df
AVG loss = 1.75
AVG acc = 0.38
MIN acc = 0.3
MAX acc = 0.45






### Most important features

The Tree regression model gives us score to the importance of different features. Below are a plot of the features (by number) and the number of times it appeared as one of the top 20 most important features.

In [143]:
loss_lst,acc_lst,best_features = calc_regressor_control(merge_df,best_regress_params,split_control=False,train_ratio=0.85,n_samples=20)

100%|██████████| 20/20 [02:02<00:00,  6.14s/it]

train_df
AVG loss = 0.45
AVG acc = 0.93
MIN acc = 0.92
MAX acc = 0.94

test_df
AVG loss = 1.63
AVG acc = 0.4
MIN acc = 0.32
MAX acc = 0.47






In [144]:
n_top = 30
best_regr_features = np.array(best_features)[:,:n_top]
clf_unique, clf_count = np.unique(best_regr_features,return_counts=True)
top_regr_repeat_features = clf_unique[clf_count>=20]
print(f"Num of features that repeated at least 20 times {top_regr_repeat_features.shape}")
df = pd.DataFrame({"clf_unique":clf_unique,'clf_count':clf_count,'above_threshold':False})
df.above_threshold[df.clf_count>=20] = True
px.bar(df, x='clf_unique',y='clf_count',color='clf_count',color_continuous_scale='Bluered_r')

Num of features that repeated at least 20 times (23,)


**Top regression features**
<br>
We will now calculate the top regression features, where we will order them according to the average position score in all the runs. where for each run, for each feature we sum the positions it appeared in, and average over the num of top features we was choosing. In the original ordering, the order is dedencing in importance, though I didn't save the actuall importance value the feature got

In [150]:
best_regr_features.shape

(20, 30)

In [164]:
pd.set_option('display.max_colwidth', None)
features_scores_d = prepare_features_pos_score(best_regr_features)
df = pd.DataFrame({"feature":merge_df.iloc[:,top_regr_repeat_features].columns.tolist(),"feature_num":top_regr_repeat_features})
df['pos_score'] = df.feature_num.map(lambda x : features_scores_d[x])
df.sort_values(['pos_score'],inplace=True)
df.pos_score = (df.pos_score/df.pos_score.max())*n_top
sorted_regress_features = df.feature_num.tolist()
df[['feature','pos_score']].reset_index(drop=True)

Unnamed: 0,feature,pos_score
0,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;__,0.0
1,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__gnavus,2.743191
2,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae;g__;s__,2.801556
3,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__;s__,3.093385
4,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus;s__,4.494163
5,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__,5.603113
6,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Eggerthella;s__lenta,7.178988
7,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__;s__,7.879377
8,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Eubacterium];s__dolichum,9.105058
9,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Dorea;s__,10.856031


**Plotting the regressions results**
<br></br>
We will draw the regression results. Below in the x-axis we have the sample `visit_age_mo`, and in the y-axis we have the model `predict` 

<br></br>
On top of the points there are regrssion lines, splitted to different test\train categories. 
Click on a label from the legend to clear from the plot

In [137]:
merged_training_df = get_regression_predict(merge_df,best_regress_params,True,0.85)
merged_training_df.loc[pd.isna(merged_training_df.symptoms),'symptoms']='unknown'
fig = px.scatter(merged_training_df,x='visit_age_mo',y='predict',color='tt',custom_data=['symptoms','sampleID'])
fig.update_traces(
    hovertemplate="<br>".join([
        "predict %{y}, label: %{x}",
        "sampleID: %{customdata[1]}",
        "symptom: %{customdata[0]}"
    ])
)
# data = fig['data']
for data in fig['data']:
    legendgroup = data['legendgroup']
    draw_regresssion(merged_training_df,merged_training_df.tt ==legendgroup,fig,f'{legendgroup} regression',color=data['marker']['color'])

draw_xy_line(fig)


### Comparing to random model

In order to know if our model is doing well we can try to compare to a random model. We will build the random model by sampling by weighted uniform over all the possible existing bins, where the weight $W_{i}$ for example ${i}$ where the value for example ${i}$ is $V_{i}$, is: $W_{i}=\frac{V_{i}}{\sum \limits _{j} ^{} {V_{j}}}$

In [50]:
merged_training_df = get_regression_predict(merge_df,best_regress_params,False,0.75)
cols = merged_training_df.columns.tolist()
err_margin = 1.0
# merged_training_df = merged_training_df.loc[merged_training_df.tt == 'test']
# X = merged_training_df.iloc[:,:meta_idx].values
label = merged_training_df.visit_age_mo
predict = merged_training_df.predict


if 'predict' in merged_training_df: 
    # Can't use wide_to_long with a 'predict' column in the original df because it will try to remove the prefix and create a predict column
    merged_training_df.drop(columns='predict',inplace=True)
    cols = merged_training_df.columns.tolist()
    

choice_random = get_random_function(merged_training_df)(len(predict))

#Calculate model loss and accuracy
loss = abs(predict-label)
succ = np.count_nonzero(loss < err_margin) / len(loss)
print(f"AVG loss = {np.average(loss)}")
print(f"ACC  = {succ}")

#Calculate random loss and accuracy
loss = abs(choice_random-label)
succ = np.count_nonzero(loss < err_margin) / len(loss)
print(f"Tree Random AVG loss = {np.average(loss)}")
print(f"Tree Random ACC  = {succ}")

merged_training_df.loc[pd.isna(merged_training_df.symptoms),'symptoms']='unknown'

merged_training_df = merged_training_df.assign(predict_model=predict, label=label, predict_random=choice_random)

#merge the columns predict_model and predict_random with a corresponding label 'pred_type' to know where a prediction came from
merged_training_df['id'] = merged_training_df.index


#Keep all the columns except for predict_model and predict_random and join them to one column 'predict'
predict_melt_df = pd.wide_to_long(merged_training_df,['predict'],i='id',j='pred_type',sep='_',suffix='\w+').reset_index()
predict_melt_df.pred_type =  predict_melt_df.pred_type + "_" + predict_melt_df.tt

# #Plot results on graph
# # fig = px.scatter(predict_melt_df[predict_melt_df.tt == 'test'],x='visit_age_mo',y='predict',color='pred_type',custom_data=['symptoms','sampleID'])
test_df = predict_melt_df[(predict_melt_df.pred_type == 'model_test') | (predict_melt_df.pred_type == 'random_test')]
fig = px.scatter(test_df,x='visit_age_mo',y='predict',color='pred_type',custom_data=['symptoms','sampleID'])
fig.update_traces(
    hovertemplate="<br>".join([
        "predict %{y}, label: %{x}",
        "sampleID: %{customdata[1]}",
        "symptom: %{customdata[0]}"
    ])
)

for data in fig['data']:
    legendgroup = data['legendgroup']
    draw_regresssion(test_df,test_df.pred_type == legendgroup,fig,f'{legendgroup} regression',color=data['marker']['color'])
    
x = list(np.arange(0,14))
fig.add_trace((go.Scatter(x=x, y=x,
                    mode='lines',
                    name='label',
                    line=dict(width=3))))
fig.update_layout(title='Test points - Forest VS random')

AVG loss = 0.6691337923171988
ACC  = 0.8423236514522822
Tree Random AVG loss = 4.093033195020746
Tree Random ACC  = 0.22510373443983403


In [158]:
merged_training_df = get_regression_predict(merge_df,best_regress_params,False,0.85)
cols = merged_training_df.columns.tolist()
err_margin = 1.0
# merged_training_df = merged_training_df.loc[merged_training_df.tt == 'test']
# X = merged_training_df.iloc[:,:meta_idx].values
label = merged_training_df.visit_age_mo
predict = merged_training_df.predict


if 'predict' in merged_training_df: 
    # Can't use wide_to_long with a 'predict' column in the original df because it will try to remove the prefix and create a predict column
    merged_training_df.drop(columns='predict',inplace=True)
    cols = merged_training_df.columns.tolist()
    

choice_random = get_random_function(merged_training_df)(len(predict))

#Calculate model loss and accuracy
loss = abs(predict-label)
succ = np.count_nonzero(loss < err_margin) / len(loss)
print(f"AVG loss = {np.average(loss)}")
print(f"ACC  = {succ}")

#Calculate random loss and accuracy
loss = abs(choice_random-label)
succ = np.count_nonzero(loss < err_margin) / len(loss)
print(f"Tree Random AVG loss = {np.average(loss)}")
print(f"Tree Random ACC  = {succ}")

merged_training_df.loc[pd.isna(merged_training_df.symptoms),'symptoms']='unknown'

merged_training_df = merged_training_df.assign(predict_model=predict, label=label, predict_random=choice_random)

#merge the columns predict_model and predict_random with a corresponding label 'pred_type' to know where a prediction came from
merged_training_df['id'] = merged_training_df.index


#Keep all the columns except for predict_model and predict_random and join them to one column 'predict'
predict_melt_df = pd.wide_to_long(merged_training_df,['predict'],i='id',j='pred_type',sep='_',suffix='\w+').reset_index()
predict_melt_df.pred_type =  predict_melt_df.pred_type + "_" + predict_melt_df.tt

# #Plot results on graph
# # fig = px.scatter(predict_melt_df[predict_melt_df.tt == 'test'],x='visit_age_mo',y='predict',color='pred_type',custom_data=['symptoms','sampleID'])
fig = px.scatter(predict_melt_df,x='visit_age_mo',y='predict',color='pred_type',custom_data=['symptoms','sampleID'])
fig.update_traces(
    hovertemplate="<br>".join([
        "predict %{y}, label: %{x}",
        "sampleID: %{customdata[1]}",
        "symptom: %{customdata[0]}"
    ])
)

for data in fig['data']:
    legendgroup = data['legendgroup']
    draw_regresssion(predict_melt_df,predict_melt_df.pred_type == legendgroup,fig,f'{legendgroup} regression',color=data['marker']['color'])
    
x = list(np.arange(0,14))
fig.add_trace((go.Scatter(x=x, y=x,
                    mode='lines',
                    name='label',
                    line=dict(width=3))))
fig.update_layout(title='Test points - Forest VS random')

AVG loss = 0.6256164291591951
ACC  = 0.8481781376518218
Tree Random AVG loss = 4.2490000000000006
Tree Random ACC  = 0.22064777327935223


### Conclusion:

Regression task is quite hard, accuracy for test was not higher than 40% with `margin=1.0`

Regression is much better than random and we can see this by the blue pattern above. The reason we get low accuracy is because though the regression line is close to true label, the margin is larger than we look for

## Classification

In [72]:
def get_avg_accuracy(clf, merge_df, meta_idx, num_runs,y = None, train_ratio=0.8, copy_df=False, verbose = True):
    """
    Returns:
        avg_train_acc, avg_test_acc
    """
    if copy_df:
        merge_df = merge_df.copy()
        
    train_acc_arr = list()
    test_acc_arr = list()
    y = 'sample_time' if y is None else y
    for_func = tqdm(range(num_runs)) if verbose else range(num_runs)
    for i in for_func:
        train_acc, test_acc = get_classifier_accuracy(clf, merge_df,meta_idx,y=y,train_ratio=train_ratio)
        
        train_acc_arr.append(train_acc)
        test_acc_arr.append(test_acc)
    
    avg_train_acc = np.average(train_acc_arr)
    avg_test_acc = np.average(test_acc_arr)
    
    if verbose:
        print(f"Train AVG accuracy {avg_train_acc}")
        print(f"Test AVG accuracy {avg_test_acc}")
        
    return avg_train_acc,avg_test_acc

def get_classifier_accuracy(clf,merge_df_tt,meta_idx,y=None,train_ratio=0.8,split_tt=True,return_all = False):
    """get Classification classifier accuracy by training on the given dataframe. 
    Args:
        split_tt: If True, will give the merge_df_tt a new tt assignment. Otherwise, assume that the DataFrame already
        has tt and uses it.
        return_all: If true, will return a dictionary with "test_pred", "train_pred", "merge_df_tt"
    Returns:
        train_acc,test_acc
    """
    y = 'sample_time' if y is None else y
    if split_tt:
        get_merged_tt_df(merge_df_tt,split_control=False, train_ratio=train_ratio,inplace=True)
    else:
        assert 'tt' in merge_df_tt, "Must pass a df with 'tt' division if split_tt is False "
    
    train_df = merge_df_tt[merge_df_tt.tt == 'train']
    test_df  = merge_df_tt[merge_df_tt.tt == 'test']

    X_train = train_df.iloc[:,:meta_idx].values
    y_train = train_df[y].values

    X_test = test_df.iloc[:,:meta_idx].values
    y_test = test_df[y].values

#     clf = RandomForestClassifier(**best_params)
    clf.fit(X_train,y_train)
    pred = clf.predict(X_test)
    train_pred = clf.predict(X_train)
    train_acc = accuracy_score(y_train,train_pred)
    test_acc = accuracy_score(y_test,pred)
    test_df['pred'] = pred
    train_df['pred'] = train_pred
    if return_all:
        res_dict = {
            "test_pred": pred,
            "train_pred": train_pred,
            "merge_df_tt": merge_df_tt,
            "train_df": train_df,
            "test_df": test_df
        }
        return res_dict
    
    return train_acc, test_acc

def parameter_search_classifier(X_train,y_train):
    random_grid = get_random_search_parameters()
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=3, random_state=666, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)
    
    best_params = rf_random.best_params_
    return best_params


def get_confusion_matrix(test_df,meta_idx, pred,classes_arr=None,return_count=False,label_col = 'kmeans_label', normalize = 'index', rows_as = 'label',split_tt=True,
                        res_data = None):
    """
    test_df: DataFrame in the shape of (N,c) where N is the number of samples the prediciton was made on. Needs to have columns label_col
    pred: An array of a classifier prediction over which the confusion matrix will be. Need to be in shape (N,)
    classes_arr: An Array with the classes in the classification. The order the classes will be, is the order in the output table
    normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
    Normalize by dividing all values by the sum of values.
    """
#     if classes_arr is None:
#         classes_arr = ['onemonth', 'twomonth','fourmonth', 'sixmonth', 'ninemonth' , 'oneyear']
    label_col = 'symptoms'
    if res_data is None:
        res = get_classifier_accuracy(clf, test_df, meta_idx, y=label_col,train_ratio=train_ratio, split_tt=split_tt, return_all=True)
    else:
        res = res_data
    test_pred = res['test_pred']
    test_df = res['test_df']

    # ct,cnt = get_confusion_matrix(test_df,test_df.pred,test_df[label_col].unique().tolist(), label_col=label_col, return_count=True, normalize = 'index')
    pred = test_df.pred
    classes_arr = test_df[label_col].unique().tolist() if classes_arr is None else classes_arr

    df_mapping = pd.DataFrame({label_col: classes_arr})
    sort_mapping = df_mapping.reset_index().set_index(label_col)

    pred_merge_df = test_df.copy()
    pred_merge_df['pred_names'] = pred
    pred_merge_df['label_num'] = pred_merge_df[label_col].map(sort_mapping['index'])
    #     pred_merge_df['kmeans_label'].unique(),pred_merge_df['sample_time_num'].unique()
    pred_merge_df['pred_names_num'] = pred_merge_df['pred_names'].map(sort_mapping['index'])

    label_s = pd.Series(pred_merge_df.label_num, name='label')
    pred_s = pd.Series(pred_merge_df.pred_names_num, name='pred')
    
    if rows_as == 'predict':
        ct = pd.crosstab(pred_s, label_s, normalize=normalize)
        cnt = pd.crosstab(pred_s, label_s)

        mapping_dict = df_mapping.to_dict()[label_col]
        ct = ct.rename(columns=mapping_dict, index=mapping_dict)
        cnt = cnt.rename(columns=mapping_dict, index=mapping_dict)


        # If prediction is missing values, add them here with 0 for count and success
        missing_rows = list(set(classes_arr) - set(cnt.index))
        for missing_row in missing_rows:
            empty_row = pd.Series({k:0 for k in ct.columns.tolist()},name=missing_row)
            ct = ct.append(empty_row, ignore_index=False)
            cnt = cnt.append(empty_row, ignore_index=False)

        
    elif rows_as == 'label':
        ct = pd.crosstab(label_s, pred_s, normalize=normalize)
        cnt = pd.crosstab(label_s, pred_s)
        
        mapping_dict = df_mapping.to_dict()[label_col]
        ct = ct.rename(columns=mapping_dict, index=mapping_dict)
        cnt = cnt.rename(columns=mapping_dict, index=mapping_dict)

        missing_cols = list(set(classes_arr) - set(cnt.columns))
        # If prediction is missing values, add them here with 0 for count and success
        
        for missing_col in missing_cols:
            ct = ct.assign(**{missing_col:0})
            cnt = cnt.assign(**{missing_col:0})
#             empty_row = pd.Series({k:0 for k in ct.columns.tolist()},name=missing_col)
#             ct = ct.append(empty_row, ignore_index=False)
#             cnt = cnt.append(empty_row, ignore_index=False)


    else:
        raise ValueError(f"Rows as parameter got an unknown argument. possibilities are predict \ label but got {rows_as}")
    ct = ct.loc[sorted(ct, key =classes_arr.index ),sorted(ct, key =classes_arr.index )]
    cnt = cnt.loc[sorted(cnt, key =classes_arr.index ),sorted(cnt, key =classes_arr.index )]
    return ct,cnt

def get_classification_random_function(merged_training_df,label_name='sample_time',n_bins=50):
    
    X = merged_training_df.iloc[:,:meta_idx].values
    counts = merged_training_df[label_name].value_counts()
    values = counts.values
    bins = counts.index
#     values, bins = np.histogram(label, bins=len(np.unique(label))-1)
    prob = values/np.sum(values)
#     import ipdb;ipdb.set_trace()
    return lambda size: np.random.choice(bins,size=size,p=prob)

def cat_to_bin(classes_arr):
    return lambda x: 'onemonth' if classes_arr.index(x) < 3 else 'oneyear'

def get_clustered_classification(merge_df,n_clusters,labels=None,remove_sick=True):
    assert labels is None or len(labels) >= n_clusters, "Labels doesn't match the number of wanted clusters"
    
    if remove_sick:
        merge_df = merge_df[merge_df.sample_time != 'sick']
    
    X = merge_df.visit_age_mo.to_numpy().reshape(-1,1)
    
    
    kmeans = KMeans(n_clusters=n_clusters, random_state=666).fit(X)
    args_res = np.argsort(kmeans.cluster_centers_.reshape(-1))
    
    if labels is not None:
        labels = list(map(lambda x: labels[np.where(args_res == x)[0][0]] ,kmeans.labels_))
    else:
        labels = kmeans.labels_
    
    merge_df = merge_df.assign(kmeans_label=labels,kmean_num_label=kmeans.labels_)
    return merge_df

def get_avg_classification_random_accuracy(merge_df,n_runs:int, label_name='kmean_num_label',split_control=False,train_ratio=0.8,verbose=True):
    acc_arr = np.empty((n_runs,2))
    for i in tqdm(range(n_runs)):
        acc_arr[i] = get_classification_random_accuracy(merge_df,label_name, split_control, train_ratio, verbose=False)
    train_avg, test_avg = np.average(acc_arr,axis=0)
    
    return train_avg, test_avg
    
def get_classification_random_accuracy(merge_df,label_name='kmean_num_label',split_control=False,train_ratio=0.8,verbose=True):
    """
    merge_df = dataframe with a column `label_name` that contains the label we want to build the random statistics on
    """
    merge_df_tt = get_merged_tt_df(merge_df,split_control=False, train_ratio=0.8)
    df = merge_df_tt[merge_df_tt.tt == 'train']
    random_predict_func = get_classification_random_function(df,label_name=label_name)
    random_predict = random_predict_func(len(df))
    train_acc = accuracy_score(random_predict,df[label_name])
    
    df = merge_df_tt[merge_df_tt.tt == 'test']
    random_predict = random_predict_func(len(df))
    test_acc = accuracy_score(random_predict,df[label_name])
    if verbose:
        print("Random accuracy")
        print(f"Random train Accuracy {train_acc}")
        print(f"Random test Accuracy {test_acc}")
    return train_acc,test_acc

def plot_classification_scatter(clf, test_df, meta_idx, labels_mapping):

    
    X_test = test_df.iloc[:,:meta_idx].values
    test_pred = clf.predict(X_test)
    pred_proba = clf.predict_proba(X_test)
    max_proba =np.max(pred_proba,axis=1)
    test_df['max_proba'] = max_proba
    test_df['pred'] = test_pred
# px.strip(test_df,x='kmeans_label',y='max_proba')



    test_df.pred = test_df.pred.map(lambda x : labels_mapping[x])
    test_df.kmeans_label = test_df.kmeans_label.map(lambda x : labels_mapping[x])
    fig = px.strip(test_df,x='pred',y='max_proba',color='kmeans_label',custom_data=['sampleID','symptoms','visit_age_mo'],
                  category_orders={'pred':list(labels_mapping.values())}
                  )

    fig.update_traces(
        hovertemplate="<br>".join([
            "predict %{x}, probability: %{y}",
            "sampleID: %{customdata[0]}",
            "symptom: %{customdata[1]}",
            "real age: %{customdata[2]}"
        ])
    )
    return fig

DEFAULT_ST_SORTED_ARRAY = ['onemonth', 'twomonth', 'fourmonth', 'sixmonth', 'ninemonth', 'oneyear']

### Trainig the classification model

<br></br>
After figuring out the data structure we need for training, we will want to train our data. Prior to this notebook, I ran a random features selection to find the best parameters to train the model. 
```json
{'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
#  'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}
```

<br></br>
We will first git the avg accuracy of the model over 6 categories.

In [73]:
best_clf_params = {'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
#  'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

best_params = {'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [15]:
clf = RandomForestClassifier(**best_clf_params)
no_sick_df = merge_df[merge_df.sample_time != 'sick']
get_avg_accuracy(clf,no_sick_df,meta_idx,num_runs=20,y='sample_time',train_ratio=0.8)

100%|██████████| 20/20 [01:27<00:00,  4.38s/it]

Train AVG accuracy 0.9920793712823646
Test AVG accuracy 0.4002288922062644





(0.9920793712823646, 0.4002288922062644)

### Fixing data using Clustering 
Another try was to train a classification model. The data is categorized with the column `sample_time` which divides the samples to "periods" of when the sample was taken (initial, onemonth ... ) 
<br></br>

**The issue**:
There is a category of `sick`. Subjects that came for the test because they were sick and not according to the timeline. To avoid the issues that can be caused, we will start without those examples

In [159]:
no_sick_df = merge_df[merge_df.sample_time != 'sick']
fig = px.box(no_sick_df,x='sample_time',y='visit_age_mo',points='all')
fig.update_layout(
font=dict(
        size=20,
    ))
fig

**Data Issue**:

In the plot above the x-axis are the categories and the y-axis is the actuall `visit_age_mo` of the samples. 

<br></br>
We can see that there are some overlaps between the different categories. In order to solve this will use `kmeans` algorithm to cluster points together according to the `visit_age_mo` and train the classification model on those categories.

In [23]:

n_clusters = 8
labels = merge_df.sample_time.unique()
no_sick_df = get_clustered_classification(merge_df,n_clusters = n_clusters, labels=labels,remove_sick=True)


clf = RandomForestClassifier(**best_clf_params)
acc = get_avg_accuracy(clf,no_sick_df,meta_idx,num_runs=20,y='kmeans_label',train_ratio=0.8)
print(f"Accuracy for {n_clusters} clusters = {acc}")

px.box(no_sick_df,x='kmeans_label',y='visit_age_mo',points='all')

100%|██████████| 20/20 [01:29<00:00,  4.47s/it]

Train AVG accuracy 0.9762786111366191
Test AVG accuracy 0.43605968149536906
Accuracy for 8 = (0.9762786111366191, 0.43605968149536906)





#### Merging first categories
We will merge the first 3 categories together 


##### 6 Categories

In [102]:
n_clusters = 6
labels = ['onemonth', 'twomonth', 'fourmonth', 'sixmonth', 'ninemonth',
       'oneyear']
no_sick_df = get_clustered_classification(merge_df,n_clusters = n_clusters, labels=labels,remove_sick=False)

best_clf_params = {'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

clf = RandomForestClassifier(**best_clf_params)
acc = get_avg_accuracy(clf,no_sick_df,meta_idx,num_runs=1,y='kmeans_label',train_ratio=0.75)
print(f"Accuracy for {n_clusters} clusters = {acc}")

print()
get_classification_random_accuracy(no_sick_df)


print("\n",no_sick_df.kmeans_label.value_counts())
fig = px.box(no_sick_df,x='kmeans_label',y='visit_age_mo',points='all')
fig.update_layout(
font=dict(
        size=20,
    ))
fig

100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


Train AVG accuracy 0.971900826446281
Test AVG accuracy 0.46710526315789475
Accuracy for 6 clusters = (0.971900826446281, 0.46710526315789475)

Random accuracy
Random train Accuracy 0.16802168021680217
Random test Accuracy 0.2345132743362832

 onemonth     315
twomonth     171
ninemonth    130
fourmonth    130
sixmonth     125
oneyear       93
Name: kmeans_label, dtype: int64


In [107]:
merge_df[merge_df.symptoms!='Control'].shape, merge_df.shape

((542, 516), (964, 516))

In [76]:
num_labels = ['0-1','2','4','6','9','12']
labels_mapping = {k:f"{k}({num_labels[i]})" for i,k in enumerate(labels)}
test_df = no_sick_df[no_sick_df.tt == 'test']
# test_df['orig_label'] = test_df.kmeans_label.copy()
fig = plot_classification_scatter(clf, test_df,meta_idx,labels_mapping)
print("Test pred values\n", test_df.pred.value_counts())
print("\nTest labels values\n", test_df.kmeans_label.value_counts())
fig

Test pred values
 onemonth(0-1)    125
ninemonth(9)      49
twomonth(2)       44
oneyear(12)       24
sixmonth(6)       17
fourmonth(4)      13
Name: pred, dtype: int64

Test labels values
 onemonth(0-1)    76
sixmonth(6)      54
fourmonth(4)     43
twomonth(2)      43
ninemonth(9)     36
oneyear(12)      20
Name: kmeans_label, dtype: int64


In [79]:
#Create a confusion matrix
# test_df.kmeans_label = test_df.orig_label.copy()
new_labels = list(labels_mapping.values())
ct,cnt = get_confusion_matrix(test_df,meta_idx, test_df.pred,new_labels,return_count=True, normalize = 'index')

label_mapping_r = {v:k for k,v in labels_mapping.items()}
ct = ct.rename(label_mapping_r,axis=0)
ct = ct.rename(label_mapping_r,axis=1)

cnt = cnt.rename(label_mapping_r,axis=0)
cnt = cnt.rename(label_mapping_r,axis=1)

c = 'brwnyl'
ct = ct.round(3)
fig = ff.create_annotated_heatmap(ct.to_numpy().T,x=ct.columns.tolist(),y=ct.columns.tolist(),colorscale=c)
fig.update_layout(title_text='Confusion Table - Label(rows)/Pred(cols)',font=dict(size=18))
fig['layout']['xaxis']['side'] = 'bottom'
fig

In [81]:
c = 'brwnyl'
fig = ff.create_annotated_heatmap(cnt.to_numpy().T,x=cnt.columns.tolist(),y=cnt.columns.tolist(),colorscale=c)
fig.update_layout(title_text='Confusion Table - Label(rows)/Pred(cols)', font=dict(size=18))
fig['layout']['xaxis']['side'] = 'bottom'
fig

##### 3 categories

In [82]:
n_clusters = 3
# labels = merge_df.sample_time.unique()[2:]
labels = ['initial','middle','last']
no_sick_df = get_clustered_classification(merge_df,n_clusters = n_clusters, labels=labels,remove_sick=False)


clf = RandomForestClassifier(**best_clf_params)
acc = get_avg_accuracy(clf,no_sick_df,meta_idx,num_runs=20,y='kmeans_label',train_ratio=0.7,copy_df=False)
print(f"Accuracy for {n_clusters} clusters = {acc}")

print()
get_classification_random_accuracy(no_sick_df)


print("\n",no_sick_df.kmeans_label.value_counts())
px.box(no_sick_df,x='kmeans_label',y='visit_age_mo',points='all')

100%|██████████| 20/20 [00:54<00:00,  2.72s/it]


Train AVG accuracy 0.9844584689841838
Test AVG accuracy 0.7710455301166569
Accuracy for 3 clusters = (0.9844584689841838, 0.7710455301166569)

Random accuracy
Random train Accuracy 0.38996138996138996
Random test Accuracy 0.35071090047393366

 initial    499
middle     261
last       228
Name: kmeans_label, dtype: int64


In [83]:
num_labels = ['0-2','3-7','8-14']
labels_mapping = {k:f"{k}({num_labels[i]})" for i,k in enumerate(labels)}
test_df = no_sick_df[no_sick_df.tt == 'test']
fig = plot_classification_scatter(test_df,meta_idx,labels_mapping)
print("Test pred values\n", test_df.pred.value_counts())
print("\nTest labels values\n", test_df.kmeans_label.value_counts())
fig

Test pred values
 initial(0-2)    155
last(8-14)       76
middle(3-7)      48
Name: pred, dtype: int64

Test labels values
 initial(0-2)    130
middle(3-7)      76
last(8-14)       73
Name: kmeans_label, dtype: int64


In [84]:
#Create a confusion matrix
# test_df.kmeans_label = test_df.orig_label.copy()
new_labels = list(labels_mapping.values())
ct,cnt = get_confusion_matrix(test_df,meta_idx, test_df.pred,new_labels,return_count=True, normalize = 'index')

label_mapping_r = {v:k for k,v in labels_mapping.items()}
ct = ct.rename(label_mapping_r,axis=0)
ct = ct.rename(label_mapping_r,axis=1)

cnt = cnt.rename(label_mapping_r,axis=0)
cnt = cnt.rename(label_mapping_r,axis=1)

c = 'brwnyl'
ct = ct.round(3)
fig = ff.create_annotated_heatmap(ct.to_numpy().T,x=ct.columns.tolist(),y=ct.columns.tolist(),colorscale=c)
fig.update_layout(title_text='Confusion Table - Label(rows)/Pred(cols)',font=dict(size=18))
fig['layout']['xaxis']['side'] = 'bottom'
fig

In [85]:
c = 'brwnyl'
fig = ff.create_annotated_heatmap(cnt.to_numpy().T,x=cnt.columns.tolist(),y=cnt.columns.tolist(),colorscale=c)
fig.update_layout(title_text='Confusion Table - Label(rows)/Pred(cols)', font=dict(size=18))
fig['layout']['xaxis']['side'] = 'bottom'
fig

### Taking Best Features
We can see that the average accuracy is about 54% 
We will now check accuracy and get best features for different `k` (different num of categories)

**Taking best features from k=3 gruops**

In [113]:
best_params = {'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

n_clusters = 3
# labels = merge_df.sample_time.unique()[2:]
labels = ['initial','middle','last']
no_sick_df = get_clustered_classification(merge_df,n_clusters = n_clusters, labels=labels,remove_sick=False)
clf = RandomForestClassifier(**best_clf_params)

n_iters = 20
n_top = 30
acc_list = np.zeros((n_iters,2))
best_ind_list = np.empty((n_iters,n_top))
i = 0
for i in range(n_iters):
    acc_list[i] = get_avg_accuracy(clf,no_sick_df,meta_idx,num_runs=1,y='kmeans_label',train_ratio=0.7,copy_df=False, verbose = False)
#     get_classifier_accuracy(clf,no_sick_df,meta_idx,y='kmeans_label',train_ratio=0.8)
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    best_ind_list[i] = indices[:n_top]


100%|██████████| 1/1 [00:03<00:00,  3.16s/it]
100%|██████████| 1/1 [00:02<00:00,  2.83s/it]
100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
100%|██████████| 1/1 [00:02<00:00,  2.88s/it]
100%|██████████| 1/1 [00:02<00:00,  2.99s/it]
100%|██████████| 1/1 [00:02<00:00,  2.77s/it]
100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
100%|██████████| 1/1 [00:02<00:00,  2.89s/it]
100%|██████████| 1/1 [00:02<00:00,  2.67s/it]
100%|██████████| 1/1 [00:02<00:00,  2.64s/it]
100%|██████████| 1/1 [00:02<00:00,  2.88s/it]
100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
100%|██████████| 1/1 [00:02<00:00,  2.79s/it]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:02<00:00,  2.78s/it]
100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
100%|██████████| 1/1 [00:02<00:00,  2.86s/it]
100%|██████████| 1/1 [00:02<00:00,  2.77s/it]
100%|██████████| 1/1 [00:02<00:00,  2.65s/it]
100%|██████████| 1/1 [00:02<00:00,  2.64s/it]


In [151]:
best_ind_list.shape

(20, 30)

In [126]:
# best_regr_features = np.array(best_features)[:,:n_top]
# best_regr_features = best_ind_list
clf_unique, clf_count = np.unique(best_ind_list,return_counts=True)
top_cls_repeat_features = clf_unique[clf_count>=20]
print(f"Num of features that repeated at least 20 times {top_cls_repeat_features.shape}")
df = pd.DataFrame({"clf_unique":clf_unique,'clf_count':clf_count,'above_threshold':False})
df.above_threshold[df.clf_count>=20] = True
px.bar(df, x='clf_unique',y='clf_count',color='clf_count',color_continuous_scale='Bluered_r')
# cls_ind_ndarr = np.array(best_ind_list)
# cls_unique, cls_count = np.unique(cls_ind_ndarr,return_counts=True)
# px.bar(x=cls_unique,y=cls_count)

Num of features that repeated at least 20 times (18,)


In [136]:
clf_unique

array([  0.,   4.,  17.,  28.,  55.,  69.,  71.,  85., 169., 181., 183.,
       204., 212., 220., 226., 232., 233., 240., 243., 245., 246., 249.,
       253., 257., 259., 260., 264., 269., 273., 274., 276., 285., 291.,
       304., 310., 314., 320., 377., 409., 420., 429., 453., 487.])

In [165]:
pd.set_option('display.max_colwidth', None)
features_scores_d = prepare_features_pos_score(best_ind_list)
df = pd.DataFrame({"feature":no_sick_df.iloc[:,top_cls_repeat_features].columns.tolist(),"feature_num":top_cls_repeat_features})
df['pos_score'] = df.feature_num.map(lambda x : features_scores_d[x])
df.sort_values(['pos_score'],inplace=True)
df.pos_score = (df.pos_score/df.pos_score.max())*n_top
sorted_cls_features = df.feature_num.tolist()
df[['feature','pos_score']].reset_index(drop=True)

Unnamed: 0,feature,pos_score
0,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;__,0.0
1,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae;g__;s__,2.383178
2,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;__,4.906542
3,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__;s__,5.607477
4,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__,5.67757
5,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__gnavus,5.88785
6,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus;s__,8.971963
7,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Eggerthella;s__lenta,9.953271
8,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Eubacterium];s__dolichum,11.14486
9,k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__;s__,11.845794


### Evaluating the Classification model

In order to Evaluate our model, we will create a confusion matrix, where the rows are the label and columns are the predictions

In [25]:
classes_arr = DEFAULT_ST_SORTED_ARRAY
no_sick_df = merge_df[merge_df.sample_time != 'sick']
merge_df_tt = get_merged_tt_df(no_sick_df,split_control=False,train_ratio=0.8)

# Get new labels
X = merge_df_tt.visit_age_mo.to_numpy().reshape(-1,1)
labels = merge_df_tt.sample_time.unique()
kmeans = KMeans(n_clusters=6, random_state=666).fit(X)
args_res = np.argsort(kmeans.cluster_centers_.reshape(-1))
labels = list(map(lambda x: classes_arr[np.where(args_res == x)[0][0]] ,kmeans.labels_))
# labels = list(map(lambda x: np.where(args_res == x)[0][0] ,kmeans.labels_))
merge_df_tt = merge_df_tt.assign(kmeans_label=labels)

clf = RandomForestClassifier(**best_clf_params)
# pred_merge_df = merge_df_tt.loc[merge_df_tt.tt == 'test'].copy()
train_df = merge_df_tt[merge_df_tt.tt == 'train']
X = train_df.iloc[:,:meta_idx].values
y = train_df.kmeans_label.values
clf.fit(X,y)

test_df = merge_df_tt[merge_df_tt.tt == 'test']
X_test = test_df.iloc[:,:meta_idx].values
y_test = test_df.kmeans_label.values
pred = clf.predict(X_test)
N_classes = len(clf.classes_)


print(f"Got test accuracy {accuracy_score(y_test,pred)}")






Got test accuracy 0.49640287769784175


**Split to 2 categories**

We will now split to 2 categories. We can see below that kmeans split the data with median of onemonths for first category and median of 9month for second category. the seperation is about 6 month so this will be our thershold

In [33]:
no_sick_df = merge_df[merge_df.sample_time != 'sick']
X = no_sick_df.visit_age_mo.to_numpy().reshape(-1,1)

labels = no_sick_df.sample_time.unique()
kmeans = KMeans(n_clusters=3, random_state=666).fit(X)
args_res = np.argsort(kmeans.cluster_centers_.reshape(-1))
labels = list(map(lambda x: np.where(args_res == x)[0][0] ,kmeans.labels_))
# labels = list(map(lambda x: np.where(args_res == x)[0][0] ,kmeans.labels_))
no_sick_df = no_sick_df.assign(kmeans_label=labels)

px.box(no_sick_df,x='kmeans_label',y='visit_age_mo',points='all')

In [48]:
two_cls_pred = list(map(cat_to_bin(classes_arr), pred))
two_cls_y_test = list(map(cat_to_bin(classes_arr), y_test))
print(f"Got test accuracy {accuracy_score(two_cls_y_test, two_cls_pred)}")

# test_df = pd.DataFrame().assign(pred=pred,bin_pred=two_cls_pred,bin_label=two_cls_y_test) #type: pd.DataFrame
pred_df = pd.DataFrame().assign(pred=pred,bin_pred=two_cls_pred,bin_label=two_cls_y_test) #type: pd.DataFrame
t = pred_df.melt(id_vars=['bin_label'],value_vars=['pred'],value_name='predict')
tbl = t.groupby(['bin_label','predict'])['variable'].agg('count').unstack().fillna(0)
col_map = {n: f'{n} ({cat_to_bin(classes_arr)(n)})' for n in tbl.columns}
tbl = tbl.reindex(DEFAULT_ST_SORTED_ARRAY,axis=1)
tbl = tbl.rename(columns=col_map)

print(tbl.div(tbl.sum(axis=1), axis=0))
print("\n\n")
print(tbl.astype(int))

Got test accuracy 0.8776978417266187
predict    onemonth (onemonth)  twomonth (onemonth)  fourmonth (onemonth)  sixmonth (oneyear)  ninemonth (oneyear)  oneyear (oneyear)
bin_label                                                                                                                            
onemonth              0.808989             0.044944              0.089888            0.011236             0.044944                0.0
oneyear               0.080000             0.060000              0.100000            0.100000             0.360000                0.3



predict    onemonth (onemonth)  twomonth (onemonth)  fourmonth (onemonth)  sixmonth (oneyear)  ninemonth (oneyear)  oneyear (oneyear)
bin_label                                                                                                                            
onemonth                    72                    4                     8                   1                    4                  0
oneyear               

## Results

In [258]:
best_features = set(top_regr_repeat_features) & set(top_clf_repeat_features)
print(len(best_features),best_features)

15 {420, 260, 69, 232, 169, 264, 240, 273, 274, 304, 85, 246, 249, 314, 253}


In [526]:
best_features = top_clf_repeat_features

array([ 17,  28,  55,  69,  85, 169, 204, 212, 220, 226, 232, 233, 240,
       243, 246, 249, 253, 260, 264, 269, 273, 274, 276, 291, 304, 314,
       409, 420, 453])

We have found 15 features that appear in top 20 features for at least 50% the models in the regression and classifications models
Though the regression model was far inferior to the classification model (in regards to the accuracy we got), it looks like we got a lot of the same features to help us in the tasks.

From the classification confusion matrix we can see some interesting things:
<li> The "edges" was better predicted. onemonth and oneyear got 61% predictions while 2-9 month got between 30-45%
<li> While there are some errors with big margins, most errors are still close to the prediction. For example for label twomonth the prediction are equaly divided between 2-6 month, but no prediction on onemonth,ninemonth and oneyear.
    

# AP Case prediction

In [74]:
def get_cls_prediction(clf, df, meta_idx, y_label, tt='test'):
    test_df = df[df.tt == tt]
# test_df['orig_label'] = test_df.kmeans_label.copy()

    X_test = test_df.iloc[:, :meta_idx].values
    y_test = test_df[y_label].values
    test_pred = clf.predict(X_test)
    pred_proba = clf.predict_proba(X_test)
    max_proba = np.max(pred_proba, axis=1)
    test_df['max_proba'] = max_proba
    test_df['pred'] = test_pred
#     test_df['y'] = y_test
#     test_df['X'] = X_test
    return test_df, X_test, y_test

def get_avg_tables(n_runs,merge_df, filtered_merge_df, meta_idx, filtered_meta_idx, best_params, filtered_params, *args,**kwargs):
    filtered_merge_df, merge_df, clf, filtered_clf = calc_is_sick(merge_df, filtered_merge_df,meta_idx, filtered_meta_idx, best_params, filtered_params, verbose=False)
    
    precentage_list = list()
    count_list = list()
    filt_precentage_list = list()
    filt_count_list = list()
    for i in range(n_runs):
        precentage_np, count_np = get_labeled_predicion(clf=clf, merge_df=merge_df,meta_idx=meta_idx, **kwargs, return_tables=True, verbose=False)
        precentage_list.append(precentage_np)
        count_list.append(count_np)
        
        precentage_np, count_np = get_labeled_predicion(clf=filtered_clf, merge_df=filtered_merge_df, meta_idx=filtered_meta_idx, **kwargs, return_tables=True, verbose=False)
        filt_precentage_list.append(precentage_np)
        filt_count_list.append(count_np)
        
    precentage_avg = np.average(np.stack(precentage_list),axis=0)
    count_avg = np.average(np.stack(count_list),axis=0)
    
    filt_precentage_avg = np.average(np.stack(filt_precentage_list),axis=0)
    filt_count_avg = np.average(np.stack(filt_count_list),axis=0)
    return precentage_avg, count_avg, filt_precentage_avg, filt_count_avg
        
def get_labeled_predicion(clf, merge_df, meta_idx, y_label, pivot_label, tt='test', norm_axis=1, round_n=3, name='', return_tables=False, verbose=True,sorted_index=None):
    
    test_df, X_test, y_test = get_cls_prediction(
        clf, merge_df, meta_idx, y_label=y_label, tt=tt)
    pred = test_df.pred.values
    label = test_df[pivot_label]

    two_cls_pred = [str(p) for p in pred]
    two_cls_y_test = [str(p) for p in label]
#     print(f"Got test accuracy {accuracy_score(two_cls_y_test, two_cls_pred)}")

    # test_df = pd.DataFrame().assign(pred=pred,bin_pred=two_cls_pred,bin_label=two_cls_y_test) #type: pd.DataFrame
    pred_df = pd.DataFrame().assign(pred=pred, bin_pred=two_cls_pred,
                                    bin_label=two_cls_y_test, label=y_test)  # type: pd.DataFrame
    t = pred_df.melt(id_vars=['bin_label', 'label'],
                     value_vars=['pred'], value_name='predict')
    tbl = t.groupby(['bin_label', 'predict'])[
        'variable'].agg('count').unstack().fillna(0)

    percentage_tbl = tbl.div(tbl.sum(axis=norm_axis), axis=1-norm_axis)
    
    #dviri
    if sorted_index is not None:
        percentage_tbl.sort_index(key=lambda x: [sorted_index.index(val) for val in x],inplace=True)
        tbl.sort_index(key=lambda x: [sorted_index.index(val) for val in x],inplace=True)
    
    precentage_np = percentage_tbl.to_numpy().round(round_n)
    count_np = tbl.to_numpy().round(round_n)
    
    if not verbose: 
        if return_tables:
            return precentage_np, count_np
        else:
            return test_df
        
    print(percentage_tbl)
    print("\n\n")
    print(tbl.astype(int))

    c = 'brwnyl'
    x = tbl.columns.astype(str).tolist()
    y = tbl.index.astype(str).tolist()
    fig_base = make_subplots(rows=2, cols=1)
    
    
    fig_percent = ff.create_annotated_heatmap(
        precentage_np, x=x, y=y, colorscale=c, zmin=0.5, zmax=1.0)
    fig_percent['layout']['xaxis']['side'] = 'top'
    fig_base.append_trace(fig_percent['data'][0], 1, 1)

    fig_cnt = ff.create_annotated_heatmap(precentage_np, annotation_text=count_np, x=x,
                                          y=y, colorscale=fig_percent.data[0]['colorscale'], zmin=0.5, zmax=1, zauto=False)

    fig_cnt['layout']['xaxis']['side'] = 'top'
    fig_base.append_trace(fig_cnt['data'][0], 2, 1)

    # Add annotation to the plot
    annot1 = list(fig_percent.layout.annotations)
    annot2 = list(fig_cnt.layout.annotations)
    for k in range(len(annot2)):
        annot2[k]['xref'] = 'x2'
        annot2[k]['yref'] = 'y2'

    fig_base.update_layout(annotations=annot1+annot2)
    fig_base['layout']['xaxis']['side'] = 'top'
    fig_base.update_layout(
        title_text=f'{name} Model Prediction - Symptoms(rows)/is_sick(cols)', font=dict(size=18))
    fig_base.show()

    return test_df


def get_classifications_params():
    
    filtered_params = {'random_state': 666,
                   'n_estimators': 1200,
                   'min_samples_split': 5,
                   'min_samples_leaf': 10,
                   'max_features': 'auto',
                   'max_depth': None,
                   'bootstrap': False}

    best_params = {'random_state': 666,
               'n_estimators': 1200,
               'min_samples_split': 20,
               'min_samples_leaf': 2,
               'max_features': 'sqrt',
               'max_depth': None,
               'bootstrap': False}
    return filtered_params, best_params

best_features = [420, 260, 69, 264, 169, 269, 240, 304, 274, 17, 85, 246, 249, 314, 220, 253]
def get_filtered_data(data_path, best_features):
    norm_merge_df,norm_meta_idx = get_gmap_data(data_path)
    
    # best_features = [420, 260, 69, 232, 169, 264, 240, 273, 274, 304, 85, 246, 249, 314, 253]
    # best_features = [ 17,  28,  55,  69,  85, 169, 204, 212, 220, 226, 232, 233, 240,243, 246, 249, 253, 260, 264, 269, 273, 274, 276, 291, 304, 314,409, 420, 453]
    filtered_meta_idx = len(best_features)

    # best_f_df = norm_merge_df.iloc[:,best_features].reset_index(drop=True)
    best_f_df = merge_df.iloc[:,best_features].reset_index(drop=True)
    best_f_df = best_f_df.div(best_f_df.sum(axis=1),axis=0).fillna(0)  # normalize to 1 
    # best_f_df = best_f_df.loc[~(best_f_df.sum(axis=1) == 0)]
    # filtered_merge_df = filtered_merge_df.fillna(0)

    meta_df = norm_merge_df.iloc[:,norm_meta_idx:].reset_index(drop=True)
    filtered_merge_df = pd.concat([best_f_df,meta_df],axis=1)
    return filtered_merge_df, filtered_meta_idx

filtered_merge_df, filtered_meta_idx = get_filtered_data(norm_l7_path, best_features)
filtered_params, best_params = get_classifications_params()

In [None]:
# norm_merge_df,norm_meta_idx = get_gmap_data(l7_path)
norm_merge_df, norm_meta_idx = get_gmap_data(norm_l7_path)

In [18]:
best_features = [int(n) for n in (set(sorted_cls_features) & set(sorted_regress_features))]
print(best_features)

NameError: name 'sorted_cls_features' is not defined

## SVM
**SVM**

In [None]:
from sklearn import svm
svm_clf = svm.SVC(C=1.0,kernel='poly',degree=3)
# clean_df = filtered_merge_df[~filtered_merge_df.symptoms.isna()]
clean_df = merge_df[~merge_df.symptoms.isna()]
X = clean_df.iloc[:,:filtered_meta_idx].values
y = clean_df.symptoms.values

# X_test = clean_df.iloc[:,:filtered_meta_idx].values
# y = clean_df.symptoms.values
svm_clf.fit(X,y)

svm_predict = svm_clf.predict(X)
accuracy_score(y,svm_predict)

## classification on other classes

### is_ap

Predict if record_id is ap_case or not (will be symptomatic)

In [43]:
filtered_params['class_weight'] = {True:1, False:1}
best_params['class_weight'] = {True:1, False:1}

num_runs = 1
train_ratio = 0.7
merge_df_tt = get_merged_tt_df(merge_df,split_control=False,train_ratio=train_ratio)
filtered_merge_df = filtered_merge_df.assign(tt=merge_df_tt.tt)

filtered_clf = RandomForestClassifier(**best_params)
filtered_merge_df = filtered_merge_df.loc[~pd.isna(filtered_merge_df.symptoms)]
filtered_merge_df = filtered_merge_df.groupby('record_id').apply(lambda x: x.assign(is_ap= (x.symptoms == 'Symptomatic').any()).reset_index(drop=True)).reset_index(drop=True)

f_is_ap_acc = get_avg_accuracy(filtered_clf,filtered_merge_df,filtered_meta_idx,num_runs=num_runs,y='is_ap',train_ratio=train_ratio)
# f_is_ap_acc = get_classifier_accuracy(clf,filtered_merge_df,filtered_meta_idx,y='is_ap',train_ratio=0.8,split_tt=False)
print(f_is_ap_acc)

merge_df_tt = merge_df_tt.groupby('record_id').apply(lambda x: x.assign(is_ap= (x.symptoms == 'Symptomatic').any()).reset_index(drop=True)).reset_index(drop=True)
clf = RandomForestClassifier(**best_params)
is_ap_acc = get_avg_accuracy(clf,merge_df_tt,meta_idx,num_runs=num_runs,y='is_ap',train_ratio=train_ratio)
# is_ap_acc = get_classifier_accuracy(clf,merge_df_tt,meta_idx,y='is_ap',split_tt=False)
print(is_ap_acc)

print("\nRandom Accuracy")
filtered_merge_df['is_ap_str'] = filtered_merge_df.is_ap.astype(str)
get_avg_classification_random_accuracy(filtered_merge_df,num_runs, label_name='is_ap_str')


  0%|          | 0/1 [00:00<?, ?it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [126]:

test_df = filtered_merge_df[filtered_merge_df.tt == 'test']

test_pred = filtered_clf.predict(X_test)
pred_proba = filtered_clf.predict_proba(X_test)
max_proba = np.max(pred_proba, axis=1)
test_df['max_proba'] = max_proba
test_df['pred'] = test_pred

fig = px.strip(test_df, x='pred', y='max_proba', color='symptoms',
               custom_data=['sampleID', 'symptoms', 'visit_age_mo'],
               category_orders={'pred': list(labels_mapping.values())}
               )

fig.update_traces(
    hovertemplate="<br>".join([
        "predict %{x}, probability: %{y}",
        "sampleID: %{customdata[0]}",
        "symptom: %{customdata[1]}",
        "real age: %{customdata[2]}"
    ])
)
    
fig

In [None]:
test_df

### is_sick

Try to predict if current sample is symptomatic or not. There are about 22% symptomatic samples, so anything below around 80% without giving higher rate to "is_sick" is probably a model that will alsmost always say "not symptomatic"

In [75]:
lbl = ['Control', 'Pre-symptoms', 'Symptomatic','Resolved']
best_params = {'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None,
 'class_weight':{True:4, False:1},
 'bootstrap': False}

filtered_params['class_weight'] = {True:4, False:1}


def calc_is_sick(merge_df, filtered_merge_df,meta_idx, filtered_meta_idx, best_params, filtered_params, verbose=True, train_ratio = 0.7):
    
    
    num_runs = 1
    train_ratio = train_ratio
    merge_df_tt = get_merged_tt_df(merge_df,split_control=False,train_ratio=train_ratio)
    filtered_merge_df = filtered_merge_df.assign(tt=merge_df_tt.tt)
    
    filtered_clf = RandomForestClassifier(**filtered_params)
    
    filtered_merge_df = filtered_merge_df.loc[~pd.isna(filtered_merge_df.symptoms)]
    filtered_merge_df = filtered_merge_df.assign(is_sick = (filtered_merge_df.symptoms == 'Symptomatic'))
    f_is_sick_acc = get_avg_accuracy(filtered_clf,filtered_merge_df,filtered_meta_idx,num_runs=num_runs,y='is_sick',train_ratio=train_ratio,verbose=verbose)

    #On all features
    clf = RandomForestClassifier(**best_params)
    merge_df = merge_df.assign(is_sick = (merge_df.symptoms == 'Symptomatic'))
    merge_df = merge_df.loc[~pd.isna(merge_df.symptoms)]
    is_sick_acc = get_avg_accuracy(clf,merge_df,meta_idx,num_runs=num_runs,y='is_sick',train_ratio=train_ratio,verbose=verbose)

    if verbose: 
        print("\nRandom Accuracy")
        filtered_merge_df['is_sick_str'] = filtered_merge_df.is_sick.astype(str)
        res = get_avg_classification_random_accuracy(filtered_merge_df,num_runs, label_name='is_sick_str', train_ratio=train_ratio,verbose=verbose)
        print(res)
        
    return filtered_merge_df, merge_df, clf, filtered_clf

def draw_acc_count_heatmap(count_np, name, precentage_np, tbl, title=None, add_title=True, text_size = 18):
    c = 'brwnyl'
    x = tbl.columns.astype(str).tolist()
    y = tbl.index.astype(str).tolist()
    fig_base = make_subplots(rows=2, cols=1)
    zmax = precentage_np.max()
    
    fig_percent = ff.create_annotated_heatmap(
        precentage_np, x=x, y=y, colorscale=c, zmin=zmax/2, zmax=zmax)
    fig_percent['layout']['xaxis']['side'] = 'top'
    fig_base.append_trace(fig_percent['data'][0], 1, 1)
    fig_cnt = ff.create_annotated_heatmap(precentage_np, annotation_text=count_np, x=x,
                                          y=y, colorscale=fig_percent.data[0]['colorscale'], zmin=zmax/2, zmax=zmax,
                                          zauto=False)
    fig_cnt['layout']['xaxis']['side'] = 'top'
    fig_base.append_trace(fig_cnt['data'][0], 2, 1)
    # Add annotation to the plot
    annot1 = list(fig_percent.layout.annotations)
    annot2 = list(fig_cnt.layout.annotations)
    for k in range(len(annot2)):
        annot2[k]['xref'] = 'x2'
        annot2[k]['yref'] = 'y2'
    fig_base.update_layout(annotations=annot1 + annot2)
    fig_base['layout']['xaxis']['side'] = 'top'
    if add_title:
        if title is not None:
            title_text=f'{name} Model Prediction - Symptoms(rows)/is_sick(cols)'
        else:
            title_text = title
        fig_base.update_layout(title_text)
     
    fig_base.update_layout(font=dict(size=text_size))
    return fig_base

In [192]:
# best_params = {'random_state': 666,
#  'n_estimators': 1200,
#  'min_samples_split': 20,
#  'min_samples_leaf': 2,
#  'max_features': 'sqrt',
#  'max_depth': None,
#  'class_weight':{True:9, False:1},
#  'bootstrap': False}

# num_runs = 1
# train_ratio = 0.7

# filtered_params['class_weight'] = {True:9, False:1}
# merge_df_tt = get_merged_tt_df(merge_df,split_control=False,train_ratio=train_ratio)
# filtered_merge_df = filtered_merge_df.assign(tt=merge_df_tt.tt)


# filtered_clf = RandomForestClassifier(**filtered_params)
# filtered_merge_df = filtered_merge_df.loc[~pd.isna(filtered_merge_df.symptoms)]
# filtered_merge_df = filtered_merge_df.assign(is_sick = (filtered_merge_df.symptoms == 'Symptomatic'))
# f_is_sick_acc = get_avg_accuracy(filtered_clf,filtered_merge_df,filtered_meta_idx,num_runs=num_runs,y='is_sick',train_ratio=train_ratio)

# #On all features
# clf = RandomForestClassifier(**best_params)
# merge_df = merge_df.assign(is_sick = (merge_df.symptoms == 'Symptomatic'))
# merge_df = merge_df.loc[~pd.isna(merge_df.symptoms)]
# is_sick_acc = get_avg_accuracy(clf,merge_df,meta_idx,num_runs=num_runs,y='is_sick',train_ratio=train_ratio)


# print("\nRandom Accuracy")
# filtered_merge_df['is_sick_str'] = filtered_merge_df.is_sick.astype(str)
# get_avg_classification_random_accuracy(filtered_merge_df,num_runs, label_name='is_sick_str', train_ratio=train_ratio)



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:02<00:00,  2.40s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Train AVG accuracy 0.7271386430678466
Test AVG accuracy 0.5229007633587787



100%|██████████| 1/1 [00:02<00:00,  2.88s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Train AVG accuracy 0.9836065573770492
Test AVG accuracy 0.7625418060200669

Random Accuracy



100%|██████████| 1/1 [00:00<00:00,  4.81it/s][A


(0.6585956416464891, 0.6304347826086957)

In [74]:
# # clf, merge_df, meta_idx, y_label, pivot_label
# precentage_avg, count_avg, filt_precentage_avg, filt_count_avg = get_avg_tables(
#     2, merge_df, filtered_merge_df,meta_idx, filtered_meta_idx,  best_params, filtered_params, 
#     y_label = 'is_sick',
#     pivot_label = 'symptoms', norm_axis=1, round_n=2,
#     name='Filtered')

NameError: name 'y_label' is not defined

In [53]:
filtered_params['class_weight'] = {True:2000, False:1}
best_params['class_weight'] = {True:2000, False:1}
filtered_merge_df, merge_df, clf, filtered_clf = calc_is_sick(merge_df, filtered_merge_df,meta_idx, filtered_meta_idx, best_params, filtered_params, verbose=True)

100%|██████████| 1/1 [00:02<00:00,  2.43s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Train AVG accuracy 0.4785185185185185
Test AVG accuracy 0.43018867924528303


100%|██████████| 1/1 [00:07<00:00,  7.61s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Train AVG accuracy 0.8984485190409027
Test AVG accuracy 0.68

Random Accuracy


100%|██████████| 1/1 [00:00<00:00,  4.96it/s]

(0.6360153256704981, 0.6574585635359116)





In [49]:
#DVIRI

filtered_test_df = get_labeled_predicion(filtered_clf, filtered_merge_df,
                                         filtered_meta_idx, y_label = 'is_sick',
                                         pivot_label = 'symptoms', norm_axis=1, round_n=2,
                                        name='Filtered',sorted_index=lbl)

predict          False     True 
bin_label                       
Control       0.231343  0.768657
Pre-symptoms  0.000000  1.000000
Symptomatic   0.105263  0.894737
Resolved      0.545455  0.454545



predict       False  True 
bin_label                 
Control          31    103
Pre-symptoms      0     20
Symptomatic       4     34
Resolved         42     35


In [29]:
# Draw the table with full code
# lbl = ['Control', 'Pre-symptoms', 'Symptomatic','Resolved']
# clf=filtered_clf
# merge_df=filtered_merge_df
# meta_idx=filtered_meta_idx
# y_label='is_sick'
# pivot_label='symptoms'
# tt='test'
# norm_axis=1
# round_n=2
# name='Filtered'
# return_tables=False
# verbose=True
# sorted_index=lbl


# test_df, X_test, y_test = get_cls_prediction(
#     clf, merge_df, meta_idx, y_label=y_label, tt=tt)
# pred = test_df.pred.values
# label = test_df[pivot_label]

# two_cls_pred = [str(p) for p in pred]
# two_cls_y_test = [str(p) for p in label]
# #     print(f"Got test accuracy {accuracy_score(two_cls_y_test, two_cls_pred)}")

# # test_df = pd.DataFrame().assign(pred=pred,bin_pred=two_cls_pred,bin_label=two_cls_y_test) #type: pd.DataFrame
# pred_df = pd.DataFrame().assign(pred=pred, bin_pred=two_cls_pred,
#                                 bin_label=two_cls_y_test, label=y_test)  # type: pd.DataFrame
# t = pred_df.melt(id_vars=['bin_label', 'label'],
#                  value_vars=['pred'], value_name='predict')
# tbl = t.groupby(['bin_label', 'predict'])[
#     'variable'].agg('count').unstack().fillna(0)

# percentage_tbl = tbl.div(tbl.sum(axis=norm_axis), axis=1-norm_axis)

# #dviri
# if sorted_index is not None:
#     percentage_tbl.sort_index(key=lambda x: [sorted_index.index(val) for val in x],inplace=True)
#     tbl.sort_index(key=lambda x: [sorted_index.index(val) for val in x],inplace=True)

# precentage_np = percentage_tbl.to_numpy().round(round_n)
# count_np = tbl.to_numpy().round(round_n)

# # if not verbose: 
# #     if return_tables:
# #         return precentage_np, count_np
# #     else:
# #         return test_df

# print(percentage_tbl)
# print("\n\n")
# print(tbl.astype(int))

# c = 'brwnyl'
# x = tbl.columns.astype(str).tolist()
# y = tbl.index.astype(str).tolist()
# fig_base = make_subplots(rows=2, cols=1)


# fig_percent = ff.create_annotated_heatmap(
#     precentage_np, x=x, y=y, colorscale=c, zmin=0.5, zmax=1.0)
# fig_percent['layout']['xaxis']['side'] = 'top'
# fig_base.append_trace(fig_percent['data'][0], 1, 1)

# fig_cnt = ff.create_annotated_heatmap(precentage_np, annotation_text=count_np, x=x,
#                                       y=y, colorscale=fig_percent.data[0]['colorscale'], zmin=0.5, zmax=1, zauto=False)

# fig_cnt['layout']['xaxis']['side'] = 'top'
# fig_base.append_trace(fig_cnt['data'][0], 2, 1)

# # Add annotation to the plot
# annot1 = list(fig_percent.layout.annotations)
# annot2 = list(fig_cnt.layout.annotations)
# for k in range(len(annot2)):
#     annot2[k]['xref'] = 'x2'
#     annot2[k]['yref'] = 'y2'

# fig_base.update_layout(annotations=annot1+annot2)
# fig_base['layout']['xaxis']['side'] = 'top'
# fig_base.update_layout(
#     title_text=f'{name} Model Prediction - Symptoms(rows)/is_sick(cols)', font=dict(size=18))
# fig_base.show()

In [54]:
test_df = get_labeled_predicion(clf, merge_df, meta_idx, y_label = 'is_sick',
                                pivot_label = 'symptoms', tt='test', norm_axis=1,
                               name="Full",sorted_index=lbl)

predict          False     True 
bin_label                       
Control       0.732558  0.267442
Pre-symptoms  0.454545  0.545455
Symptomatic   0.404762  0.595238
Resolved      0.760000  0.240000



predict       False  True 
bin_label                 
Control          63     23
Pre-symptoms     10     12
Symptomatic      17     25
Resolved         38     12


#### Raw code. Keep for later 

In [25]:

test_df, X_test, y_test = get_cls_prediction(filtered_clf,filtered_merge_df,filtered_meta_idx,y_label='is_sick')
pred = test_df.pred.values
label = test_df.symptoms

two_cls_pred = [str(p) for p in pred]
two_cls_y_test = [str(p) for p in  label]  
print(f"Got test accuracy {accuracy_score(two_cls_y_test, two_cls_pred)}")

# test_df = pd.DataFrame().assign(pred=pred,bin_pred=two_cls_pred,bin_label=two_cls_y_test) #type: pd.DataFrame
pred_df = pd.DataFrame().assign(pred=pred,bin_pred=two_cls_pred,bin_label=two_cls_y_test, label=y_test) #type: pd.DataFrame
t = pred_df.melt(id_vars=['bin_label', 'label'],value_vars=['pred'],value_name='predict')
tbl = t.groupby(['bin_label','predict'])['variable'].agg('count').unstack().fillna(0)
label_tbl = t.groupby(['bin_label','label'])['variable'].agg('count').unstack().fillna(0)
# col_map = {n: f'{n} ({cat_to_bin(classes_arr)(n)})' for n in tbl.columns}
# tbl = tbl.reindex(DEFAULT_ST_SORTED_ARRAY,axis=1)
# tbl = tbl.rename(columns=col_map)

# print(tbl.div(tbl.sum(axis=1), axis=0))
# print("\n\n")
# print(tbl.astype(int))

Got test accuracy 0.0


In [91]:
c = 'brwnyl'
x=tbl.columns.astype(str).tolist()
y=tbl.index.astype(str).tolist()
percentage_tbl = tbl.div(tbl.sum(axis=1), axis=0)
percentage_np = percentage_tbl.to_numpy()
fig_base = make_subplots(rows=2, cols=1)
fig_percent = ff.create_annotated_heatmap(percentage_np,x=x,y=y,colorscale=c, zmin=0.5, zmax=1, zauto=False)
fig_percent.update_layout(title_text='Model Prediction - Symptoms(rows)/is_sick(cols)', font=dict(size=18))
fig_base.append_trace(fig_percent['data'][0], 1, 1)

fig_cnt = ff.create_annotated_heatmap(percentage_np, annotation_text = tbl.to_numpy(),x=x,y=y,colorscale=c, zmin=0.5, zmax=1, zauto=False)
fig_cnt.update_layout(title_text='Count Model Prediction - Symptoms(rows)/is_sick(cols)', font=dict(size=18))
fig_base.append_trace(fig_cnt['data'][0], 2 , 1)

annot1 = list(fig_percent.layout.annotations)
annot2 = list(fig_cnt.layout.annotations)
for k  in range(len(annot2)):
    annot2[k]['xref'] = 'x2'
    annot2[k]['yref'] = 'y2'
fig_base.update_layout(annotations=annot1+annot2)  
fig_base['layout']['xaxis']['side'] = 'top'
fig_base.update_layout(title_text='Count Model Prediction - Symptoms(rows)/is_sick(cols)', font=dict(size=18))
fig_base

### Predict Symptoms

In [12]:
filtered_params, best_params = get_classifications_params()
filtered_merge_df, filtered_meta_idx = get_filtered_data(norm_l7_path, best_features)
filtered_merge_df = filtered_merge_df.loc[~pd.isna(filtered_merge_df.symptoms)]
merge_df = merge_df.loc[~pd.isna(merge_df.symptoms)]
symptoms_lst = ['Control','Pre-symptoms','Symptomatic','Resolved']

In [265]:
ct.columns.tolist()

['Pre-symptoms', 'Symptomatic', 'Resolved', 'Control']

In [13]:
symp_best_params = best_params.copy()
symp_filtered_best_params = filtered_params.copy()
symp_filtered_best_params['class_weight'] = {'Control':0.5, "Pre-symptoms":2, 'Resolved':1, 'Symptomatic':1}
symp_best_params['class_weight'] = {'Control':0.5, "Pre-symptoms":2, 'Resolved':1, 'Symptomatic':1}

num_runs = 1
train_ratio = 0.7
merge_df_tt = get_merged_tt_df(merge_df,split_control=False,train_ratio=train_ratio)
filtered_merge_df = filtered_merge_df.assign(tt=merge_df_tt.tt)

filtered_clf = RandomForestClassifier(**symp_filtered_best_params)
f_is_ap_acc = get_avg_accuracy(filtered_clf,filtered_merge_df,filtered_meta_idx,num_runs=num_runs,y='symptoms',train_ratio=train_ratio)
print(f_is_ap_acc)

# merge_df_tt = merge_df_tt.groupby('record_id').apply(lambda x: x.assign(is_ap= (x.symptoms == 'Symptomatic').any()).reset_index(drop=True)).reset_index(drop=True)
clf = RandomForestClassifier(**symp_best_params)
is_ap_acc = get_avg_accuracy(clf,merge_df_tt,meta_idx,num_runs=num_runs,y='symptoms',train_ratio=train_ratio)
# is_ap_acc = get_classifier_accuracy(clf,merge_df_tt,meta_idx,y='is_ap',split_tt=False)
print(is_ap_acc)

print("\nRandom Accuracy")
get_avg_classification_random_accuracy(filtered_merge_df,num_runs, label_name='symptoms')


100%|██████████| 1/1 [00:02<00:00,  2.32s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Train AVG accuracy 0.6965742251223491
Test AVG accuracy 0.308868501529052
(0.6965742251223491, 0.308868501529052)


100%|██████████| 1/1 [00:03<00:00,  3.07s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Train AVG accuracy 0.9789473684210527
Test AVG accuracy 0.4983277591973244
(0.9789473684210527, 0.4983277591973244)

Random Accuracy


100%|██████████| 1/1 [00:00<00:00,  4.94it/s]


(0.3342541436464088, 0.30416666666666664)

In [44]:
symp_best_params = best_params.copy()
symp_filtered_best_params = filtered_params.copy()
symp_filtered_best_params['class_weight'] = {'Control':0.5, "Pre-symptoms":1, 'Resolved':1, 'Symptomatic':1}
symp_best_params['class_weight'] = {'Control':0.33, "Pre-symptoms":1, 'Resolved':1, 'Symptomatic':1}
y='symptoms'

clf = RandomForestClassifier(**symp_best_params)
train_acc, test_acc = get_classifier_accuracy(clf, merge_df_tt,meta_idx,y=y,train_ratio=train_ratio, split_tt=False )
print(train_acc, test_acc)
# is_ap_acc = get_avg_accuracy(clf,merge_df_tt,meta_idx,num_runs=num_runs,y='symptoms',train_ratio=train_ratio)

filtered_clf = RandomForestClassifier(**symp_filtered_best_params)
train_acc, test_acc = get_classifier_accuracy(filtered_clf, filtered_merge_df,filtered_meta_idx,y=y,train_ratio=train_ratio, split_tt=False )
print(train_acc, test_acc)
# f_is_ap_acc = get_avg_accuracy(filtered_clf,filtered_merge_df,filtered_meta_idx,num_runs=num_runs,y='symptoms',train_ratio=train_ratio, split_tt=False)

0.9353383458646617 0.44816053511705684
0.765089722675367 0.3547400611620795


In [191]:
# label_col = 'symptoms'
# res = get_classifier_accuracy(clf, merge_df_tt, meta_idx, y=label_col,train_ratio=train_ratio, split_tt=True, return_all=True)
# test_pred = res['test_pred']
# test_df = res['test_df']

# # ct,cnt = get_confusion_matrix(test_df,test_df.pred,test_df[label_col].unique().tolist(), label_col=label_col, return_count=True, normalize = 'index')
# pred = test_df.pred
# classes_arr = test_df[label_col].unique().tolist()
# normalize='index'

# df_mapping = pd.DataFrame({label_col: classes_arr})
# sort_mapping = df_mapping.reset_index().set_index(label_col)

# pred_merge_df = test_df.copy()
# pred_merge_df['pred_names'] = pred
# pred_merge_df['label_num'] = pred_merge_df[label_col].map(sort_mapping['index'])
# #     pred_merge_df['kmeans_label'].unique(),pred_merge_df['sample_time_num'].unique()
# pred_merge_df['pred_names_num'] = pred_merge_df['pred_names'].map(sort_mapping['index'])

# label_s = pd.Series(pred_merge_df.label_num, name='label')
# pred_s = pd.Series(pred_merge_df.pred_names_num, name='pred')

# ct = pd.crosstab(pred_s, label_s, normalize=normalize)
# mapping_dict = df_mapping.to_dict()[label_col]
# ct = ct.rename(columns=mapping_dict, index=mapping_dict)

# cnt = pd.crosstab(label_s,pred_s)
# cnt = cnt.rename(columns=mapping_dict, index=mapping_dict)


In [379]:
merge_df_tt.tt.value_counts()

train    683
test     281
Name: tt, dtype: int64

In [32]:

label_col = 'symptoms'
round_n = 3
res = get_classifier_accuracy(clf, merge_df_tt, meta_idx, y=label_col,train_ratio=train_ratio, split_tt=False, return_all=True, )
test_df = res['test_df']
print(test_df.shape)
ct,cnt = get_confusion_matrix(test_df,meta_idx, test_df.pred,symptoms_lst, 
                              label_col=label_col, return_count=True, normalize = 'columns',split_tt=False, res_data = res )

# percentage_tbl = tbl.div(tbl.sum(axis=norm_axis), axis=1 - norm_axis)
precentage_np = ct.to_numpy().round(round_n)
count_np = cnt.to_numpy().round(round_n)

name = "testing"
draw_acc_count_heatmap(count_np, name, precentage_np, ct, title='', add_title=False )

(299, 516)


In [36]:
label_col = 'symptoms'
round_n = 3
res = get_classifier_accuracy(clf, merge_df_tt, meta_idx, y=label_col,train_ratio=train_ratio, split_tt=False, return_all=True, )
test_df = res['test_df']
print(test_df.shape)
ct,cnt = get_confusion_matrix(test_df,meta_idx, test_df.pred,symptoms_lst, 
                              label_col=label_col, return_count=True, normalize = 'index',split_tt=False, res_data = res )

# percentage_tbl = tbl.div(tbl.sum(axis=norm_axis), axis=1 - norm_axis)
precentage_np = ct.to_numpy().round(round_n)
count_np = cnt.to_numpy().round(round_n)

name = "testing"
draw_acc_count_heatmap(count_np, name, precentage_np, ct, title='', add_title=False )

(299, 516)


In [45]:
label_col = 'symptoms'
round_n = 3
res = get_classifier_accuracy(filtered_clf, filtered_merge_df, filtered_meta_idx, y=label_col,train_ratio=train_ratio, split_tt=False, return_all=True)
test_df = res['test_df']
print(filtered_merge_df.shape)
ct,cnt = get_confusion_matrix(test_df,filtered_meta_idx, test_df.pred,symptoms_lst, label_col=label_col, return_count=True,
                              normalize = 'columns', split_tt=False, res_data = res)

# percentage_tbl = tbl.div(tbl.sum(axis=norm_axis), axis=1 - norm_axis)
precentage_np = ct.to_numpy().round(round_n)
count_np = cnt.to_numpy().round(round_n)

name = "testing"
draw_acc_count_heatmap(count_np, name, precentage_np, ct, title='', add_title=False )

(964, 39)


In [46]:
label_col = 'symptoms'
round_n = 3
res = get_classifier_accuracy(filtered_clf, filtered_merge_df, filtered_meta_idx, y=label_col,train_ratio=train_ratio, split_tt=False, return_all=True)
test_df = res['test_df']
print(filtered_merge_df.shape)
ct,cnt = get_confusion_matrix(test_df,filtered_meta_idx, test_df.pred,symptoms_lst, label_col=label_col, return_count=True,
                              normalize = 'index', split_tt=False, res_data = res)

# percentage_tbl = tbl.div(tbl.sum(axis=norm_axis), axis=1 - norm_axis)
precentage_np = ct.to_numpy().round(round_n)
count_np = cnt.to_numpy().round(round_n)

name = "testing"
draw_acc_count_heatmap(count_np, name, precentage_np, ct, title='', add_title=False )

(964, 39)


In [16]:
np.unique(res['test_pred'],return_counts=True)

(array(['Control', 'Pre-symptoms', 'Resolved', 'Symptomatic'], dtype=object),
 array([ 80,  63, 101,  73]))

In [237]:
# tbl = cnt
# c = 'brwnyl'
# x = tbl.columns.astype(str).tolist()
# y = tbl.index.astype(str).tolist()
# fig_base = make_subplots(rows=2, cols=1)
# fig_percent = ff.create_annotated_heatmap(
#     precentage_np, x=x, y=y, colorscale=c,zmin=precentage_np.max()/2, zmax=precentage_np.max())
# fig_percent['layout']['xaxis']['side'] = 'top'
# fig_base.append_trace(fig_percent['data'][0], 1, 1)
# fig_cnt = ff.create_annotated_heatmap(precentage_np, annotation_text=count_np, x=x,
#                                       y=y, colorscale=fig_percent.data[0]['colorscale'], zmid=1,
#                                       zauto=True)
# # fig_cnt['layout']['xaxis']['side'] = 'top'
# fig_base.append_trace(fig_cnt['data'][0], 2, 1)
# # Add annotation to the plot
# annot1 = list(fig_percent.layout.annotations)
# annot2 = list(fig_cnt.layout.annotations)
# for k in range(len(annot2)):
#     annot2[k]['xref'] = 'x2'
#     annot2[k]['yref'] = 'y2'
    
# #     annot1[k]['xref'] = 'x1'
# #     annot1[k]['yref'] = 'y1'

# new_annot = list()
# new_annot.extend(annot1)
# new_annot.extend(annot2)

# fig_base.update_layout(annotations=new_annot)
# # fig_base['layout']['xaxis']['side'] = 'top'
# fig_base.update_layout(
#     title_text=f'{name} Model Prediction - Symptoms(rows)/is_sick(cols)', font=dict(size=18))

In [206]:
label_col = 'symptoms'
res = get_classifier_accuracy(clf, merge_df_tt, meta_idx, y=label_col,train_ratio=train_ratio, split_tt=False, return_all=True)
test_df = res['test_df']

ct,cnt = get_confusion_matrix(test_df,test_df.pred,test_df[label_col].unique().tolist(), label_col=label_col, return_count=True, normalize = 'index')

In [200]:
filtered_test_df = get_labeled_predicion(filtered_clf, filtered_merge_df,
                                         filtered_meta_idx, y_label = 'symptoms',
                                         pivot_label = 'symptoms', norm_axis=1, round_n=2,
                                        name='Filtered')

predict        Control  Pre-symptoms  Resolved  Symptomatic
bin_label                                                  
Control       0.782353      0.005882  0.141176     0.070588
Pre-symptoms  0.730769      0.000000  0.230769     0.038462
Resolved      0.640777      0.009709  0.252427     0.097087
Symptomatic   0.808219      0.000000  0.150685     0.041096



predict       Control  Pre-symptoms  Resolved  Symptomatic
bin_label                                                 
Control           133             1        24           12
Pre-symptoms       19             0         6            1
Resolved           66             1        26           10
Symptomatic        59             0        11            3


In [338]:
filtered_tt_df = get_merged_tt_df(filtered_merge_df,split_control=False,train_ratio=0.8)
train_df = filtered_tt_df[filtered_tt_df.tt == 'train']
X_train = train_df.iloc[:,:filtered_meta_idx].values
y_train = train_df.symptoms.values
parameter_search_classifier(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.0min finished


{'random_state': 666,
 'n_estimators': 1200,
 'min_samples_split': 5,
 'min_samples_leaf': 10,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

# Dimensionallity Reduction

In [76]:
from scipy.spatial import distance
import skbio

metrics = ['braycurtis', 'canberra', 'chebyshev', 'cityblock',
    'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
    'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching',
    'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
    'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']

def pcoa_dim_reduction(df,meta_idx,metric='braycurtis', i=1, j=2, k=3):
    """
    metric: options are 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
    'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
    'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching',
    'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
    'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'.
    """
    X = df.iloc[:, :meta_idx]
    Ar_dist = distance.squareform(distance.pdist(X, metric="braycurtis"))  # (m x m) distance measure
    DM_dist = skbio.stats.distance.DistanceMatrix(Ar_dist, ids=X.index)
    PCoA = skbio.stats.ordination.pcoa(DM_dist, number_of_dimensions=6)

    PCoA_samples_df = PCoA.samples
    dims = PCoA_samples_df.shape[1]

    PCoA_samples_df = PCoA.samples
    dims = PCoA_samples_df.shape[1]
    col = 'symptoms'
    # col = 'record_id'

    embedded_X = PCoA_samples_df.rename(columns={f"PC{i}": 'x', f"PC{j}": 'y', f"PC{k}": 'z'})
    plot_df = pd.concat([df,embedded_X],axis=1)

    embedded_X = embedded_X.assign(**{col:df[col]})
    # plot_df.record_id = plot_df.record_id.astype(str)
    # clean_df = plot_df[~plot_df[col].isna()].copy()
    embedded_X = embedded_X[~embedded_X[col].isna()].copy()
    fig = px.scatter_3d(embedded_X, x='x', y='y', z='z', color=col)
    return fig


**Dimensionallity  reduction over full data** 


In [386]:
merge_df, meta_idx = get_gmap_data(norm_l7_path)
filtered_merge_df = merge_df.copy()
filtered_meta_idx = meta_idx
# filtered_tt_df = filtered_tt_df.reset_index().rename(columns={'index':'sample_name'})
# filtered_tt_df = filtered_merge_df.reset_index().rename(columns={'index':'sample_name'})
# filtered_data_df = filtered_tt_df.iloc[:,:filtered_meta_idx]
# filtered_data_df = filtered_data_df.div(filtered_data_df.sum(axis=1),axis=0).fillna(0)  # normalize to 1 

pcoa_dim_reduction(filtered_merge_df,meta_idx,metric='braycurtis')
