# Comparing Features from Different Songbird Models
The machine learning models for the current analysis have been performing worse compared with the old analysis. Here we will compare the feature sets to see how they differ. The models have been built using, 10 train/10 test samples, 70/30 split test samples, and the samples used when building the model from the old analysis

In [1]:
import pandas as pd
import qiime2
import numpy as np
import os

os.chdir('/Users/tgroth/Google Drive/knight_twin_NAFLD/fecal_analysis')

## Loading in Model Differentials

In [6]:
# loading in best performing model built off ten pos and 10 neg for AF
ten_tt = qiime2.Artifact.load('songbird_analysis/f_ATTRIBUTE_adv_fibrosis_dp_0.75_lr_0.0001_e_5000/differentials.qza')
ten_tt_df = ten_tt.view(pd.DataFrame)
ten_tt_df.sort_values(by=['ATTRIBUTE_adv_fibrosis[T.1]'],ascending=False,inplace=True)
ten_tt_df.head(3)

Unnamed: 0_level_0,Intercept,ATTRIBUTE_adv_fibrosis[T.1],ATTRIBUTE_adv_fibrosis[T.Missing: Not collected]
featureid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f16fb9a4556296869e4f5e582a914dd2,-1.148374,2.699293,0.037231
b88289a6da54ad61fca21f4ecab92fd9,0.035948,2.607767,0.050815
4c25b8be76070246953849e3931a78c6,-1.103566,2.511959,0.050815


In [4]:
# loading in best performing model built off a 70/30 train test split that maintained the pos/neg distribution
split_tt = qiime2.Artifact.load('songbird_analysis/songbird_traintest_7030/f_ATTRIBUTE_adv_fibrosis_dp_5_lr_0.001_e_5000/differentials.qza')
split_tt_df = split_tt.view(pd.DataFrame)
split_tt_df.sort_values(by=['ATTRIBUTE_adv_fibrosis[T.1]'],ascending=False,inplace=True)
split_tt_df.head(3)

Unnamed: 0_level_0,Intercept,ATTRIBUTE_adv_fibrosis[T.1],ATTRIBUTE_adv_fibrosis[T.Missing: Not collected]
featureid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cdf8d23687cfc324485e2bd3e0094ff2,-0.434705,4.605243,0.010917
fb758530086adf438d5ebe8f83847f2d,-0.498779,4.28027,0.010917
d2ef3361261cc1d835ad59394bfe161c,-0.053588,4.253552,0.010917


In [16]:
# loading in best performing model from old analysis
old_model = qiime2.Artifact.load('../old_analyses/fecal_analysis/fecal_diff_abundance_analysis/f_ATTRIBUTE_adv_fibrosis+ATTRIBUTE_groups_dp_0.4_lr_0.001_e_2500_drop/differentials.qza')
old_model_df = old_model.view(pd.DataFrame)
old_model_df.sort_values(by=['ATTRIBUTE_adv_fibrosis[T.1]'],ascending=False,inplace=True)
old_model_df.head(3)

Unnamed: 0_level_0,Intercept,ATTRIBUTE_adv_fibrosis[T.1],ATTRIBUTE_adv_fibrosis[T.Missing: Not collected],ATTRIBUTE_groups[T.G1R],ATTRIBUTE_groups[T.G2P],ATTRIBUTE_groups[T.G2R],ATTRIBUTE_groups[T.G3P],ATTRIBUTE_groups[T.G3R]
featureid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fb758530086adf438d5ebe8f83847f2d,-0.670929,3.858487,2.148028,-0.965325,0.552249,0.101298,-0.472353,1.344099
0ba75f593218ef056eefaeab285ae839,-0.188644,3.76405,1.413932,-1.381585,0.957711,0.416394,-1.125884,1.348312
cdf8d23687cfc324485e2bd3e0094ff2,-0.613116,3.744521,-6.213303,-0.994323,0.177037,0.822062,-0.882106,1.987184


## Comparing Feature Sets

In [17]:
# creating top 50 and top/bot 25 for each model
ten_top50_id = list(ten_tt_df.iloc[0:50].index)
ten_tb25_id = list(ten_tt_df.iloc[0:25].index)+list(ten_tt_df.iloc[-26:-1].index)

split_top50_id = list(split_tt_df.iloc[0:50].index)
split_tb25_id = list(split_tt_df.iloc[0:25].index)+list(split_tt_df.iloc[-26:-1].index)

old_top50_id = list(old_model_df.iloc[0:50].index)
old_tb25_id = list(old_model_df.iloc[0:25].index)+list(old_model_df.iloc[-26:-1].index)

In [14]:
# Comparing 10 train/10 test features and 70/30 split
print(len(set(ten_top50_id)&set(split_top50_id)))

print(len(set(ten_tb25_id)&set(split_tb25_id)))

16
8


In [18]:
# Comparing 10 train/10 test features and Old features
print(len(set(ten_top50_id)&set(old_top50_id)))

print(len(set(ten_tb25_id)&set(old_tb25_id)))

13
9


In [19]:
# Comparing 70/30 split and old model
print(len(set(split_top50_id)&set(old_top50_id)))

print(len(set(split_tb25_id)&set(old_tb25_id)))

18
20
