# Nobel Twin Study, Serum Data Processing
This notebook performs all data processing and data adjustment steps for the serum data.

**Author: Tobin Groth (tgroth@ucsd.edu)**

In [11]:
import pandas as pd
import qiime2
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from skbio.stats import composition

# NOTE(review): hardcoded, machine-specific working directory. Every relative path used
# below is resolved against it, so this line must be adjusted to run on another machine.
os.chdir('/Users/tgroth/Google Drive/knight_twin_NAFLD/serum_analysis')

## Creating Consistent Serum Feature Table
In the old analysis we did not have a consistent set of samples across our analyses. Using the samples from the old combined analysis, we can ensure that our fecal, serum and multi-omic analyses all use the same samples. We will pull the samples from the 'fecal-combined-ft.tsv' feature table found in the old multi-omic machine learning analysis folder.

In [23]:
#loading in the serum feature table
serum_table = qiime2.Artifact.load('feature_tables/merged-serum-ft.qza')
#view the artifact as a DataFrame (per the preview: rows are .mzXML run names, columns are hashed feature IDs)
serum_tablepd = serum_table.view(pd.DataFrame)
print(serum_tablepd.shape)
serum_tablepd.head(3)

(204, 1607)


Unnamed: 0,004669f18768de8777ab02b5eed2165b,0063a016480cb00cc62af310cb1ca746,006bd617624c9f170df9372dd3ff6487,0073e5797097eb7c25e03ad5356a2954,00e4514c7dcfba955015cd25595af942,011e61c96c8d2521dc8b0c54cdab4076,01286b7f01dae6ab0660c79a1998ba23,0143d5c54180a2de7d38e7efa0b756b8,01459d6ea6596b8f9b91fec668d3a00b,01e2b489b271315160c5394854aef1be,...,ff4e3b424a3e71dfc49ac13a310835da,ff701fb5ca565cdb50764430730d0043,ff801064bbbe791c69467e2065cf7782,ff8674831b6662f5f7595ed2f73dfd4b,ff925d1aaea7b3964e8db94bee3e3e23,ffb84b6398f7bbc3d4166c9dcd6c6db7,ffc1f86e5f21b3eae64c287a000305b7,ffc773a4760c2c888b1752ec6cadbdd0,ffc9bea70abf3c54724a2e0dc386ce00,ffccfaf24b05a13ac8cc725a5cb93b3f
CIR16-001_RE7_01_35016.mzXML,143.88,123.509,274.8745,514.48,28846.769,38047.827,173.628,21267.058,91085.697,46995.3435,...,583.38,161.963,700.9985,481.967,240.826,125.115,71.2725,43163.3105,76.44,107.8605
CIR18-001_RE5_01_35014.mzXML,138.557,223.51,491.405,0.0,56232.903,367150.6315,0.0,68301.2625,126465.533,12161.54,...,313.056,234.995,342.19,820.5335,208.992,116004.6005,80.993,99598.004,0.0,0.0
CIR20-002_RD11_01_35008.mzXML,514.733,3695.693,298.3725,0.0,82255.611,17912.0,183.036,30078.326,215435.746,54042.601,...,916.995,344.668,324.732,1094.55,837.0755,9244.004,179.6645,32150.0235,0.0,112.3215


In [24]:
#combined feature table from old analysis
#its index (the sample IDs) is what the next cell uses to select the matching serum samples
comb_samp = pd.read_csv('../old_analyses/combined_analysis/multiomic_ml_analysis/fecal-combined-ft.tsv',sep='\t',index_col=0)
comb_samp.head(3)

Unnamed: 0,fb758530086adf438d5ebe8f83847f2d,0ba75f593218ef056eefaeab285ae839,cdf8d23687cfc324485e2bd3e0094ff2,d2ef3361261cc1d835ad59394bfe161c,4deeef4aa3231abf469afec860b7b1ea,f2f3dfa3c15062c6cc6fdda001f9ab52,db77705855bc4611e9c4fedca4b4f926,c0095c69fee5c767903814b366fcb860,0366e39a2c8c5aeaa60e42839a8b33e9,31b4a92e41933b13e3182509ebcd71e6,...,TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGATAGGTCAGTCTGTCTTAAAAGTTCGGGGCTTAACCCCGTGATGGGATGGAAACTGCCAATCTAGAGTATCGGAGAGGAAAGTGGAATTCCTAGT,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTTAAGCAAGTCTGAAGTGAAAGCCCGGGGCTCAACCCCGGTACTGCTTTGGAAACTGTTTGACTTGAGTGCAGGAGAGGTAAGTGGAATTCCTAG,TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGCCGGCAAGTCAGATGTGAAATCTGGAGGCTTAACCTCCAAACTGCATTTGAAACTGTAGGTCTTGAGTACCGGAGAGGTTATCGGAATTCCTTG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAAGAGCAAGTCTGATGTGAAAGGCTGGGGCTTAACCCCAGGACTGCATTGGAAACTGTTTTTCTAGAGTGCCGGAGAGGTAAGCGGAATTCCTAG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,TACGTAGGTGGCAAGCGTTATCCGGAATCATTGGGCGTAAAGGGTGCGTAGGTGGCGTACTAAGTCTGTAGTAAAAGGCAATGGCTCAACCATTGTAAGCTATGGAAACTGGTATGCTGGAGTGCAGAAGAGGGCGATGGAATTCCATGT,TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCGGCAGAGGTCTGTGGTGAAAGACTGAAGCTTAACTTCAGTAAGCCATAGAAACCGGGCTGCTAGAGTGCAGGAGAGGATCGTGGAATTCCATGT,AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCATGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATGTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAAGCAAGTCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGCTTTGGAAACTGTTTTGCTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAG
TW-CO-002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,14.0,0.0,19.0,10.0,0.0,0.0
TW-BT-001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0
TW-CA-002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,3.0,0.0,0.0


In [25]:
#change index to CIR#-00# or TW-XX-00#
#(strip the run/file suffix, e.g. 'CIR16-001_RE7_01_35016.mzXML' -> 'CIR16-001')
serum_copy = serum_tablepd.copy()
serum_idx = list(serum_copy.index)
#split('_', 1)[0] keeps the whole ID when it contains no underscore; the previous
#idx[:idx.find('_')] silently chopped the last character in that case (find() returns -1)
serum_idx_adj = [idx.split('_', 1)[0] for idx in serum_idx]
serum_copy.index = serum_idx_adj
#select the serum rows by the sample IDs of the combined table
serum_filt = serum_copy.loc[comb_samp.index]
print(serum_filt.shape)
serum_filt.head(3)

(173, 1607)


Unnamed: 0,004669f18768de8777ab02b5eed2165b,0063a016480cb00cc62af310cb1ca746,006bd617624c9f170df9372dd3ff6487,0073e5797097eb7c25e03ad5356a2954,00e4514c7dcfba955015cd25595af942,011e61c96c8d2521dc8b0c54cdab4076,01286b7f01dae6ab0660c79a1998ba23,0143d5c54180a2de7d38e7efa0b756b8,01459d6ea6596b8f9b91fec668d3a00b,01e2b489b271315160c5394854aef1be,...,ff4e3b424a3e71dfc49ac13a310835da,ff701fb5ca565cdb50764430730d0043,ff801064bbbe791c69467e2065cf7782,ff8674831b6662f5f7595ed2f73dfd4b,ff925d1aaea7b3964e8db94bee3e3e23,ffb84b6398f7bbc3d4166c9dcd6c6db7,ffc1f86e5f21b3eae64c287a000305b7,ffc773a4760c2c888b1752ec6cadbdd0,ffc9bea70abf3c54724a2e0dc386ce00,ffccfaf24b05a13ac8cc725a5cb93b3f
TW-CO-002,13482.9175,1728.547,39264.714,198.107,152.32,703.892,8193.679,1358.4665,2225.743,29210.1555,...,26854.649,25479.218,107.163,42909.374,19609.6605,4563.516,2375.858,83236.095,391.188,3154.1725
TW-BT-001,18231.127,238.272,65264.7285,440.008,155.797,0.0,74410.0835,1227.279,1206.9795,51765.6815,...,25581.0385,33933.565,14877.4745,40394.059,39989.3425,12248.6935,6332.2265,578582.094,3390.1785,8678.9145
TW-CA-002,19558.123,174.423,109720.61,0.0,0.0,0.0,128846.6325,2199.139,6371.227,61950.6475,...,30272.513,28787.334,19124.6915,40433.607,18785.226,36733.7495,6905.1055,85905.816,95.787,46600.2255


In [26]:
#there are more than 171 samples because some sample IDs occur more than once;
#keep=False flags every copy of a duplicated ID, so all copies end up in the drop list
dup = serum_filt.index.duplicated(keep=False)
#CIR4-001 is missing BMI and FS-CIR2-002 is missing feature data, so both are dropped as well
to_drop = serum_filt.index[dup].tolist() + ['CIR4-001', 'FS-CIR2-002']
to_drop

['TW-BI-001', 'TW-BI-001', 'TW-DI-002', 'TW-DI-002', 'CIR4-001', 'FS-CIR2-002']

In [27]:
#remove every row whose sample ID is listed in to_drop
serum_filt_drop = serum_filt.drop(to_drop)
serum_filt_drop.shape

(167, 1607)

**Final number of samples for all feature tables is 167 total samples**

In [28]:
#save the matched table (samples as rows, features as columns)
serum_filt_drop.to_csv('./feature_tables/serum-ft-matched.tsv',sep='\t')

In [29]:
#save the transposed table as well (features as rows, samples as columns)
serum_filt_drop.T.to_csv('./feature_tables/serum-ft-matched_T.tsv',sep='\t')

## Creating matched metadata
The current metadata contains a few more samples than our matched feature tables. We will create a matched metadata file that only contains the samples within our matched feature tables.

In [11]:
#loading in the serum matched table
serum_matched = pd.read_csv('./feature_tables/serum-ft-matched.tsv',sep='\t',index_col=0)
#the index holds the sample IDs that were kept in the matched feature table
matched_samples = serum_matched.index
matched_samples

Index(['TW-CO-002', 'TW-BT-001', 'TW-CA-002', 'TW-BD-002', 'TW-AV-001',
       'TW-BS-002', 'TW-BJ-001', 'TW-CC-002', 'TW-BU-002', 'TW-BO-002',
       ...
       'CIR18-002', 'CIR41-001', 'CIR63-002', 'CIR19-001', 'CIR5-001',
       'CIR55-002', 'CIR26-002', 'CIR5-002', 'CIR55-001', 'CIR7-001'],
      dtype='object', length=167)

In [12]:
#full study metadata, indexed by its first column (sampleid, per the preview)
meta = pd.read_csv('../metadata.tsv',sep='\t',index_col=0)
meta.head(3)

Unnamed: 0_level_0,ATTRIBUTE_AgeInYears,ATTRIBUTE_BarcodeSequence,ATTRIBUTE_BiologicalSex,ATTRIBUTE_ChromatographyAndPhase,ATTRIBUTE_ComorbidityListDOIDIndex,ATTRIBUTE_Country,ATTRIBUTE_DOIDCommonName,ATTRIBUTE_DOIDOntologyIndex,ATTRIBUTE_DepthorAltitudeMeters,ATTRIBUTE_Description,...,ATTRIBUTE_waist_circumference_cm,ATTRIBUTE_water_lot,ATTRIBUTE_wbc,ATTRIBUTE_weight,ATTRIBUTE_weight_units,ATTRIBUTE_well_description,ATTRIBUTE_well_id,ATTRIBUTE_zygocity,ATTRIBUTE_zygoticity,filename
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FS-CIR2-002,30,ATACATGCAAGA,female,not applicable,not applicable,United States of America,not applicable,not applicable,200,sample FS.CIR2.002,...,not provided,RNBF9925,not provided,,,Loomba_Twin_Plate_1_FS.CIR2.002_D7,D7,not applicable,,FS-CIR2-002_RF10_01_35031.mzXML
TW-AA-002,73,AATTTAGGTAGG,female,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,94,RNBF9925,6.7,66.8,kg,Loomba_Twin_Plate_1_TW.AA.002_A8,A8,1,1.0,TW-AA-002_RG11_01_35929.mzXML
TW-AE-002,71,GAAATCTTGAAG,female,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,114.8,RNBF9925,7.5,108.1,kg,Loomba_Twin_Plate_1_TW.AE.002_C11,C11,2,2.0,TW-AE-002_RD12_01_35894.mzXML


In [13]:
#creating new matched meta data that only has matched samples
meta_copy = meta.copy()
#.loc with the matched sample IDs keeps exactly those rows (raises if an ID is absent from the metadata)
meta_matched = meta_copy.loc[matched_samples]
meta_matched.shape

(167, 168)

In [14]:
#write the matched metadata next to the original metadata file
meta_matched.to_csv('../metadata-matched.tsv',sep='\t')

## Creating Quantile Normalized Table
To perform diversity analysis, we first need to normalize our feature table. We chose quantile normalization, using the method found at `https://github.com/ShawnLYU/Quantile_Normalize/blob/master/quantile_norm.py`.

In [16]:
def quantileNormalize(df_input):
    """Quantile-normalize the columns of a DataFrame.

    Every column is sorted independently and the sorted columns are averaged
    position by position, giving one reference value per rank. Each original
    value is then replaced by the reference value at its rank within its own
    column.

    Tied values in a column all receive the reference value of the lowest
    rank they occupy (``np.searchsorted`` returns the left-most position).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Table to normalize. It is copied; the input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_input`` with the same index and columns, holding the
        normalized values.
    """
    normalized = df_input.copy()
    # Reference distribution: mean of the k-th smallest value over all columns.
    sorted_columns = {name: sorted(normalized[name]) for name in normalized}
    rank_means = pd.DataFrame(sorted_columns).mean(axis=1).tolist()
    # Swap each value for the reference value found at its rank in its own column.
    for name in normalized:
        ordered = np.sort(normalized[name])
        positions = np.searchsorted(ordered, normalized[name])
        normalized[name] = [rank_means[pos] for pos in positions]
    return normalized

In [30]:
#reading in matched feature table (the serum-ft-matched.tsv file written earlier in this notebook)
serum_table = pd.read_csv('./feature_tables/serum-ft-matched.tsv',sep='\t',index_col=0)
serum_table.head(3)

Unnamed: 0,004669f18768de8777ab02b5eed2165b,0063a016480cb00cc62af310cb1ca746,006bd617624c9f170df9372dd3ff6487,0073e5797097eb7c25e03ad5356a2954,00e4514c7dcfba955015cd25595af942,011e61c96c8d2521dc8b0c54cdab4076,01286b7f01dae6ab0660c79a1998ba23,0143d5c54180a2de7d38e7efa0b756b8,01459d6ea6596b8f9b91fec668d3a00b,01e2b489b271315160c5394854aef1be,...,ff4e3b424a3e71dfc49ac13a310835da,ff701fb5ca565cdb50764430730d0043,ff801064bbbe791c69467e2065cf7782,ff8674831b6662f5f7595ed2f73dfd4b,ff925d1aaea7b3964e8db94bee3e3e23,ffb84b6398f7bbc3d4166c9dcd6c6db7,ffc1f86e5f21b3eae64c287a000305b7,ffc773a4760c2c888b1752ec6cadbdd0,ffc9bea70abf3c54724a2e0dc386ce00,ffccfaf24b05a13ac8cc725a5cb93b3f
TW-CO-002,13482.9175,1728.547,39264.714,198.107,152.32,703.892,8193.679,1358.4665,2225.743,29210.1555,...,26854.649,25479.218,107.163,42909.374,19609.6605,4563.516,2375.858,83236.095,391.188,3154.1725
TW-BT-001,18231.127,238.272,65264.7285,440.008,155.797,0.0,74410.0835,1227.279,1206.9795,51765.6815,...,25581.0385,33933.565,14877.4745,40394.059,39989.3425,12248.6935,6332.2265,578582.094,3390.1785,8678.9145
TW-CA-002,19558.123,174.423,109720.61,0.0,0.0,0.0,128846.6325,2199.139,6371.227,61950.6475,...,30272.513,28787.334,19124.6915,40433.607,18785.226,36733.7495,6905.1055,85905.816,95.787,46600.2255


In [31]:
#perform quantile normalization on matched table
#NOTE(review): quantileNormalize works column by column and the columns here are features
#(rows are samples), so every feature is mapped onto the same distribution. Quantile
#normalization is usually applied so that samples share a distribution -- confirm this
#orientation is intended.
serum_norm = quantileNormalize(serum_table)
serum_norm.head(3)

Unnamed: 0,004669f18768de8777ab02b5eed2165b,0063a016480cb00cc62af310cb1ca746,006bd617624c9f170df9372dd3ff6487,0073e5797097eb7c25e03ad5356a2954,00e4514c7dcfba955015cd25595af942,011e61c96c8d2521dc8b0c54cdab4076,01286b7f01dae6ab0660c79a1998ba23,0143d5c54180a2de7d38e7efa0b756b8,01459d6ea6596b8f9b91fec668d3a00b,01e2b489b271315160c5394854aef1be,...,ff4e3b424a3e71dfc49ac13a310835da,ff701fb5ca565cdb50764430730d0043,ff801064bbbe791c69467e2065cf7782,ff8674831b6662f5f7595ed2f73dfd4b,ff925d1aaea7b3964e8db94bee3e3e23,ffb84b6398f7bbc3d4166c9dcd6c6db7,ffc1f86e5f21b3eae64c287a000305b7,ffc773a4760c2c888b1752ec6cadbdd0,ffc9bea70abf3c54724a2e0dc386ce00,ffccfaf24b05a13ac8cc725a5cb93b3f
TW-CO-002,131548.827276,173844.652091,110746.574801,138963.977832,27836.082454,47858.187284,69161.93775,26810.192516,19338.655662,21338.276692,...,231926.370982,167225.321157,10906.234877,190623.556965,131548.827276,26810.192516,120407.082818,146296.671266,146296.671266,134964.946334
TW-BT-001,169339.964307,87416.168504,146296.671266,213242.644106,28510.271443,167.862435,222252.400423,26060.275883,14055.220689,37000.746223,...,222252.400423,196900.655036,196900.655036,134964.946334,250680.006584,116645.653086,169339.964307,517918.084273,226830.143531,213242.644106
TW-CA-002,173844.652091,29830.93861,193629.445716,167.862435,167.862435,167.862435,295644.974419,34252.40309,38068.298296,80715.506974,...,278631.376773,181596.259682,237757.400327,137003.356327,127918.211516,169339.964307,176382.26572,148084.510465,27836.082454,517918.084273


In [32]:
#export the quantile-normalized table to .tsv; the transpose is exported as well so the .qza can be generated easily
serum_norm.to_csv('./feature_tables/serum-ft-matched-qnorm.tsv',sep='\t')
serum_norm.T.to_csv('./feature_tables/serum-ft-matched-qnorm_T.tsv',sep='\t')

## Creating Min/Max (0-1) normalized Table
The quantile normalized table caused some issues with unweighted unifrac. Going to try this kind of normalization to see how it affects the beta diversity metrics.

In [3]:
#reading in matched feature table (un-normalized; re-read so this section does not depend on the quantile section)
serum_table = pd.read_csv('./feature_tables/serum-ft-matched.tsv',sep='\t',index_col=0)
serum_table.head(3)

Unnamed: 0,004669f18768de8777ab02b5eed2165b,0063a016480cb00cc62af310cb1ca746,006bd617624c9f170df9372dd3ff6487,0073e5797097eb7c25e03ad5356a2954,00e4514c7dcfba955015cd25595af942,011e61c96c8d2521dc8b0c54cdab4076,01286b7f01dae6ab0660c79a1998ba23,0143d5c54180a2de7d38e7efa0b756b8,01459d6ea6596b8f9b91fec668d3a00b,01e2b489b271315160c5394854aef1be,...,ff4e3b424a3e71dfc49ac13a310835da,ff701fb5ca565cdb50764430730d0043,ff801064bbbe791c69467e2065cf7782,ff8674831b6662f5f7595ed2f73dfd4b,ff925d1aaea7b3964e8db94bee3e3e23,ffb84b6398f7bbc3d4166c9dcd6c6db7,ffc1f86e5f21b3eae64c287a000305b7,ffc773a4760c2c888b1752ec6cadbdd0,ffc9bea70abf3c54724a2e0dc386ce00,ffccfaf24b05a13ac8cc725a5cb93b3f
TW-CO-002,13482.9175,1728.547,39264.714,198.107,152.32,703.892,8193.679,1358.4665,2225.743,29210.1555,...,26854.649,25479.218,107.163,42909.374,19609.6605,4563.516,2375.858,83236.095,391.188,3154.1725
TW-BT-001,18231.127,238.272,65264.7285,440.008,155.797,0.0,74410.0835,1227.279,1206.9795,51765.6815,...,25581.0385,33933.565,14877.4745,40394.059,39989.3425,12248.6935,6332.2265,578582.094,3390.1785,8678.9145
TW-CA-002,19558.123,174.423,109720.61,0.0,0.0,0.0,128846.6325,2199.139,6371.227,61950.6475,...,30272.513,28787.334,19124.6915,40433.607,18785.226,36733.7495,6905.1055,85905.816,95.787,46600.2255


In [5]:
#performing min/max normalization (each feature column is rescaled to the range 0-1)
serum_min = serum_table.min()
serum_range = serum_table.max() - serum_min
#a feature that is constant across all samples has a zero range, and dividing by it would
#turn the whole column into NaN (0/0); divide by 1 there instead so the column becomes all
#zeros (the same convention sklearn's MinMaxScaler uses)
serum_mnorm = (serum_table - serum_min) / serum_range.replace(0, 1)
serum_mnorm.head(3)

Unnamed: 0,004669f18768de8777ab02b5eed2165b,0063a016480cb00cc62af310cb1ca746,006bd617624c9f170df9372dd3ff6487,0073e5797097eb7c25e03ad5356a2954,00e4514c7dcfba955015cd25595af942,011e61c96c8d2521dc8b0c54cdab4076,01286b7f01dae6ab0660c79a1998ba23,0143d5c54180a2de7d38e7efa0b756b8,01459d6ea6596b8f9b91fec668d3a00b,01e2b489b271315160c5394854aef1be,...,ff4e3b424a3e71dfc49ac13a310835da,ff701fb5ca565cdb50764430730d0043,ff801064bbbe791c69467e2065cf7782,ff8674831b6662f5f7595ed2f73dfd4b,ff925d1aaea7b3964e8db94bee3e3e23,ffb84b6398f7bbc3d4166c9dcd6c6db7,ffc1f86e5f21b3eae64c287a000305b7,ffc773a4760c2c888b1752ec6cadbdd0,ffc9bea70abf3c54724a2e0dc386ce00,ffccfaf24b05a13ac8cc725a5cb93b3f
TW-CO-002,0.195921,0.074648,0.060065,0.000586,0.000748,0.001585,0.02481,0.002455,0.001106,0.098706,...,0.697173,0.190562,0.002514,0.869929,0.074822,0.014558,0.048671,0.120805,0.001912,0.004413
TW-BT-001,0.264918,0.01029,0.099932,0.001301,0.000765,0.0,0.225312,0.002218,0.0006,0.189108,...,0.664109,0.253793,0.349049,0.818602,0.152582,0.039073,0.129719,0.840108,0.016566,0.012144
TW-CA-002,0.2842,0.007533,0.168098,0.0,0.0,0.0,0.390145,0.003974,0.003165,0.229929,...,0.785904,0.215304,0.448695,0.819409,0.071676,0.11718,0.141454,0.124682,0.000468,0.065203


In [6]:
#export the min/max-normalized table to .tsv; the transpose is exported as well so the .qza can be generated easily
serum_mnorm.to_csv('./feature_tables/serum-ft-matched-mnorm.tsv',sep='\t')
serum_mnorm.T.to_csv('./feature_tables/serum-ft-matched-mnorm_T.tsv',sep='\t')

## Creating Matched Hashed Serum Feature Table
The merged table served its purpose for the diversity analysis, but moving forward we need to use `serum-ft-hashed.qza`, as it matches the feature data in `classified-feature-data.tsv`. We need to create a matched sample version of this feature table.

In [7]:
#loading in the hashed serum feature table (this rebinds serum_table / serum_tablepd from the merged-table section)
serum_table = qiime2.Artifact.load('feature_tables/serum-ft-hashed.qza')
serum_tablepd = serum_table.view(pd.DataFrame)
print(serum_tablepd.shape)
serum_tablepd.head(3)

(204, 1847)


Unnamed: 0,0004069a1f2c3f2b14511a4f367e485d,0013ada35a72573fde974722d6f46269,0074f0d3d22016bcdf28ee53c8a5062b,012ade16945709e69650957b8fdeb021,015bcc773d2fe77d5ddf3533aff00a9f,0171c4a4201b9604da3efc9305fc5264,017460a73612042af020d72a944567b4,01809bb355414a0015fd764cba9c5014,01a15bb425333e9bb45a57970a4fd744,01f940cc9606a206fc0f5f4be686142a,...,fe8cbb81d1abec83ba4cc2e9f82dd552,febe3e034a4a2cb784dec089610dd34c,fecd9732b400a12637130cb39f076b17,ff0f37b5dc7b583b0990818a3ca3bd85,ff47eed94e12e32213900023846a2b25,fface4fa9e4315823b284edf46814f27,ffc858cfa01fda2a2b2c0a1344a0b87a,ffe5d35f6eb7b6d40c0b901f6576ca3c,ffeb115b9cba8e7fde3b05c27b47a913,fff8ef7ac71077d6ddc2c538ad270385
CIR16-001_RE7_01_35016.mzXML,27970.3905,9304.344,225.055,0.0,229.425,36672.49,0.0,31117.314,3814.874,7260.7715,...,174279.189,198.0615,848.146,9553.9825,4162.1395,25861.884,0.0,0.0,16892.6505,1242258.0
CIR18-001_RE5_01_35014.mzXML,49058.4715,14758.6285,310.98,209.548,855.24,53242.5165,0.0,49483.018,2854.322,21662.63,...,323032.556,158.2675,675.497,12280.7645,5374.9315,34830.8835,52.608,76.248,16221.582,417415.1
CIR20-002_RD11_01_35008.mzXML,11790.872,9603.699,235.64,102.789,365.396,35476.7245,0.0,89133.203,1166.144,14864.368,...,283372.047,120.8235,897.9275,34665.7095,10739.628,7761.276,76.37,312.235,21897.082,585523.2


In [8]:
#combined feature table from old analysis
#its index (the sample IDs) is what the next cell uses to select the matching serum samples
comb_samp = pd.read_csv('../old_analyses/combined_analysis/multiomic_ml_analysis/fecal-combined-ft.tsv',sep='\t',index_col=0)
comb_samp.head(3)

Unnamed: 0,fb758530086adf438d5ebe8f83847f2d,0ba75f593218ef056eefaeab285ae839,cdf8d23687cfc324485e2bd3e0094ff2,d2ef3361261cc1d835ad59394bfe161c,4deeef4aa3231abf469afec860b7b1ea,f2f3dfa3c15062c6cc6fdda001f9ab52,db77705855bc4611e9c4fedca4b4f926,c0095c69fee5c767903814b366fcb860,0366e39a2c8c5aeaa60e42839a8b33e9,31b4a92e41933b13e3182509ebcd71e6,...,TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGATAGGTCAGTCTGTCTTAAAAGTTCGGGGCTTAACCCCGTGATGGGATGGAAACTGCCAATCTAGAGTATCGGAGAGGAAAGTGGAATTCCTAGT,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTTAAGCAAGTCTGAAGTGAAAGCCCGGGGCTCAACCCCGGTACTGCTTTGGAAACTGTTTGACTTGAGTGCAGGAGAGGTAAGTGGAATTCCTAG,TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGCCGGCAAGTCAGATGTGAAATCTGGAGGCTTAACCTCCAAACTGCATTTGAAACTGTAGGTCTTGAGTACCGGAGAGGTTATCGGAATTCCTTG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAAGAGCAAGTCTGATGTGAAAGGCTGGGGCTTAACCCCAGGACTGCATTGGAAACTGTTTTTCTAGAGTGCCGGAGAGGTAAGCGGAATTCCTAG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,TACGTAGGTGGCAAGCGTTATCCGGAATCATTGGGCGTAAAGGGTGCGTAGGTGGCGTACTAAGTCTGTAGTAAAAGGCAATGGCTCAACCATTGTAAGCTATGGAAACTGGTATGCTGGAGTGCAGAAGAGGGCGATGGAATTCCATGT,TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCGGCAGAGGTCTGTGGTGAAAGACTGAAGCTTAACTTCAGTAAGCCATAGAAACCGGGCTGCTAGAGTGCAGGAGAGGATCGTGGAATTCCATGT,AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCATGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGG,TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATGTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAAGCAAGTCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGCTTTGGAAACTGTTTTGCTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAG
TW-CO-002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,14.0,0.0,19.0,10.0,0.0,0.0
TW-BT-001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0
TW-CA-002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,3.0,0.0,0.0


In [9]:
#change index to CIR#-00# or TW-XX-00#
#(strip the run/file suffix, e.g. 'CIR16-001_RE7_01_35016.mzXML' -> 'CIR16-001')
serum_copy = serum_tablepd.copy()
serum_idx = list(serum_copy.index)
#split('_', 1)[0] keeps the whole ID when it contains no underscore; the previous
#idx[:idx.find('_')] silently chopped the last character in that case (find() returns -1)
serum_idx_adj = [idx.split('_', 1)[0] for idx in serum_idx]
serum_copy.index = serum_idx_adj
#select the serum rows by the sample IDs of the combined table
serum_filt = serum_copy.loc[comb_samp.index]
print(serum_filt.shape)
serum_filt.head(3)

(173, 1847)


Unnamed: 0,0004069a1f2c3f2b14511a4f367e485d,0013ada35a72573fde974722d6f46269,0074f0d3d22016bcdf28ee53c8a5062b,012ade16945709e69650957b8fdeb021,015bcc773d2fe77d5ddf3533aff00a9f,0171c4a4201b9604da3efc9305fc5264,017460a73612042af020d72a944567b4,01809bb355414a0015fd764cba9c5014,01a15bb425333e9bb45a57970a4fd744,01f940cc9606a206fc0f5f4be686142a,...,fe8cbb81d1abec83ba4cc2e9f82dd552,febe3e034a4a2cb784dec089610dd34c,fecd9732b400a12637130cb39f076b17,ff0f37b5dc7b583b0990818a3ca3bd85,ff47eed94e12e32213900023846a2b25,fface4fa9e4315823b284edf46814f27,ffc858cfa01fda2a2b2c0a1344a0b87a,ffe5d35f6eb7b6d40c0b901f6576ca3c,ffeb115b9cba8e7fde3b05c27b47a913,fff8ef7ac71077d6ddc2c538ad270385
TW-CO-002,55809.107,16961.251,76560.9505,8627.2465,9663.3085,173.5485,409293.0,16418.4475,62555.667,0.0,...,0.0,25607.2775,8736.1815,2628.3335,8261.911,115.805,200.1885,1438.5715,32596.2895,0.0
TW-BT-001,141374.9595,37664.9385,51482.278,42985.9195,944092.969,349.6905,3351079.0,29538.136,87325.661,0.0,...,0.0,53155.7155,22630.8375,5028.326,5144.877,219.7935,140.6745,2776.502,212928.942,0.0
TW-CA-002,14916.993,14055.226,40373.087,25469.6935,152745.316,169.015,683127.4,42557.0155,70843.1185,318.784,...,0.0,59003.971,29346.5395,2493.696,1345.321,0.0,2407450.0,7916.8855,22446.481,0.0


In [10]:
#there are more than 171 samples because some sample IDs occur more than once;
#keep=False flags every copy of a duplicated ID, so all copies end up in the drop list
dup = serum_filt.index.duplicated(keep=False)
#CIR4-001 is missing BMI and FS-CIR2-002 is missing feature data, so both are dropped as well
to_drop = serum_filt.index[dup].tolist() + ['CIR4-001', 'FS-CIR2-002']
to_drop

['TW-BI-001', 'TW-BI-001', 'TW-DI-002', 'TW-DI-002', 'CIR4-001', 'FS-CIR2-002']

In [11]:
#remove every row whose sample ID is listed in to_drop
serum_filt_drop = serum_filt.drop(to_drop)
serum_filt_drop.shape

(167, 1847)

**Final number of samples for all feature tables is 167 total samples**

In [12]:
#save the matched hashed table (samples as rows, features as columns)
serum_filt_drop.to_csv('./feature_tables/serum-ft-hashed-matched.tsv',sep='\t')

In [13]:
#save the transposed table as well (features as rows, samples as columns)
serum_filt_drop.T.to_csv('./feature_tables/serum-ft-hashed-matched_T.tsv',sep='\t')

## Adding Songbird Train/Test Column To Metadata
Before beginning to work on the machine learning analysis for the serum data, we need to build a Songbird model for the serum data. Adding a Train/Test column to the metadata makes the model building less randomized. We are interested in comparing samples that are positive and negative for advanced fibrosis (AF), so the Train set will contain 10 AF-positive and 10 AF-negative samples, and the rest of the samples will be Test.

In [15]:
#load the matched metadata (this rebinds meta, which previously held the full metadata)
meta = pd.read_csv('../metadata-matched.tsv',sep='\t',index_col=0)
meta.head(3)

Unnamed: 0,ATTRIBUTE_AgeInYears,ATTRIBUTE_BarcodeSequence,ATTRIBUTE_BiologicalSex,ATTRIBUTE_ChromatographyAndPhase,ATTRIBUTE_ComorbidityListDOIDIndex,ATTRIBUTE_Country,ATTRIBUTE_DOIDCommonName,ATTRIBUTE_DOIDOntologyIndex,ATTRIBUTE_DepthorAltitudeMeters,ATTRIBUTE_Description,...,ATTRIBUTE_waist_circumference_cm,ATTRIBUTE_water_lot,ATTRIBUTE_wbc,ATTRIBUTE_weight,ATTRIBUTE_weight_units,ATTRIBUTE_well_description,ATTRIBUTE_well_id,ATTRIBUTE_zygocity,ATTRIBUTE_zygoticity,filename
TW-CO-002,54,TCATGGCCTCCG,male,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,98,RNBF9925,8.0,80.6,kg,Loomba_Twin_Plate_2_TW.CO.002_E2,E2,2,2.0,TW-CO-002_RC4_01_35874.mzXML
TW-BT-001,49,CACACAAAGTCA,female,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,92,RNBF9925,8.5,80.8,kg,Loomba_Twin_Plate_2_TW.BT.001_E10,E10,1,1.0,TW-BT-001_RH12_01_35942.mzXML
TW-CA-002,61,TCCGGCGGGCAA,male,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,115,RNBF9925,6.2,99.7,kg,Loomba_Twin_Plate_2_TW.CA.002_B10,B10,1,1.0,TW-CA-002_RH6_01_35936.mzXML


In [21]:
#going through the advanced-fibrosis labels in table order and creating the train/test column:
#the first N_TRAIN_PER_CLASS positive ('1') and the first N_TRAIN_PER_CLASS negative ('0')
#samples become 'Train'; every other sample (including any other label value) becomes 'Test'
N_TRAIN_PER_CLASS = 10
pos_ctr = 0
neg_ctr = 0
train_test = []
#read the label column directly: meta.loc[idx][col] built a whole row per sample and fails
#with an ambiguous-truth-value error as soon as an index label is duplicated
#NOTE(review): labels are compared as strings; if the column were parsed as numbers nothing would match
for adv_fibrosis in meta['ATTRIBUTE_adv_fibrosis']:
    if adv_fibrosis == '1' and pos_ctr < N_TRAIN_PER_CLASS:
        train_test.append('Train')
        pos_ctr += 1
    elif adv_fibrosis == '0' and neg_ctr < N_TRAIN_PER_CLASS:
        train_test.append('Train')
        neg_ctr += 1
    else:
        train_test.append('Test')

In [22]:
#attach the labels to a copy so the loaded metadata frame itself stays unchanged
meta_copy = meta.copy()
#train_test was built by iterating meta.index, so it lines up row for row
meta_copy['sb_train_test'] = train_test
meta_copy.head(3)

Unnamed: 0,ATTRIBUTE_AgeInYears,ATTRIBUTE_BarcodeSequence,ATTRIBUTE_BiologicalSex,ATTRIBUTE_ChromatographyAndPhase,ATTRIBUTE_ComorbidityListDOIDIndex,ATTRIBUTE_Country,ATTRIBUTE_DOIDCommonName,ATTRIBUTE_DOIDOntologyIndex,ATTRIBUTE_DepthorAltitudeMeters,ATTRIBUTE_Description,...,ATTRIBUTE_water_lot,ATTRIBUTE_wbc,ATTRIBUTE_weight,ATTRIBUTE_weight_units,ATTRIBUTE_well_description,ATTRIBUTE_well_id,ATTRIBUTE_zygocity,ATTRIBUTE_zygoticity,filename,sb_train_test
TW-CO-002,54,TCATGGCCTCCG,male,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,RNBF9925,8.0,80.6,kg,Loomba_Twin_Plate_2_TW.CO.002_E2,E2,2,2.0,TW-CO-002_RC4_01_35874.mzXML,Train
TW-BT-001,49,CACACAAAGTCA,female,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,RNBF9925,8.5,80.8,kg,Loomba_Twin_Plate_2_TW.BT.001_E10,E10,1,1.0,TW-BT-001_RH12_01_35942.mzXML,Train
TW-CA-002,61,TCCGGCGGGCAA,male,not applicable,not applicable,United States of America,not applicable,not applicable,200,loomba_nobel_liver,...,RNBF9925,6.2,99.7,kg,Loomba_Twin_Plate_2_TW.CA.002_B10,B10,1,1.0,TW-CA-002_RH6_01_35936.mzXML,Train


In [23]:
#write the metadata with the added sb_train_test column to its own file
meta_copy.to_csv('../metadata-matched-sb.tsv',sep='\t')

## Generating Scaled Feature Table
For machine learning analysis, need a scaled table. To do this I will be using sklearn's StandardScaler to generate a serum hashed, matched, scaled table.

In [2]:
#reading in the hashed, matched feature table (samples as rows, features as columns)
serum_table = pd.read_csv('./feature_tables/serum-ft-hashed-matched.tsv',sep='\t',index_col=0)
serum_table.head(3)

Unnamed: 0,0004069a1f2c3f2b14511a4f367e485d,0013ada35a72573fde974722d6f46269,0074f0d3d22016bcdf28ee53c8a5062b,012ade16945709e69650957b8fdeb021,015bcc773d2fe77d5ddf3533aff00a9f,0171c4a4201b9604da3efc9305fc5264,017460a73612042af020d72a944567b4,01809bb355414a0015fd764cba9c5014,01a15bb425333e9bb45a57970a4fd744,01f940cc9606a206fc0f5f4be686142a,...,fe8cbb81d1abec83ba4cc2e9f82dd552,febe3e034a4a2cb784dec089610dd34c,fecd9732b400a12637130cb39f076b17,ff0f37b5dc7b583b0990818a3ca3bd85,ff47eed94e12e32213900023846a2b25,fface4fa9e4315823b284edf46814f27,ffc858cfa01fda2a2b2c0a1344a0b87a,ffe5d35f6eb7b6d40c0b901f6576ca3c,ffeb115b9cba8e7fde3b05c27b47a913,fff8ef7ac71077d6ddc2c538ad270385
TW-CO-002,55809.107,16961.251,76560.9505,8627.2465,9663.3085,173.5485,409293.0,16418.4475,62555.667,0.0,...,0.0,25607.2775,8736.1815,2628.3335,8261.911,115.805,200.1885,1438.5715,32596.2895,0.0
TW-BT-001,141374.9595,37664.9385,51482.278,42985.9195,944092.969,349.6905,3351079.0,29538.136,87325.661,0.0,...,0.0,53155.7155,22630.8375,5028.326,5144.877,219.7935,140.6745,2776.502,212928.942,0.0
TW-CA-002,14916.993,14055.226,40373.087,25469.6935,152745.316,169.015,683127.4,42557.0155,70843.1185,318.784,...,0.0,59003.971,29346.5395,2493.696,1345.321,0.0,2407450.0,7916.8855,22446.481,0.0


In [4]:
#using StandardScaler from sklearn to normalize our data
#(StandardScaler standardizes each column, i.e. each feature, to zero mean and unit variance)
scaler = StandardScaler()
data = serum_table.values
data_scaled = scaler.fit_transform(data)
#fit_transform returns a bare array, so re-attach the feature IDs (columns) and sample IDs (index)
table_scaled = pd.DataFrame(data_scaled, columns=serum_table.columns,index=serum_table.index)
print(table_scaled.shape)
table_scaled.head(3)

(167, 1847)


Unnamed: 0,0004069a1f2c3f2b14511a4f367e485d,0013ada35a72573fde974722d6f46269,0074f0d3d22016bcdf28ee53c8a5062b,012ade16945709e69650957b8fdeb021,015bcc773d2fe77d5ddf3533aff00a9f,0171c4a4201b9604da3efc9305fc5264,017460a73612042af020d72a944567b4,01809bb355414a0015fd764cba9c5014,01a15bb425333e9bb45a57970a4fd744,01f940cc9606a206fc0f5f4be686142a,...,fe8cbb81d1abec83ba4cc2e9f82dd552,febe3e034a4a2cb784dec089610dd34c,fecd9732b400a12637130cb39f076b17,ff0f37b5dc7b583b0990818a3ca3bd85,ff47eed94e12e32213900023846a2b25,fface4fa9e4315823b284edf46814f27,ffc858cfa01fda2a2b2c0a1344a0b87a,ffe5d35f6eb7b6d40c0b901f6576ca3c,ffeb115b9cba8e7fde3b05c27b47a913,fff8ef7ac71077d6ddc2c538ad270385
TW-CO-002,1.057551,0.71369,0.101371,0.133053,-0.286215,-0.318617,0.065534,-0.279881,0.117167,-0.301339,...,-0.719733,0.180132,-0.325997,-0.386096,-0.09569,-0.185462,-0.174831,-0.07225,0.065709,-0.985989
TW-BT-001,4.324877,3.21735,-0.021519,3.134937,6.154245,-0.31619,4.300775,0.274048,0.371661,-0.301339,...,-0.719733,1.366283,0.741181,-0.301068,-0.13815,-0.185047,-0.174998,0.048797,7.210811,-0.985989
TW-CA-002,-0.503911,0.362269,-0.075956,1.604561,0.699963,-0.31868,0.459769,0.823721,0.202315,-0.280759,...,-0.719733,1.618091,1.25698,-0.390866,-0.189908,-0.185924,6.574866,0.513861,-0.336444,-0.985989


In [5]:
#importing table back to qiime artifact
#NOTE(review): the standardized values include negatives but are imported as
#FeatureTable[Frequency] -- confirm that downstream steps accept this semantic type
qiime_scaled_table = qiime2.Artifact.import_data("FeatureTable[Frequency]", table_scaled)
#NOTE(review): saved as '...-scale.qza' while the .tsv export uses '...-scaled.tsv'
qiime_scaled_table.save('./feature_tables/serum-ft-hashed-matched-scale.qza')

'./feature_tables/serum-ft-hashed-matched-scale.qza'

In [6]:
#also export the scaled feature table as .tsv (samples as rows, features as columns)
table_scaled.to_csv('./feature_tables/serum-ft-hashed-matched-scaled.tsv',sep='\t')

## Generating CLR transformed Feature Table
When performing machine learning analysis, using a CLR transformed table is usually a good starting point. In the previous analysis, the scaled table provided stronger performance than the CLR table, but we will create a CLR table to be certain.

In [8]:
#reading in the hashed, matched feature table again (un-scaled input for the CLR transform)
serum_table = pd.read_csv('./feature_tables/serum-ft-hashed-matched.tsv',sep='\t',index_col=0)
serum_table.head(3)

Unnamed: 0,0004069a1f2c3f2b14511a4f367e485d,0013ada35a72573fde974722d6f46269,0074f0d3d22016bcdf28ee53c8a5062b,012ade16945709e69650957b8fdeb021,015bcc773d2fe77d5ddf3533aff00a9f,0171c4a4201b9604da3efc9305fc5264,017460a73612042af020d72a944567b4,01809bb355414a0015fd764cba9c5014,01a15bb425333e9bb45a57970a4fd744,01f940cc9606a206fc0f5f4be686142a,...,fe8cbb81d1abec83ba4cc2e9f82dd552,febe3e034a4a2cb784dec089610dd34c,fecd9732b400a12637130cb39f076b17,ff0f37b5dc7b583b0990818a3ca3bd85,ff47eed94e12e32213900023846a2b25,fface4fa9e4315823b284edf46814f27,ffc858cfa01fda2a2b2c0a1344a0b87a,ffe5d35f6eb7b6d40c0b901f6576ca3c,ffeb115b9cba8e7fde3b05c27b47a913,fff8ef7ac71077d6ddc2c538ad270385
TW-CO-002,55809.107,16961.251,76560.9505,8627.2465,9663.3085,173.5485,409293.0,16418.4475,62555.667,0.0,...,0.0,25607.2775,8736.1815,2628.3335,8261.911,115.805,200.1885,1438.5715,32596.2895,0.0
TW-BT-001,141374.9595,37664.9385,51482.278,42985.9195,944092.969,349.6905,3351079.0,29538.136,87325.661,0.0,...,0.0,53155.7155,22630.8375,5028.326,5144.877,219.7935,140.6745,2776.502,212928.942,0.0
TW-CA-002,14916.993,14055.226,40373.087,25469.6935,152745.316,169.015,683127.4,42557.0155,70843.1185,318.784,...,0.0,59003.971,29346.5395,2493.696,1345.321,0.0,2407450.0,7916.8855,22446.481,0.0


In [9]:
data = serum_table.values
#the table contains zeros, which have no logarithm; multiplicative_replacement swaps them
#for small positive values before the centered log-ratio (CLR) transform is applied
data_mr = composition.multiplicative_replacement(data)
data_clr = composition.clr(data_mr)
#the skbio functions return bare arrays, so re-attach the feature IDs (columns) and sample IDs (index)
table_clr = pd.DataFrame(data_clr, columns=serum_table.columns,index=serum_table.index)
print(table_clr.shape)
table_clr.head(3)

(167, 1847)


Unnamed: 0,0004069a1f2c3f2b14511a4f367e485d,0013ada35a72573fde974722d6f46269,0074f0d3d22016bcdf28ee53c8a5062b,012ade16945709e69650957b8fdeb021,015bcc773d2fe77d5ddf3533aff00a9f,0171c4a4201b9604da3efc9305fc5264,017460a73612042af020d72a944567b4,01809bb355414a0015fd764cba9c5014,01a15bb425333e9bb45a57970a4fd744,01f940cc9606a206fc0f5f4be686142a,...,fe8cbb81d1abec83ba4cc2e9f82dd552,febe3e034a4a2cb784dec089610dd34c,fecd9732b400a12637130cb39f076b17,ff0f37b5dc7b583b0990818a3ca3bd85,ff47eed94e12e32213900023846a2b25,fface4fa9e4315823b284edf46814f27,ffc858cfa01fda2a2b2c0a1344a0b87a,ffe5d35f6eb7b6d40c0b901f6576ca3c,ffeb115b9cba8e7fde3b05c27b47a913,fff8ef7ac71077d6ddc2c538ad270385
TW-CO-002,2.419473,1.228468,2.735623,0.552462,0.665872,-3.353762,4.411967,1.195942,2.533593,-4.500389,...,-4.500389,1.640413,0.565009,-0.636114,0.509192,-3.758311,-3.21096,-1.238813,1.881735,-4.500389
TW-BT-001,2.631422,1.308736,1.621244,1.440879,4.530231,-3.3707,5.797044,1.065689,2.149651,-4.641508,...,-4.641508,1.653232,0.79932,-0.704906,-0.681992,-3.83506,-4.2813,-1.298801,3.040965,-4.641508
TW-CA-002,0.633377,0.57387,1.629039,1.168365,2.959648,-3.846892,4.457557,1.681721,2.191344,-3.212365,...,-4.647163,2.008481,1.310051,-1.155358,-1.772491,-4.647163,5.717199,-0.000126,1.04201,-4.647163


In [10]:
#importing table back to qiime artifact
#NOTE(review): CLR values include negatives but are imported as FeatureTable[Frequency] --
#confirm that downstream steps accept this semantic type
qiime_clr_table = qiime2.Artifact.import_data("FeatureTable[Frequency]", table_clr)
qiime_clr_table.save('./feature_tables/serum-ft-hashed-matched-clr.qza')
#also export the CLR feature table as .tsv
table_clr.to_csv('./feature_tables/serum-ft-hashed-matched-clr.tsv',sep='\t')

## Exporting Songbird Diff

In [12]:
#loading in the Songbird differentials (advanced-fibrosis model) and viewing them as a DataFrame
sb_diff = qiime2.Artifact.load('songbird_analysis/f_ATTRIBUTE_adv_fibrosis_dp_0.1_lr_0.001_e_5000/differentials.qza')
sb_diffpd = sb_diff.view(pd.DataFrame)
sb_diffpd.head(3)

Unnamed: 0_level_0,Intercept,ATTRIBUTE_adv_fibrosis[T.1],ATTRIBUTE_adv_fibrosis[T.Missing: Not collected]
featureid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0004069a1f2c3f2b14511a4f367e485d,2.434485,0.747122,-0.099946
0013ada35a72573fde974722d6f46269,0.065348,0.681179,5.4e-05
0074f0d3d22016bcdf28ee53c8a5062b,0.971285,-1.627832,5.4e-05


In [14]:
#export the differentials table as .tsv
sb_diffpd.to_csv('./songbird_analysis/serum-sb-differentials.tsv',sep='\t')