# Data Collection

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the sample-level meta information; its columns (see the summary below)
# are time, GA, SampleID, randPerson and data.
meta = pd.read_csv('data/challenge-meta-information.csv')
# Keep the column index of the meta table for later reference.
meta_columns = meta.columns
# Summarise every column, numeric and non-numeric alike.
meta.describe(include='all')

Unnamed: 0,time,GA,SampleID,randPerson,data
count,68.0,68.0,68,68,68
unique,,,68,17,2
top,,,R3,X,train
freq,,,1,4,56
mean,2.5,13.451681,,,
std,1.126347,10.001572,,,
min,1.0,-3.428571,,,
25%,1.75,6.642857,,,
50%,2.5,13.5,,,
75%,3.25,20.25,,,


In [3]:
# Preview the first rows of the meta table (head() defaults to five rows).
meta.head()

Unnamed: 0,time,GA,SampleID,randPerson,data
0,1,11.0,K1,K,train
1,1,11.0,J1,J,train
2,1,11.0,P1,P,train
3,1,11.0,L1,L,train
4,1,11.0,H1,H,train


The training data consists of 14 women, while the test data consists of 3 women.

SampleID is unique for all entries, and we'll be using the sample ID as the identifier. 

Each person is identified using the randPerson column. 

## Load the rest of the datasets

In [4]:
# Load the immunome table; SampleID is the key column used for merging later.
immunome = pd.read_csv('data/train/immunome.csv')
# Keep the column index of the immunome table for later reference.
immunome_columns = immunome.columns
# Show a single row as a quick look at the available columns.
immunome.head(1)

Unnamed: 0,SampleID,Bcells,CD16+CD56-NKcells,CD4+Tcells_mem,CD4+Tcells_naive,CD4+Tcells,CD45RA+Tregs,CD45RA-Tregs,CD56+CD16-NKcells,CD7+NKcells,...,M-MDSC_STAT5_Unstim,mDCs_STAT5_Unstim,ncMCs_STAT5_Unstim,pDCs_STAT5_Unstim,Tbet+CD4+Tcells_mem_STAT5_Unstim,Tbet+CD4+Tcells_naive_STAT5_Unstim,Tbet+CD8+Tcells_mem_STAT5_Unstim,Tbet+CD8+Tcells_naive_STAT5_Unstim,TCRgd+Tcells_STAT5_Unstim,Tregs_STAT5_Unstim
0,K4,0.053164,0.054978,0.297875,0.136289,0.445832,0.00257,0.013848,0.007052,0.070836,...,0.998954,0.953637,1.082629,0.80861,0.504269,0.757424,0.462045,0.454665,0.443859,0.529431


In [5]:
# Load the serum Luminex table; SampleID is the key column used for merging later.
serumLuminex = pd.read_csv('data/train/serumLuminex.csv')
# Keep the column index of the serum Luminex table for later reference.
serumLuminex_columns = serumLuminex.columns
# Show a single row as a quick look at the available columns.
serumLuminex.head(1)

Unnamed: 0,SampleID,serum-IL17F,serum-FASL,serum-TGFA,serum-MIP1A,serum-SDF1A,serum-IL27,serum-LIF,serum-IL1B,serum-IL2,...,serum-ICAM1,serum-VCAM1,serum-FGFB,serum-IL22,serum-PDGFBB,serum-VEGF,serum-LEPTIN,serum-PAI1,serum-CD40L,serum-ENA78
0,K4,84.5,27.0,65.25,46.5,226.0,36.5,52.0,33.5,38.0,...,1679.0,16645.0,50.5,186.25,76.5,149.0,1744.0,7855.25,472.5,539.25


In [6]:
# Load the plasma Luminex table; SampleID is the key column used for merging later.
plasmaLuminex = pd.read_csv('data/train/plasmaLuminex.csv')
# Keep the column index of the plasma Luminex table for later reference.
plasmaLuminex_columns = plasmaLuminex.columns
# Show a single row as a quick look at the available columns.
plasmaLuminex.head(1)

Unnamed: 0,SampleID,plasma-IL17F,plasma-FASL,plasma-TGFA,plasma-MIP1A,plasma-SDF1A,plasma-IL27,plasma-LIF,plasma-IL1B,plasma-IL2,...,plasma-ICAM1,plasma-VCAM1,plasma-FGFB,plasma-IL22,plasma-PDGFBB,plasma-VEGF,plasma-LEPTIN,plasma-PAI1,plasma-CD40L,plasma-ENA78
0,K4,59.0,82.0,27.75,46.0,191.25,70.5,35.5,28.0,31.0,...,1578.5,15781.0,50.5,43.0,62.5,164.0,1533.5,3707.5,115.75,213.5


In [7]:
# Load the plasma Somalogic table; SampleID is the key column used for merging later.
plasmaSomalogic = pd.read_csv('data/train/plasmaSomalogic.csv')
# Keep the column index of the plasma Somalogic table for later reference.
plasmaSomalogic_columns = plasmaSomalogic.columns
# Show a single row as a quick look at the available columns.
plasmaSomalogic.head(1)

Unnamed: 0,SampleID,STUB1,CEBPB,ENO2,PIAS4,IL10RA,STAT3,IRF1,MCL1,OAS1,...,UBE2G2,TAGLN2,ATP5O,POMC.2,CRYZL1,SERPINF1,CTSF,FTCD,USP25,PLXNB2
0,K1,1084.0,396.2,7065.9,490.7,702.9,761.0,1003.2,443.4,140.7,...,4804.4,2233.0,3610.9,365.4,151.4,37885.8,1479.1,3261.8,561.3,3227.0


# Data Preparation

Merge the immunome, serumLuminex, plasmaLuminex and plasmaSomalogic data with the meta information.

In [8]:
# Merge order: meta -> immunome -> serumLuminex -> plasmaLuminex -> plasmaSomalogic.
# Step 1: join the immunome columns onto the meta rows by SampleID.
# Clashing non-key column names get '_meta' (left) / '_immunome' (right).
merged_df = meta.merge(
    immunome,
    on='SampleID',
    suffixes=('_meta', '_immunome'),
)

In [9]:
# Merge order: meta -> immunome -> serumLuminex -> plasmaLuminex -> plasmaSomalogic.
# Step 2: join the serum Luminex columns by SampleID; clashing right-hand
# column names are tagged with '_serumLuminex'.
merged_df = merged_df.merge(
    serumLuminex,
    on='SampleID',
    suffixes=('', '_serumLuminex'),
)

In [10]:
# Step 3: join the plasma Luminex columns by SampleID; clashing right-hand
# column names are tagged with '_plasmaLuminex'.
merged_df = merged_df.merge(
    plasmaLuminex,
    on='SampleID',
    suffixes=('', '_plasmaLuminex'),
)

In [11]:
# Step 4: join the plasma Somalogic columns by SampleID; clashing right-hand
# column names are tagged with '_plasmaSomalogic'.
merged_df = merged_df.merge(
    plasmaSomalogic,
    on='SampleID',
    suffixes=('', '_plasmaSomalogic'),
)

In [12]:
# Preview the merged table: meta columns first, followed by the joined feature columns.
merged_df.head()

Unnamed: 0,time,GA,SampleID,randPerson,data,Bcells,CD16+CD56-NKcells,CD4+Tcells_mem,CD4+Tcells_naive,CD4+Tcells,...,UBE2G2,TAGLN2,ATP5O,POMC.2,CRYZL1,SERPINF1,CTSF,FTCD,USP25,PLXNB2
0,1,11.0,K1,K,train,0.052857,0.069794,0.279917,0.14035,0.430839,...,4804.4,2233.0,3610.9,365.4,151.4,37885.8,1479.1,3261.8,561.3,3227.0
1,1,11.0,J1,J,train,0.108153,0.006318,0.211465,0.098454,0.313554,...,4261.9,1804.6,1470.6,410.8,163.0,38938.3,1170.1,1036.8,552.8,3457.1
2,1,11.0,P1,P,train,0.133483,0.045048,0.132603,0.287326,0.428456,...,4017.6,2057.7,2331.3,567.5,178.1,34359.5,1427.1,3149.2,597.0,3370.9
3,1,11.0,L1,L,train,0.100676,0.037142,0.256169,0.117621,0.380101,...,4489.4,2653.4,2189.1,569.8,176.3,35281.1,1944.2,3266.7,615.2,2916.1
4,1,11.0,H1,H,train,0.099235,0.127228,0.193829,0.074824,0.276794,...,4269.4,2446.8,2560.7,501.6,169.3,33864.2,2175.2,948.5,552.4,3841.8


We now need to split the dataset into train and test sets, using the `data` column to tell them apart. The results are saved as train_sc01.csv and test_sc01.csv.

In [13]:
# Keep only the rows flagged as 'train' in the `data` column.
train_sc01 = merged_df.loc[merged_df['data'].eq('train')]
train_sc01

Unnamed: 0,time,GA,SampleID,randPerson,data,Bcells,CD16+CD56-NKcells,CD4+Tcells_mem,CD4+Tcells_naive,CD4+Tcells,...,UBE2G2,TAGLN2,ATP5O,POMC.2,CRYZL1,SERPINF1,CTSF,FTCD,USP25,PLXNB2
0,1,11.0,K1,K,train,0.052857,0.069794,0.279917,0.14035,0.430839,...,4804.4,2233.0,3610.9,365.4,151.4,37885.8,1479.1,3261.8,561.3,3227.0
1,1,11.0,J1,J,train,0.108153,0.006318,0.211465,0.098454,0.313554,...,4261.9,1804.6,1470.6,410.8,163.0,38938.3,1170.1,1036.8,552.8,3457.1
2,1,11.0,P1,P,train,0.133483,0.045048,0.132603,0.287326,0.428456,...,4017.6,2057.7,2331.3,567.5,178.1,34359.5,1427.1,3149.2,597.0,3370.9
3,1,11.0,L1,L,train,0.100676,0.037142,0.256169,0.117621,0.380101,...,4489.4,2653.4,2189.1,569.8,176.3,35281.1,1944.2,3266.7,615.2,2916.1
4,1,11.0,H1,H,train,0.099235,0.127228,0.193829,0.074824,0.276794,...,4269.4,2446.8,2560.7,501.6,169.3,33864.2,2175.2,948.5,552.4,3841.8
5,1,8.0,G1,G,train,0.170811,0.009526,0.279503,0.113319,0.402652,...,4319.846,2015.892,2432.463,472.327,179.804,37404.643,1708.804,3245.286,691.279,3513.018
6,1,11.0,S1,S,train,0.059995,0.054639,0.217634,0.187234,0.416301,...,3342.3,1838.2,2543.4,645.8,174.4,26741.9,1768.2,658.1,685.9,3359.1
7,1,11.0,O1,O,train,0.054545,0.069564,0.216917,0.18403,0.408359,...,3841.0,1957.7,1343.5,1788.7,162.9,33722.6,1809.0,8026.4,666.2,4300.8
8,1,11.0,R1,R,train,0.094611,0.017908,0.185484,0.261629,0.454951,...,4929.3,2030.7,2140.9,450.7,155.9,44244.4,2526.4,9976.9,712.8,3537.8
9,1,10.0,X1,X,train,0.061253,0.041134,0.236547,0.237067,0.496289,...,3622.7,2625.1,3154.4,425.6,164.2,40142.8,1417.0,2424.4,725.2,3066.3


In [14]:
# Keep only the rows flagged as 'test' in the `data` column.
test_sc01 = merged_df.loc[merged_df['data'].eq('test')]
test_sc01

Unnamed: 0,time,GA,SampleID,randPerson,data,Bcells,CD16+CD56-NKcells,CD4+Tcells_mem,CD4+Tcells_naive,CD4+Tcells,...,UBE2G2,TAGLN2,ATP5O,POMC.2,CRYZL1,SERPINF1,CTSF,FTCD,USP25,PLXNB2
56,1,11.0,I1,I,test,0.119092,0.060294,0.277815,0.153793,0.440762,...,5023.5,2108.8,2679.9,370.1,191.8,32384.0,1557.6,1950.0,701.3,3179.1
57,1,11.0,W1,W,test,0.097618,0.016901,0.193449,0.169328,0.369726,...,5405.0,2012.5,2429.8,445.7,190.9,35255.4,1337.3,1954.9,801.2,4202.6
58,1,11.0,B1,B,test,0.052293,0.010642,0.217094,0.096288,0.32342,...,3178.6,1902.0,2204.7,430.8,173.0,34599.0,1460.3,792.4,663.9,3995.9
59,2,15.0,I2,I,test,0.116981,0.048495,0.241318,0.180908,0.432021,...,5291.9,2005.6,2167.6,378.6,182.9,32540.4,1556.9,3606.4,794.1,3446.4
60,2,18.0,W2,W,test,0.102321,0.014489,0.177417,0.181568,0.366446,...,5548.2,1930.1,1438.7,437.1,186.5,35539.9,1482.3,5026.0,689.4,4143.3
61,2,16.0,B2,B,test,0.039824,0.021531,0.209109,0.118017,0.337753,...,3220.5,2429.8,2628.0,380.3,159.8,37022.8,1597.1,1215.2,612.1,4019.3
62,3,25.0,I3,I,test,0.096353,0.068407,0.257616,0.173699,0.440042,...,4705.3,1978.7,3022.0,372.3,210.9,39139.5,1488.9,7332.9,733.8,5611.4
63,3,27.0,W3,W,test,0.077124,0.014504,0.192619,0.189237,0.389797,...,5879.3,2022.2,1378.3,483.0,202.7,38641.8,1933.5,2582.9,731.4,6205.2
64,3,27.0,B3,B,test,0.028,0.01842,0.222746,0.149671,0.382788,...,3818.8,1824.3,3182.8,369.5,185.9,43374.8,1891.1,4838.8,770.9,7005.5
65,4,2.571429,I4,I,test,0.137333,0.061311,0.274281,0.139593,0.422238,...,3353.9,2562.0,3297.2,350.5,197.7,34493.7,1028.5,2729.4,810.2,2615.5


In [15]:
# Persist both splits as CSV (relative paths), with a header row and without
# the DataFrame index.
train_sc01.to_csv('train_sc01.csv', header=True, index=False)
test_sc01.to_csv('test_sc01.csv', header=True, index=False)

Merge cell-free RNA, metabolome and microbiome data with meta information. 

In [16]:
# Load the cell-free RNA table; SampleID is the key column used for merging later.
cf_rna = pd.read_csv('data/train/cfRNA.csv')
# Keep the column index of the cfRNA table itself. This must come from cf_rna,
# not immunome, otherwise the variable silently holds the wrong feature names.
cf_rna_columns = cf_rna.columns
# Show a single row as a quick look at the available columns.
cf_rna.head(1)

Unnamed: 0,SampleID,C2orf76,ACTL10,CEP135,RP11-613M10.6,NDUFB5P1,MIIP,RP11-98I9.4,C20orf144,RP11-485G7.6,...,LL22NC03-86G7.1,DNAL4,PPM1F,CCDC94,CLN6,RP11-500C11.3,CTBP2P7,ITIH2,RBPMSLP,ABCA8
0,K1,0.312437,-1.89293e-16,28.217854,-5.273559e-18,-6.106227e-18,10.107555,0.070008,-3.885781e-16,-4.113376e-16,...,0.707989,4.504731,55.583978,14.308368,1.93494,-1.826317e-16,-2.797762e-16,-1.122435e-15,0,-1.474376e-15


In [17]:
# Load the metabolome table. It is decoded as cp1252 (Windows-1252);
# presumably the file contains characters that are not valid UTF-8 — TODO confirm.
metabolome = pd.read_csv('data/train/metabolome.csv', encoding = "cp1252")
# Keep the column index of the metabolome table for later reference.
metabolome_columns = metabolome.columns
# Show a single row as a quick look at the available columns.
metabolome.head(1)

Unnamed: 0,SampleID,N1-Methyl-2-pyridone-5-carboxamide|N1-Methyl-4-pyridone-3-carboxamide|N-[(Aminooxy)Carbonyl]Aniline,Barringtogenol C|Camelliagenin C|Ganoderiol H|ED-71,3beta-Acetoxy-11alpha-methoxy-12-ursen-28-oic acid,Basilimoside,(2s)-Pyrrolidin-2-Ylmethylamine,4-Methylpiperazin-1-Yl Carbonyl Group,Chlorpromazine|Chlorpromazine,"DG(15:0/18:4(6Z,9Z,12Z,15Z)/0:0)|DG(18:4(6Z,9Z,12Z,15Z)/15:0/0:0)|DG(15:0/0:0/18:4n3)","DG(15:0/18:3(6Z,9Z,12Z)/0:0)|DG(15:0/18:3(9Z,12Z,15Z)/0:0)|DG(18:3(6Z,9Z,12Z)/15:0/0:0)|DG(18:3(9Z,12Z,15Z)/15:0/0:0)|DG(15:0/0:0/18:3n6)|DG(15:0/0:0/18:3n3)|DG(14:1n5/0:0/18:2n6)|DG(14:1n5/0:0/20:2n6)",...,"PE(18:4(6Z,9Z,12Z,15Z)/P-18:1(11Z))|PE(18:4(6Z,9Z,12Z,15Z)/P-18:1(9Z))|PE(20:5(5Z,8Z,11Z,14Z,17Z)/P-16:0)|PE(P-16:0/20:5(5Z,8Z,11Z,14Z,17Z))|PE(P-18:1(11Z)/18:4(6Z,9Z,12Z,15Z))|PE(P-18:1(9Z)/18:4(6Z,9Z,12Z,15Z)).2",Glycerol triundecanoate.2,"DG(15:0/18:2(9Z,12Z)/0:0)|DG(18:2(9Z,12Z)/15:0/0:0)|DG(14:0/0:0/18:2n6)|DG(14:0/0:0/20:2n6).5",beta-Casomorphin (1-6),Glycerol triundecanoate.3,DG(15:0/18:1(11Z)/0:0)|DG(15:0/18:1(9Z)/0:0)|DG(18:1(11Z)/15:0/0:0)|DG(18:1(9Z)/15:0/0:0)|DG(15:0/0:0/18:1n7)|DG(15:0/0:0/18:1n9).1,Tetracosyl ferulate|Diisotridecyl phthalate|Ditridecyl phthalate.1,"PE(14:0/20:2(11Z,14Z))|PE(14:1(9Z)/20:1(11Z))|PE(16:0/18:2(9Z,12Z))|PE(16:1(9Z)/18:1(11Z))|PE(16:1(9Z)/18:1(9Z))|PE(18:1(11Z)/16:1(9Z))|PE(18:1(9Z)/16:1(9Z))|PE(18:2(9Z,12Z)/16:0)|PE(20:1(11Z)/14:1(9Z))|PE(20:2(11Z,14Z)/14:0).2",Isatoribine.2,Thelephoric acid|Triphenylantimony
0,K1,-0.103676,-0.107316,-0.114072,-0.123576,-0.13047,-0.129842,0.458028,-0.125868,-0.091913,...,-0.023383,0.058349,-0.110397,-0.129338,0.058349,-0.126018,-0.127577,0.471133,-0.082986,-0.111185


In [18]:
# Load the microbiome table; SampleID is the key column used for merging later.
microbiome = pd.read_csv('data/train/microbiome.csv')
# Keep the column index of the microbiome table for later reference.
microbiome_columns = microbiome.columns
# Show a single row as a quick look at the available columns.
microbiome.head(1)

Unnamed: 0,SampleID,VaginalSwab_Lactobacillus,Saliva_Lactobacillus,ToothGum_Lactobacillus,Stool_Lactobacillus,VaginalSwab_Lactobacillus.1,Saliva_Lactobacillus.1,ToothGum_Lactobacillus.1,Stool_Lactobacillus.1,VaginalSwab_Streptococcus,...,ToothGum_Azospirillum,Stool_Azospirillum,VaginalSwab_Sphingomonas.5,Saliva_Sphingomonas.5,ToothGum_Sphingomonas.5,Stool_Sphingomonas.5,VaginalSwab_Thalassospira.9,Saliva_Thalassospira.9,ToothGum_Thalassospira.9,Stool_Thalassospira.9
0,K1,103.491092,0.435833,0.004819,1.26631,103.491092,0.435833,0.004819,1.26631,-0.004111,...,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197


In [19]:
# Build the second merged table: meta -> cf_rna -> metabolome -> microbiome,
# all joined on SampleID.
# If a non-key column name exists on both sides of a merge, the right-hand
# column is tagged with the name of the table it came from. The cfRNA merge
# therefore uses '_cfRNA' (it was previously mislabelled '_immunome').
merged_df = meta.merge(cf_rna, on='SampleID', suffixes=('_meta', '_cfRNA'))
# Shape after the first merge, to confirm that no samples were lost.
print(merged_df.shape)
merged_df = merged_df.merge(metabolome, on='SampleID', suffixes=('', '_metabolome'))
merged_df = merged_df.merge(microbiome, on='SampleID', suffixes=('', '_microbiome'))

(68, 37280)


In [20]:
# Keep only the rows flagged as 'train' in the `data` column.
train_sc02 = merged_df.loc[merged_df['data'].eq('train')]
train_sc02

Unnamed: 0,time,GA,SampleID,randPerson,data,C2orf76,ACTL10,CEP135,RP11-613M10.6,NDUFB5P1,...,ToothGum_Azospirillum,Stool_Azospirillum,VaginalSwab_Sphingomonas.5,Saliva_Sphingomonas.5,ToothGum_Sphingomonas.5,Stool_Sphingomonas.5,VaginalSwab_Thalassospira.9,Saliva_Thalassospira.9,ToothGum_Thalassospira.9,Stool_Thalassospira.9
0,1,11.0,K1,K,train,0.312437,-1.89293e-16,28.217854,-5.273559e-18,-6.106227e-18,...,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197,-0.02197
1,1,11.0,J1,J,train,5.204209,1.734736,53.776824,0.0,0.0,...,-0.027296,-0.027296,-0.027296,-0.027296,-0.027296,-0.027296,-0.027296,-0.027296,-0.027296,-0.015173
2,1,11.0,P1,P,train,0.0,0.0,15.269087,0.0,0.0,...,-0.032342,-0.032342,-0.032342,-0.032342,-0.032342,-0.032342,-0.032342,-0.032342,-0.032342,-0.032342
3,1,11.0,L1,L,train,0.0,0.0,11.048308,0.0,0.0,...,-0.031554,-0.031554,-0.031554,-0.031554,-0.031554,-0.031554,-0.031554,-0.031554,-0.031554,-0.031554
4,1,11.0,H1,H,train,0.0,8.409156,67.273247,0.0,0.0,...,-0.029365,-0.029365,-0.029365,-0.029365,-0.029365,-0.029365,-0.029365,-0.029365,-0.029365,-0.018315
5,1,8.0,G1,G,train,2.126079,0.0,86.106183,0.0,0.0,...,-0.02475,-0.02475,-0.02475,-0.02475,-0.02475,-0.02475,-0.02475,-0.02475,-0.02475,-0.02475
6,1,11.0,S1,S,train,0.0,0.0,0.0,0.0,0.0,...,-0.041866,-0.041866,-0.041866,-0.041866,-0.041866,-0.041866,-0.041866,-0.041866,-0.041866,-0.041866
7,1,11.0,O1,O,train,0.0,0.0,0.0,0.0,0.0,...,-0.025814,-0.025814,-0.025814,-0.025814,-0.025814,-0.025814,-0.025814,-0.025814,-0.025814,-0.025814
8,1,11.0,R1,R,train,0.0,0.0,0.0,0.0,0.0,...,-0.042781,-0.042781,-0.042781,-0.037289,-0.042781,-0.042781,-0.042781,-0.042781,-0.042781,-0.042781
9,1,10.0,X1,X,train,0.0,0.0,10.638411,0.0,0.0,...,-0.023462,-0.023462,-0.023462,-0.023462,-0.023462,-0.023462,-0.023462,-0.023462,-0.023462,-0.023462


In [21]:
# Keep only the rows flagged as 'test' in the `data` column.
test_sc02 = merged_df.loc[merged_df['data'].eq('test')]
test_sc02

Unnamed: 0,time,GA,SampleID,randPerson,data,C2orf76,ACTL10,CEP135,RP11-613M10.6,NDUFB5P1,...,ToothGum_Azospirillum,Stool_Azospirillum,VaginalSwab_Sphingomonas.5,Saliva_Sphingomonas.5,ToothGum_Sphingomonas.5,Stool_Sphingomonas.5,VaginalSwab_Thalassospira.9,Saliva_Thalassospira.9,ToothGum_Thalassospira.9,Stool_Thalassospira.9
56,1,11.0,I1,I,test,1.849737,0.0,54.567231,0.924868,0.0,...,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,0.204818
57,1,11.0,W1,W,test,1.425523,0.0,44.191208,0.0,0.0,...,-0.026713,-0.026713,-0.026713,-0.026713,-0.026713,-0.026713,-0.026713,-0.026713,-0.026713,-0.026713
58,1,11.0,B1,B,test,0.0,0.0,0.0,0.0,0.0,...,-0.034209,-0.034209,-0.034209,-0.034209,-0.034209,-0.034209,-0.034209,-0.034209,-0.034209,-0.034209
59,2,15.0,I2,I,test,3.864497,0.552071,45.821899,0.0,0.552071,...,-0.026282,-0.026282,-0.026282,-0.026282,-0.026282,-0.026282,-0.026282,-0.026282,-0.026282,0.019252
60,2,18.0,W2,W,test,0.0,0.0,33.506654,0.0,0.0,...,-0.024147,-0.024147,-0.024147,-0.024147,-0.024147,-0.024147,-0.024147,-0.024147,-0.024147,-0.024147
61,2,16.0,B2,B,test,0.0,0.0,34.022863,0.0,0.0,...,-0.034891,-0.034891,-0.034891,-0.034891,-0.034891,-0.034891,-0.034891,-0.034891,-0.034891,-0.034891
62,3,25.0,I3,I,test,0.0,0.0,2.954576,0.0,0.0,...,-0.028956,-0.028956,-0.028956,-0.028956,-0.028956,-0.028956,-0.028956,-0.028956,-0.028956,0.020993
63,3,27.0,W3,W,test,66.465301,0.0,0.0,0.0,0.0,...,-0.0341,-0.0341,-0.0341,-0.0341,-0.0341,-0.0341,-0.0341,-0.0341,-0.0341,-0.0341
64,3,27.0,B3,B,test,0.0,0.0,97.646714,0.0,0.0,...,-0.025123,-0.025123,-0.025123,-0.025123,-0.025123,-0.025123,-0.025123,-0.025123,-0.025123,-0.025123
65,4,2.571429,I4,I,test,3.32343,0.0,13.293718,0.0,0.0,...,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,-0.028496,0.204818


In [22]:
# Persist both splits as CSV (relative paths), with a header row and without
# the DataFrame index.
train_sc02.to_csv('train_sc02.csv', header=True, index=False)
test_sc02.to_csv('test_sc02.csv', header=True, index=False)

In [24]:
# Build the third merged table from every source, joined on SampleID, in order:
# meta, immunome, serumLuminex, plasmaLuminex, plasmaSomalogic, cfRNA,
# metabolome, microbiome.
merged_df = meta.merge(immunome, on='SampleID', suffixes=('_meta', '_immunome'))
# Shape after the first merge, to confirm that no samples were lost.
print(merged_df.shape)
merged_df = merged_df.merge(serumLuminex, on='SampleID', suffixes=('', '_serumLuminex'))
merged_df = merged_df.merge(plasmaLuminex, on='SampleID', suffixes=('', '_plasmaLuminex'))
merged_df = merged_df.merge(plasmaSomalogic, on='SampleID', suffixes=('', '_plasmaSomalogic'))
# By this point the left frame is no longer just `meta`, so a '_meta' suffix
# would mislabel any earlier column that shares a name with a cfRNA column.
# Follow the convention of the other chained merges: leave the left name
# untouched and tag only the incoming cfRNA column.
merged_df = merged_df.merge(cf_rna, on='SampleID', suffixes=('', '_cfRNA'))
merged_df = merged_df.merge(metabolome, on='SampleID', suffixes=('', '_metabolome'))
merged_df = merged_df.merge(microbiome, on='SampleID', suffixes=('', '_microbiome'))
# Final shape of the fully merged table.
print(merged_df.shape)

(68, 539)
(68, 61271)


In [26]:
# Keep only the rows flagged as 'train' in the `data` column, then report the shape.
train_sc03 = merged_df.loc[merged_df['data'].eq('train')]
train_sc03.shape

(56, 61271)

In [27]:
# Keep only the rows flagged as 'test' in the `data` column, then report the shape.
test_sc03 = merged_df.loc[merged_df['data'].eq('test')]
test_sc03.shape

(12, 61271)

In [28]:
# Persist both splits as CSV (relative paths), with a header row and without
# the DataFrame index.
train_sc03.to_csv('train_sc03.csv', header=True, index=False)
test_sc03.to_csv('test_sc03.csv', header=True, index=False)