# R Data Conversion

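This Jupyter notebook merges the gene expression matrices from [Warnat-Herresthal et al. (2020)](https://github.com/schultzelab/aml_classifer) with their sample annotations and exports the combined table as a single CSV file (`data/leukemia.csv`).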

Before running the cells in this notebook, the R data must first be obtained from the authors' [Docker image](https://hub.docker.com/r/schultzelab/aml_classifier) by running the following command in a shell:

```
$ docker compose up
```

In [1]:
# Import R data
# Restore the objects saved in the .RData file into the workspace, then list
# the names that are now defined so the loaded objects can be identified.
load('data/Datasets.RData')
objects()

In [2]:
# Check data dimensions and format
# For each loaded object, show a label followed by its class, its dimensions,
# and its first and last six rows. Every expression is left at the top level
# of the cell so that the notebook displays each result.

# Dataset 1: expression values
'data.1:'
class(x = data.1)
dim(x = data.1)
head(data.1, n = 6)
tail(data.1, n = 6)

# Dataset 1: sample annotations
'info.1:'
class(x = info.1)
dim(x = info.1)
head(info.1, n = 6)
tail(info.1, n = 6)

# Dataset 2: expression values
'data.2:'
class(x = data.2)
dim(x = data.2)
head(data.2, n = 6)
tail(data.2, n = 6)

# Dataset 2: sample annotations
'info.2:'
class(x = info.2)
dim(x = info.2)
head(info.2, n = 6)
tail(info.2, n = 6)

Unnamed: 0,AHW_ALL_BA_025919_13_A_PZ.CEL,AHW_ALL_BA_032751_71_A_PZ.CEL,AHW_ALL_BN_022495_7_A_PZ.CEL,AHW_ALL_MLL_031351_13_A_PZ.CEL,AHW_ALL_MLL_036187_80_A_PZ.CEL,AHW_ALL_Ph_030031_32_A_PZ.CEL,AHW_ALL_Ph_032683_15_A_PZ.CEL,AHW_ALL_Ph_035655_63_A_PZ.CEL,AHW_ALL_TA_025336_26_A_PZ.CEL,AHW_ALL_TA_032725_16_A_PZ.CEL,⋯,GSM924007_ALK_MDS_RAEC_022687_38_A_PZ.CEL,GSM924008_ALK_MDS_RAEC_034886_40_A_PZ.CEL,GSM924009_ALK_MDS_RAEO_001885_44_A_PZ.CEL,GSM924010_ALK_MDS_RAEO_001973_45_A_PZ.CEL,GSM924011_ALK_MDS_RAEO_002508_76_A_PZ.CEL,GSM924013_ALK_MDS_RAEO_031238_53_A_PZ.CEL,GSM924014_ALK_MDS_RAEO_031526_60_A_PZ.CEL,GSM924015_ALK_MDS_RAEO_034350_16_A_PZ.CEL,GSM924016_ALK_MDS_RAEO_035203_14_A_PZ.CEL,GSM924017_ALK_MDS_RAEO_040105_7_A_PZ.CEL
PAX8,9.221573,8.524435,8.60606,8.841501,8.882826,9.011544,8.97559,8.940409,9.342263,8.966156,⋯,8.548096,8.45828,8.687024,8.531356,8.808418,8.703484,9.304286,8.775102,8.625841,8.632727
CCL5,7.990144,6.648112,6.382602,6.469583,8.332415,6.830385,8.753525,6.370298,6.971186,7.302477,⋯,7.218904,8.956285,7.188509,7.502036,7.07774,8.526308,9.071024,7.473359,7.967037,8.72852
MMP14,8.391732,8.183514,7.909023,8.105744,7.820992,7.859977,8.128827,7.956449,8.285775,8.335117,⋯,7.632565,7.855289,7.683088,7.825051,7.808342,7.715343,8.333734,7.902804,7.981485,7.684523
DTX2P1-UPK3BP1-PMS2P11,9.280415,8.798847,8.948909,8.941517,8.725958,8.975915,9.130401,8.997386,9.294638,9.0948,⋯,8.944574,8.732182,9.188813,9.059932,9.182124,8.757411,9.187252,8.861302,8.668579,8.991182
BAD,5.579486,5.934621,5.646312,5.899868,5.651731,5.798776,5.849792,6.00224,5.642667,5.768286,⋯,5.992471,6.113847,6.193126,5.816396,6.021704,5.976942,5.948137,5.90427,5.951656,6.100816
PRPF8,9.587992,10.605028,10.367226,10.493261,10.68444,9.850415,10.06846,10.259652,10.168444,9.143968,⋯,8.591656,8.836844,9.707603,10.027859,9.821617,9.93755,9.299585,10.44894,9.823071,9.656752


Unnamed: 0,AHW_ALL_BA_025919_13_A_PZ.CEL,AHW_ALL_BA_032751_71_A_PZ.CEL,AHW_ALL_BN_022495_7_A_PZ.CEL,AHW_ALL_MLL_031351_13_A_PZ.CEL,AHW_ALL_MLL_036187_80_A_PZ.CEL,AHW_ALL_Ph_030031_32_A_PZ.CEL,AHW_ALL_Ph_032683_15_A_PZ.CEL,AHW_ALL_Ph_035655_63_A_PZ.CEL,AHW_ALL_TA_025336_26_A_PZ.CEL,AHW_ALL_TA_032725_16_A_PZ.CEL,⋯,GSM924007_ALK_MDS_RAEC_022687_38_A_PZ.CEL,GSM924008_ALK_MDS_RAEC_034886_40_A_PZ.CEL,GSM924009_ALK_MDS_RAEO_001885_44_A_PZ.CEL,GSM924010_ALK_MDS_RAEO_001973_45_A_PZ.CEL,GSM924011_ALK_MDS_RAEO_002508_76_A_PZ.CEL,GSM924013_ALK_MDS_RAEO_031238_53_A_PZ.CEL,GSM924014_ALK_MDS_RAEO_031526_60_A_PZ.CEL,GSM924015_ALK_MDS_RAEO_034350_16_A_PZ.CEL,GSM924016_ALK_MDS_RAEO_035203_14_A_PZ.CEL,GSM924017_ALK_MDS_RAEO_040105_7_A_PZ.CEL
MEX3D,5.404848,5.65071,5.385639,5.696314,5.459417,5.00272,6.176596,5.347448,4.969231,6.102246,⋯,5.046778,5.087647,5.118147,5.037594,4.821375,5.370264,4.779831,5.182291,5.030552,4.999076
BCAN,7.871873,7.179527,7.652625,7.594891,7.480463,7.565395,7.552387,7.528189,7.80886,7.794971,⋯,7.70597,7.323551,7.616174,7.537318,7.713409,7.368321,8.080974,7.345759,7.636976,7.53817
ACTB,11.430549,10.928578,11.297186,12.156462,10.509088,11.036812,11.328231,12.141119,10.309743,10.069362,⋯,11.8115,10.581782,12.103968,12.128609,12.924348,10.306471,9.570415,10.426717,10.530491,10.448178
GAPDH,11.725073,11.671908,11.879521,12.175835,11.038747,10.754323,10.747527,12.636451,11.299086,11.478279,⋯,12.450878,10.794069,12.555838,11.483291,12.551613,10.856334,10.095893,10.853914,10.176301,11.612054
MIR3648-2,6.373859,6.57264,11.165859,9.5518,11.295602,5.917758,10.840717,7.58458,5.936155,8.95261,⋯,9.506061,11.51936,11.091616,10.891672,10.307151,9.795573,11.212355,6.843315,7.3157,9.114403
MIR3648-1,6.08667,7.823704,10.734375,9.627878,10.229529,7.607608,10.177838,7.386139,7.122855,9.345836,⋯,10.132054,11.931534,10.723389,11.639366,10.289545,10.530758,11.660112,7.078248,8.553492,9.284127


Unnamed: 0_level_0,Dataset,GSE,Condition,Disease,Tissue,FAB,Filename,FAB_all
Unnamed: 0_level_1,<dbl>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>,<chr>
AHW_ALL_BA_025919_13_A_PZ.CEL,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_BA_025919_13_A_PZ.CEL,unknown
AHW_ALL_BA_032751_71_A_PZ.CEL,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_BA_032751_71_A_PZ.CEL,unknown
AHW_ALL_BN_022495_7_A_PZ.CEL,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_BN_022495_7_A_PZ.CEL,unknown
AHW_ALL_MLL_031351_13_A_PZ.CEL,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_MLL_031351_13_A_PZ.CEL,unknown
AHW_ALL_MLL_036187_80_A_PZ.CEL,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_MLL_036187_80_A_PZ.CEL,unknown
AHW_ALL_Ph_030031_32_A_PZ.CEL,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_Ph_030031_32_A_PZ.CEL,unknown


Unnamed: 0_level_0,Dataset,GSE,Condition,Disease,Tissue,FAB,Filename,FAB_all
Unnamed: 0_level_1,<dbl>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>,<chr>
GSM924011_ALK_MDS_RAEO_002508_76_A_PZ.CEL,1,GSE37642,CASE,AML,BM,unknown,GSM924011_ALK_MDS_RAEO_002508_76_A_PZ.CEL,unknown
GSM924013_ALK_MDS_RAEO_031238_53_A_PZ.CEL,1,GSE37642,CASE,AML,BM,M1,GSM924013_ALK_MDS_RAEO_031238_53_A_PZ.CEL,M1
GSM924014_ALK_MDS_RAEO_031526_60_A_PZ.CEL,1,GSE37642,CASE,AML,BM,unknown,GSM924014_ALK_MDS_RAEO_031526_60_A_PZ.CEL,unknown
GSM924015_ALK_MDS_RAEO_034350_16_A_PZ.CEL,1,GSE37642,CASE,AML,BM,M6,GSM924015_ALK_MDS_RAEO_034350_16_A_PZ.CEL,M6
GSM924016_ALK_MDS_RAEO_035203_14_A_PZ.CEL,1,GSE37642,CASE,AML,BM,unknown,GSM924016_ALK_MDS_RAEO_035203_14_A_PZ.CEL,unknown
GSM924017_ALK_MDS_RAEO_040105_7_A_PZ.CEL,1,GSE37642,CASE,AML,BM,unknown,GSM924017_ALK_MDS_RAEO_040105_7_A_PZ.CEL,unknown


Unnamed: 0,GSM1030962_10030.CEL,GSM1030963_10110.CEL,GSM1030964_10111.CEL,GSM1030965_10138.CEL,GSM1030966_1032.CEL,GSM1030967_1113.CEL,GSM1030968_1132.CEL,GSM1030969_1446.CEL,GSM1030970_1509.CEL,GSM1030971_1524.CEL,⋯,GSM924570_LfL2004_424.CEL,GSM924571_LfL2004_426.CEL,GSM924572_LfL2004_427.CEL,GSM924573_LfL2004_508.CEL,GSM924574_LfL2004_521.CEL,GSM924575_LfL2004_529.CEL,GSM924576_LfL2004_543.CEL,GSM924577_LfL2004_544.CEL,GSM924578_LfL2005_105.CEL,GSM924579_LfL2005_106.CEL
PAX8,8.27154,8.49112,8.34082,8.2419,7.75355,7.95457,7.99231,8.40749,8.11578,8.18438,⋯,7.10595,7.9131,7.35846,7.58868,7.20894,7.15379,7.27836,7.90197,7.7771,8.07077
CCL5,5.02307,3.95575,4.68666,5.78429,4.97396,5.75343,4.21497,4.55103,9.11314,7.51884,⋯,9.73433,8.19256,7.94408,8.31446,9.78055,7.8129,8.4313,9.00389,10.7725,9.60457
MMP14,7.0664,6.41582,6.14716,6.65615,6.35464,6.89049,6.28378,6.63393,6.55267,6.79243,⋯,5.54289,6.59639,6.09989,6.11176,5.52058,5.88418,5.67004,6.28643,6.34328,6.20558
DTX2P1-UPK3BP1-PMS2P11,8.2499,8.42649,8.09809,8.15517,8.0068,7.95516,8.13866,8.56563,7.87479,7.76304,⋯,7.57276,8.10639,7.87547,7.92185,7.71894,7.49404,7.70163,7.98879,7.91903,7.63185
BAD,5.75432,6.77953,6.35168,4.97367,6.79648,5.49744,7.27109,6.34759,5.37534,4.98609,⋯,4.90018,4.58021,5.49209,6.12062,5.9637,6.41037,6.16504,5.43941,6.17747,6.94213
PRPF8,9.40864,9.99876,10.1085,9.31322,9.89784,9.68242,9.99797,10.6624,9.52855,9.17292,⋯,9.06694,9.23565,7.94853,8.88408,8.9596,9.83892,9.5642,9.55271,8.55952,9.68685


Unnamed: 0,GSM1030962_10030.CEL,GSM1030963_10110.CEL,GSM1030964_10111.CEL,GSM1030965_10138.CEL,GSM1030966_1032.CEL,GSM1030967_1113.CEL,GSM1030968_1132.CEL,GSM1030969_1446.CEL,GSM1030970_1509.CEL,GSM1030971_1524.CEL,⋯,GSM924570_LfL2004_424.CEL,GSM924571_LfL2004_426.CEL,GSM924572_LfL2004_427.CEL,GSM924573_LfL2004_508.CEL,GSM924574_LfL2004_521.CEL,GSM924575_LfL2004_529.CEL,GSM924576_LfL2004_543.CEL,GSM924577_LfL2004_544.CEL,GSM924578_LfL2005_105.CEL,GSM924579_LfL2005_106.CEL
MEX3D,3.94199,4.04885,4.14032,4.00719,3.80263,3.56352,3.78868,5.51037,4.23656,3.97889,⋯,4.25928,3.88914,3.84209,3.99162,4.08153,5.80256,3.98321,3.72613,3.87147,5.31675
BCAN,7.37819,7.57217,7.05551,7.5341,6.97885,7.09779,7.31212,7.17282,7.71819,7.10354,⋯,6.25435,7.29905,6.93559,7.26595,6.66113,6.49141,6.52711,6.80543,6.89953,6.39693
ACTB,12.3146,12.9723,12.5884,11.1369,13.0438,9.69503,11.7524,12.9234,11.1284,11.2523,⋯,11.5283,10.4697,11.8547,12.1764,12.2472,12.2692,12.6222,11.8529,10.3006,12.8578
GAPDH,12.1923,13.1223,13.1523,11.8223,13.5033,11.5631,13.0203,12.8722,12.0715,12.0485,⋯,10.8026,10.4898,11.7143,12.8235,12.2272,12.3318,13.1032,12.4263,12.0726,13.267
MIR3648-2,5.14609,6.94468,6.75634,4.16825,5.19974,4.36405,6.89726,5.75249,4.81597,3.9571,⋯,6.76329,6.77307,7.01761,8.40861,6.85906,5.34932,6.0231,8.15306,13.5911,5.86773
MIR3648-1,4.51915,6.19253,6.37648,4.38011,5.15342,3.47498,6.89337,5.46476,4.44112,4.28932,⋯,6.36579,6.55005,6.99596,8.01971,6.78035,6.17282,6.47611,8.11232,10.5068,6.23468


Unnamed: 0_level_0,Dataset,GSE,Condition,Disease,Tissue,FAB,Filename,FAB_all
Unnamed: 0_level_1,<dbl>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>,<chr>
GSM1030962_10030.CEL,2,GSE42038,CONTROL,ALL,BM,unknown,GSM1030962_10030.CEL,unknown
GSM1030963_10110.CEL,2,GSE42038,CONTROL,ALL,PBMC,unknown,GSM1030963_10110.CEL,unknown
GSM1030964_10111.CEL,2,GSE42038,CONTROL,ALL,BM,unknown,GSM1030964_10111.CEL,unknown
GSM1030965_10138.CEL,2,GSE42038,CONTROL,ALL,BM,unknown,GSM1030965_10138.CEL,unknown
GSM1030966_1032.CEL,2,GSE42038,CONTROL,ALL,BM,unknown,GSM1030966_1032.CEL,unknown
GSM1030967_1113.CEL,2,GSE42038,CONTROL,ALL,BM,unknown,GSM1030967_1113.CEL,unknown


Unnamed: 0_level_0,Dataset,GSE,Condition,Disease,Tissue,FAB,Filename,FAB_all
Unnamed: 0_level_1,<dbl>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>,<chr>
GSM924574_LfL2004_521.CEL,2,GSE37642,CASE,AML,BM,M4,GSM924574_LfL2004_521.CEL,M4
GSM924575_LfL2004_529.CEL,2,GSE37642,CASE,AML,BM,M1,GSM924575_LfL2004_529.CEL,M1
GSM924576_LfL2004_543.CEL,2,GSE37642,CASE,AML,BM,M2,GSM924576_LfL2004_543.CEL,M2
GSM924577_LfL2004_544.CEL,2,GSE37642,CASE,AML,BM,M1,GSM924577_LfL2004_544.CEL,M1
GSM924578_LfL2005_105.CEL,2,GSE37642,CASE,AML,BM,M0,GSM924578_LfL2005_105.CEL,M0
GSM924579_LfL2005_106.CEL,2,GSE37642,CASE,AML,BM,M5,GSM924579_LfL2005_106.CEL,M5


In [3]:
# Aggregate data

# Join one dataset's expression values with its sample annotations.
#   expr: expression table with genes in rows and samples in columns
#   info: annotation data frame whose row names are the sample file names
# The expression table is transposed so that samples become rows, then joined
# to `info` on the row names; the join key ends up in a `Row.names` column.
# all = TRUE requests a full outer join, so a sample present on only one side
# is kept and padded with NA. TRUE is spelled out because `T` is an ordinary
# variable that can be overwritten (for instance by an object restored with
# load()), whereas TRUE is a reserved word.
merge_expression_with_info <- function(expr, info) {
    merge(as.data.frame(t(expr)), info, by = 'row.names', all = TRUE)
}

df1 <- merge_expression_with_info(data.1, info.1)
dim(df1)
df2 <- merge_expression_with_info(data.2, info.2)
dim(df2)

# Stack the two merged datasets into a single table (one row per sample)
df <- rbind(df1, df2)
dim(df)
head(df)

Unnamed: 0_level_0,Row.names,PAX8,CCL5,MMP14,DTX2P1-UPK3BP1-PMS2P11,BAD,PRPF8,CAPNS1,RPL35,EIF4G2,⋯,MIR3648-2,MIR3648-1,Dataset,GSE,Condition,Disease,Tissue,FAB,Filename,FAB_all
Unnamed: 0_level_1,<I<chr>>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>,<chr>
1,AHW_ALL_BA_025919_13_A_PZ.CEL,9.221573,7.990144,8.391732,9.280415,5.579486,9.587992,10.143771,11.36457,10.95629,⋯,6.373859,6.08667,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_BA_025919_13_A_PZ.CEL,unknown
2,AHW_ALL_BA_032751_71_A_PZ.CEL,8.524435,6.648112,8.183514,8.798847,5.934621,10.605028,10.613034,11.66462,11.63216,⋯,6.57264,7.823704,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_BA_032751_71_A_PZ.CEL,unknown
3,AHW_ALL_BN_022495_7_A_PZ.CEL,8.60606,6.382602,7.909023,8.948909,5.646312,10.367226,9.836321,12.08132,11.69096,⋯,11.165859,10.734375,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_BN_022495_7_A_PZ.CEL,unknown
4,AHW_ALL_MLL_031351_13_A_PZ.CEL,8.841501,6.469583,8.105744,8.941517,5.899868,10.493261,10.701126,12.02987,11.3287,⋯,9.5518,9.627878,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_MLL_031351_13_A_PZ.CEL,unknown
5,AHW_ALL_MLL_036187_80_A_PZ.CEL,8.882826,8.332415,7.820992,8.725958,5.651731,10.68444,10.52436,11.85613,11.38861,⋯,11.295602,10.229529,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_MLL_036187_80_A_PZ.CEL,unknown
6,AHW_ALL_Ph_030031_32_A_PZ.CEL,9.011544,6.830385,7.859977,8.975915,5.798776,9.850415,10.115055,11.90956,11.19729,⋯,5.917758,7.607608,1,Haferlach,CONTROL,ALL,BM or PBMC,unknown,AHW_ALL_Ph_030031_32_A_PZ.CEL,unknown


In [4]:
# Export to CSV
# Write the combined table to disk. row.names = FALSE leaves out R's automatic
# row numbers; the sample identifiers are already stored in the `Row.names`
# column. FALSE is spelled out because `F` is an ordinary variable that can be
# overwritten, whereas FALSE is a reserved word.
write.csv(df, 'data/leukemia.csv', row.names = FALSE)