# Initialization

In [1]:
import polars as pl

from adex.helpers import load_data_per_condition, gene_intersection
from adex.models import Condition, ConditionSequencingTissueDataLoader, SequencingTechnique, TissueEnum
from adex.helpers import high_frequency_genes_dataframe, common_genes_dataframe, get_pre_processed_dataset
from adex.models import ConditionDataLoader

files_path = "../data/adex-database/samples"
metadata_path = "../data/adex-database/metadata.csv"
datasets_info_path = "../data/adex-database/datasets_info.csv"

ra_data  = load_data_per_condition(Condition.RA, files_path)
t1d_data = load_data_per_condition(Condition.T1D, files_path)
ssc_data = load_data_per_condition(Condition.SSc, files_path)
sle_data = load_data_per_condition(Condition.SLE, files_path)
sjs_data = load_data_per_condition(Condition.SjS, files_path)

# Explore Metadata

In [2]:
metadata_df = pl.read_csv("../data/adex-database/metadata.csv")
metadata_df.shape

(3980, 10)

In [3]:
metadata_df

Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity
str,str,str,str,str,str,str,str,str,str
"""GSM260887""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260890""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260892""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260895""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260898""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260901""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260904""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260907""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260910""","""GSE10325""","""Expression""","""GPL96""","""Healthy""","""Peripheral blo…","""B cells""",,,
"""GSM260915""","""GSE10325""","""Expression""","""GPL96""","""SLE""","""Peripheral blo…","""B cells""",,,


In [4]:
total_samples = metadata_df.select("Sample").shape[0]
unique_samples = metadata_df.select("Sample").unique().shape[0]

conditions_df = metadata_df.select("Condition").unique()
conditions_count = conditions_df.shape[0]
conditions = conditions_df.to_series().to_list()

tissues_df = metadata_df.select("Tissue").unique()
tissues_count = tissues_df.shape[0]
tissues = tissues_df.to_series().to_list()

gpl_df = metadata_df.select("GPL").unique()
gpl_count = gpl_df.shape[0]
gpls = gpl_df.to_series().to_list()

gse_df = metadata_df.select("GSE").unique()
gse_count = gse_df.shape[0]
gses = gse_df.to_series().to_list()

males = metadata_df.select("Gender").filter(pl.col("Gender") == "Male").shape[0]
females = metadata_df.select("Gender").filter(pl.col("Gender") == "Female").shape[0]
gender_unknown = metadata_df.select("Gender").filter((pl.col("Gender") != "Female") & (pl.col("Gender") != "Male")).shape[0]

strategy_df = metadata_df.select("Experimental Strategy").unique()

print(f"""
Total Samples = {total_samples}
Unique Samples = {unique_samples}
Conditions = {conditions_count} [{",".join(conditions)}]
GPL = {gpl_count} [{",".join(gpls)}]
GSE = {gse_count} [{",".join(gses)}]
Tissues = {tissues_count} [{",".join(tissues)}]
Males = {males}, Females = {females}, Unknown Gender = {gender_unknown}
Experimental Strategy = {strategy_df.to_series().to_list()}
""")


Total Samples = 3980
Unique Samples = 3854
Conditions = 6 [RA,SjS,T1D,SLE,SSc,Healthy]
GPL = 13 [GPL97,GPL14951_GPL13534,GPL13158,GPL11154,GPL96,GPL20301,GPL6884,GPL570,GPL15456,GPL10558_GPL13534,GPL10558,GPL13667,GPL16791]
GSE = 41 [GSE89408,GSE56649,GSE51092,GSE110174,GSE55098,GSE124073,GSE55235,GSE7451,GSE50772,GSE13887,GSE63903,GSE77298,GSE65010,GSE12021,GSE11907,GSE40611,GSE38351,GSE60424,GSE10325,GSE82221,GSE72509,GSE80183,GSE24706,GSE65391,GSE110914,GSE23117,GSE55457,GSE90081,GSE110169,GSE45291,GSE117931,GSE112341,GSE108497,GSE30153,GSE124939,GSE104174,GSE57383,GSE61635,GSE93683,GSE95065,GSE84844]
Tissues = 7 [Parotid gland,Peripheral blood,Salivary gland,Skin,Saliva,Whole blood,Synovial membrane]
Males = 269, Females = 1547, Unknown Gender = 0
Experimental Strategy = ['Expression', 'Expression + Methylation']


In [5]:
metadata_df.group_by("Condition").agg(pl.struct(["Sample"]).n_unique().alias('SamplesPerCondition')).sort("SamplesPerCondition", descending=True)

Condition,SamplesPerCondition
str,u32
"""SLE""",1502
"""Healthy""",957
"""RA""",862
"""SjS""",281
"""SSc""",157
"""T1D""",95


In [6]:
metadata_df.group_by("Age").agg(pl.struct(["Sample"]).n_unique().alias('SamplesPerAge')).sort("Age")


Age,SamplesPerAge
str,u32
,2400
"""1-10""",113
"""11-20""",342
"""21-30""",383
"""31-40""",331
"""41-50""",105
"""51-60""",66
"""61-70""",69
"""71-80""",39
""">80""",6


# Check Metadata for duplicates

Duplicates will create issues later in the process if there is a join with this file. 

In [7]:
metadata_df.group_by("Sample").len().filter(pl.col("len") > 1).sort("len", descending=True).head()

Sample,len
str,u32
"""GSM940478""",2
"""GSM2981155""",2
"""GSM2981203""",2
"""GSM940473""",2
"""GSM2981058""",2


In [9]:
# Check removing duplicates 
metadata_df.unique(subset=["Sample"]).group_by("Sample").len().sort("len", descending=True).head()

Sample,len
str,u32
"""GSM2902068""",1
"""GSM1863695""",1
"""GSM1101281""",1
"""GSM2902264""",1
"""GSM301752""",1


# Explore Files that seem problematic

### GSE95065

There seem to be two files that need to be merged into one 


In [7]:
GSE95065_large = pl.read_parquet("../data/adex-database/samples/archived_not_used/GSE95065.tsv.parquet")
GSE95065_large

gene,GSM2495594,GSM2495595,GSM2495596,GSM2495597,GSM2495598,GSM2495599,GSM2495600,GSM2495601,GSM2495602,GSM2495603,GSM2495604,GSM2495605,GSM2495606,GSM2495607,GSM2495608,GSM2495609,GSM2495610,GSM2495611,GSM2495612,GSM2495613,GSM2495614,GSM2495615,GSM2495616,GSM2495617,GSM2495618,GSM2495619,GSM2495620,GSM2495621,GSM2495622,GSM2495623,GSM2495624,GSM2495625,GSM2495626
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""RFC2""",7.472319,7.520892,7.46077,7.708487,7.484799,7.299081,7.403904,7.419463,7.45063,7.61232,7.46854,7.532882,7.411745,7.435213,7.392205,7.532592,7.403685,7.449712,7.52772,7.240532,7.633632,7.264277,7.582619,7.21455,7.082178,7.386463,7.320884,7.227947,7.456815,7.379748,7.278312,7.269157,7.1024
"""HSPA6""",6.655682,6.248444,6.563427,6.74514,6.030863,5.127361,6.477777,5.914106,6.815282,6.247377,6.455072,6.769109,5.906113,6.368727,6.104465,6.300024,6.50896,6.381587,6.293698,6.213825,6.221177,7.188617,5.930373,6.517476,5.987333,6.650793,6.977275,6.185656,7.468024,6.119981,6.749708,6.33245,6.380119
"""PAX8""",6.772054,6.926438,7.034588,6.975327,7.220449,6.916914,6.659487,6.965389,6.622051,7.281831,7.033667,6.778211,7.172063,7.045818,7.006107,6.899453,7.061638,7.335806,6.919635,6.931886,6.94088,6.986259,7.180362,6.525082,7.144469,6.556937,6.600135,7.149994,6.662156,7.145639,7.119249,7.113005,6.452059
"""THRA""",6.90694,6.48762,6.229516,6.134647,6.217019,6.197873,5.779039,6.464217,6.489212,6.001884,6.219632,6.309929,5.870903,5.843094,6.005228,6.168541,5.932801,6.147186,5.98024,5.860218,5.608119,5.963343,6.404039,6.674311,6.161022,6.13027,6.000803,5.985383,5.773169,5.903811,6.093964,5.95448,6.172006
"""PTPN21""",6.99916,7.081638,7.270245,6.609682,7.529765,6.601726,6.198417,7.849579,6.695222,6.546318,6.706809,7.358162,7.494944,7.113105,6.591536,7.221406,6.52857,6.701473,7.611369,7.117925,6.774631,6.386196,7.258027,6.129345,6.430386,6.517319,6.941436,6.722773,6.443518,7.459997,7.135372,7.249807,7.095118
"""CCL5""",7.07975,6.092999,5.735654,6.728099,5.811003,4.609014,8.603649,6.084578,7.141292,5.4493,5.653901,7.069109,5.446654,6.361946,4.8171,5.031157,4.938001,5.160383,6.333225,4.035719,4.729835,6.90157,5.817311,6.325495,5.580433,5.583964,8.38015,6.218533,6.087392,8.100331,5.367073,5.649418,5.039789
"""CYP2E1""",6.196167,4.492376,6.459971,6.627319,6.291382,5.109184,5.991173,4.886798,5.518165,5.869055,6.285918,6.374689,6.242608,7.109768,6.597216,4.919737,6.845075,5.972443,6.209479,6.62767,5.960389,4.929532,5.697879,5.336928,6.289446,4.41045,6.09204,5.686044,5.875945,5.935592,6.07799,6.337083,5.81603
"""EPHB3""",6.958701,6.994674,7.523542,7.152963,7.115558,7.486647,6.428781,7.50271,6.905031,7.73239,7.597305,7.363381,7.893089,7.599454,8.193947,7.752947,7.931709,7.831751,7.553464,7.979561,7.743131,7.468185,7.82446,7.405587,7.720182,7.507602,7.167658,7.26325,7.153739,7.480215,7.481708,7.64475,7.53078
"""ESRRA""",6.80847,6.797286,6.954059,7.001706,6.857725,7.050968,6.814291,6.938178,6.786193,7.206581,7.022166,6.916465,7.041363,7.029736,7.339206,6.988665,7.099916,7.382145,6.832798,7.048717,7.14127,6.830311,7.200086,6.789154,7.324337,6.704587,6.626802,6.917285,6.828058,6.940019,6.931476,7.230816,7.018688
"""GAS6""",9.598344,9.975522,10.043358,9.968912,9.87949,9.540436,10.094454,9.71956,10.151767,9.801949,9.588306,9.721379,9.970734,9.573153,9.719227,9.843187,9.528572,9.904768,9.916989,10.180935,9.559053,9.901122,9.73748,10.235113,9.698278,10.099996,9.70151,10.168666,9.455319,9.541577,9.562279,9.665531,10.100651


In [10]:
GSE95065_small = pl.read_parquet("../data/adex-database/samples/archived_not_used/GSE95065_2.tsv.parquet")
GSE95065_small

gene,GSM2495594,GSM2495595,GSM2495596,GSM2495597,GSM2495598,GSM2495599,GSM2495600,GSM2495601,GSM2495602,GSM2495603,GSM2495604,GSM2495605,GSM2495606,GSM2495607,GSM2495608,GSM2495609,GSM2495610,GSM2495611,GSM2495612,GSM2495613,GSM2495614,GSM2495615,GSM2495616,GSM2495617,GSM2495618,GSM2495619,GSM2495620,GSM2495621,GSM2495622,GSM2495623,GSM2495624,GSM2495625,GSM2495626
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""CEBPE""",7.430844,7.424512,7.380993,7.495515,7.424263,7.24761,7.358499,7.352183,7.373819,7.574839,7.341107,7.483675,7.384654,7.317921,7.326927,7.497311,7.343509,7.384384,7.439529,7.171636,7.541954,7.175672,7.52662,7.103011,7.094538,7.276562,7.277203,7.154872,7.350882,7.365737,7.076975,7.140006,7.11732
"""ADCYAP1R1""",5.94925,5.912904,5.952583,6.136321,5.660838,4.507445,5.995402,5.392767,6.156154,5.868257,5.963205,6.043992,5.441751,5.91522,5.651909,5.814725,6.128093,5.859722,5.756369,5.602133,5.668191,6.470656,5.523302,5.818856,5.381958,5.952577,6.267259,5.528522,6.896724,5.5735,6.109171,5.765169,5.834804
"""COL7A1""",6.533028,6.441243,6.26865,6.402277,6.406926,6.124735,6.913235,6.183706,6.143744,6.092093,6.323905,6.448042,6.687022,6.449526,6.58893,6.058564,6.496673,6.376913,6.206048,6.574422,6.464814,6.64254,6.144032,6.473642,6.567916,6.610596,6.442491,6.794229,6.435983,6.277183,6.302032,6.382896,6.893414
"""KLF6""",6.90694,6.48762,6.229516,6.134647,6.217019,6.197873,5.779039,6.464217,6.489212,6.001884,6.226221,6.309929,5.870903,5.843094,5.924252,6.168541,6.366553,6.147186,5.792084,6.085894,5.959874,5.963343,6.404039,6.674311,6.161022,6.13027,6.000803,5.985383,5.773169,5.903811,6.093964,5.95448,6.172006
"""CSF2RA""",5.482237,5.547813,6.167729,5.801479,5.7241,6.074025,5.030489,5.985279,5.512681,7.077504,6.963825,5.931991,7.247194,7.027666,7.610069,6.462201,7.18134,7.206354,6.125501,7.399473,7.054716,6.050846,7.134491,6.054555,7.198266,6.117478,5.848947,5.850248,5.67489,5.840013,6.061546,6.261048,6.09999
"""CTBP1""",7.096109,7.070467,7.344943,7.401909,7.352289,7.293874,6.940814,7.268688,7.075635,7.617213,7.378066,7.332296,7.393262,7.464938,7.756849,7.379056,7.383231,7.848183,7.146071,7.513861,7.4345,7.320879,7.648898,7.124524,7.703255,7.059668,6.957033,7.352952,7.183082,7.296272,7.286571,7.572689,7.418754
"""DIAPH1""",8.14523,8.312656,7.992397,8.272858,8.040403,7.828828,8.157008,8.061688,8.004696,7.385955,7.887343,8.037436,8.048556,7.902219,7.907266,8.116818,7.760369,7.803573,8.014856,7.750942,7.780054,8.198553,8.027947,7.939627,7.718404,8.141551,8.181171,8.108116,8.095633,7.884621,7.977461,7.938129,8.151805
"""AGER""",6.496517,6.508898,6.538657,6.435531,6.282903,5.896218,6.146781,6.536278,6.593639,6.228704,6.463922,6.752157,6.412501,6.361711,6.306303,6.793278,6.331338,6.189678,6.208045,6.186679,6.070997,6.663842,6.381444,6.595394,5.902664,6.686002,6.480337,5.997034,6.535427,6.661929,6.405839,6.227995,6.342333
"""DNASE1""",4.872944,4.847915,5.046468,5.162639,5.052784,4.968124,4.846196,4.997902,4.970702,6.169151,6.330242,4.943527,6.44802,6.408799,6.298719,5.029668,6.074697,6.764309,4.955496,6.080928,6.642391,4.991812,6.655077,4.973225,6.002208,4.765483,4.947005,4.872625,5.144602,5.031794,5.089739,4.942311,5.133462
"""TOR1A""",6.121443,6.058982,6.344896,6.602623,6.311452,5.929669,5.899855,6.332064,5.975099,6.775418,7.148073,6.148816,7.113099,6.806075,6.828931,6.210185,6.552282,7.2246,6.190214,6.813814,6.719083,6.139074,6.79052,5.978004,6.786083,6.091691,6.073293,6.067115,6.010002,6.16686,6.022811,6.093656,6.122676


In [20]:
# Make certain that we have the same samples in the two files 
large_set_samples = set(GSE95065_large.columns)
small_set_samples = set(GSE95065_small.columns)
large_set_samples == small_set_samples

True

In [23]:
# Make certain that there are no duplicated genes between the two files 
large_set_genes = set(GSE95065_large.select("gene").to_series().to_list())
small_set_genes = set(GSE95065_small.select("gene").to_series().to_list())
large_set_genes.intersection(small_set_genes)

{'ASAP3',
 'CAPN1',
 'CKLF',
 'COL7A1',
 'CPA4',
 'CSF2RA',
 'CTBP1',
 'DIAPH1',
 'ELAC2',
 'FAM171A1',
 'GLTP',
 'HIF1AN',
 'IFT57',
 'IPO9',
 'KLF6',
 'LEF1',
 'LUC7L',
 'PBK',
 'PXDC1',
 'SEMA4G',
 'SLC52A1',
 'TMEM168',
 'TOR1A',
 'TTC12',
 'TUBB',
 'UNC5B',
 'WDYHV1',
 'ZC3HAV1',
 'ZCCHC24'}

There seem to be common genes between the two files which does not make much sense since it would be different readings for the same sample and gene. 
Will discard the small file instead of the old decision that was keeping it.
Commenting the lines below that were creating the merged dataset.  

In [13]:
# GSE95065_merged = pl.concat([GSE95065_small, GSE95065_large], rechunk=True)
# GSE95065_merged

gene,GSM2495594,GSM2495595,GSM2495596,GSM2495597,GSM2495598,GSM2495599,GSM2495600,GSM2495601,GSM2495602,GSM2495603,GSM2495604,GSM2495605,GSM2495606,GSM2495607,GSM2495608,GSM2495609,GSM2495610,GSM2495611,GSM2495612,GSM2495613,GSM2495614,GSM2495615,GSM2495616,GSM2495617,GSM2495618,GSM2495619,GSM2495620,GSM2495621,GSM2495622,GSM2495623,GSM2495624,GSM2495625,GSM2495626
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""CEBPE""",7.430844,7.424512,7.380993,7.495515,7.424263,7.24761,7.358499,7.352183,7.373819,7.574839,7.341107,7.483675,7.384654,7.317921,7.326927,7.497311,7.343509,7.384384,7.439529,7.171636,7.541954,7.175672,7.52662,7.103011,7.094538,7.276562,7.277203,7.154872,7.350882,7.365737,7.076975,7.140006,7.11732
"""ADCYAP1R1""",5.94925,5.912904,5.952583,6.136321,5.660838,4.507445,5.995402,5.392767,6.156154,5.868257,5.963205,6.043992,5.441751,5.91522,5.651909,5.814725,6.128093,5.859722,5.756369,5.602133,5.668191,6.470656,5.523302,5.818856,5.381958,5.952577,6.267259,5.528522,6.896724,5.5735,6.109171,5.765169,5.834804
"""COL7A1""",6.533028,6.441243,6.26865,6.402277,6.406926,6.124735,6.913235,6.183706,6.143744,6.092093,6.323905,6.448042,6.687022,6.449526,6.58893,6.058564,6.496673,6.376913,6.206048,6.574422,6.464814,6.64254,6.144032,6.473642,6.567916,6.610596,6.442491,6.794229,6.435983,6.277183,6.302032,6.382896,6.893414
"""KLF6""",6.90694,6.48762,6.229516,6.134647,6.217019,6.197873,5.779039,6.464217,6.489212,6.001884,6.226221,6.309929,5.870903,5.843094,5.924252,6.168541,6.366553,6.147186,5.792084,6.085894,5.959874,5.963343,6.404039,6.674311,6.161022,6.13027,6.000803,5.985383,5.773169,5.903811,6.093964,5.95448,6.172006
"""CSF2RA""",5.482237,5.547813,6.167729,5.801479,5.7241,6.074025,5.030489,5.985279,5.512681,7.077504,6.963825,5.931991,7.247194,7.027666,7.610069,6.462201,7.18134,7.206354,6.125501,7.399473,7.054716,6.050846,7.134491,6.054555,7.198266,6.117478,5.848947,5.850248,5.67489,5.840013,6.061546,6.261048,6.09999
"""CTBP1""",7.096109,7.070467,7.344943,7.401909,7.352289,7.293874,6.940814,7.268688,7.075635,7.617213,7.378066,7.332296,7.393262,7.464938,7.756849,7.379056,7.383231,7.848183,7.146071,7.513861,7.4345,7.320879,7.648898,7.124524,7.703255,7.059668,6.957033,7.352952,7.183082,7.296272,7.286571,7.572689,7.418754
"""DIAPH1""",8.14523,8.312656,7.992397,8.272858,8.040403,7.828828,8.157008,8.061688,8.004696,7.385955,7.887343,8.037436,8.048556,7.902219,7.907266,8.116818,7.760369,7.803573,8.014856,7.750942,7.780054,8.198553,8.027947,7.939627,7.718404,8.141551,8.181171,8.108116,8.095633,7.884621,7.977461,7.938129,8.151805
"""AGER""",6.496517,6.508898,6.538657,6.435531,6.282903,5.896218,6.146781,6.536278,6.593639,6.228704,6.463922,6.752157,6.412501,6.361711,6.306303,6.793278,6.331338,6.189678,6.208045,6.186679,6.070997,6.663842,6.381444,6.595394,5.902664,6.686002,6.480337,5.997034,6.535427,6.661929,6.405839,6.227995,6.342333
"""DNASE1""",4.872944,4.847915,5.046468,5.162639,5.052784,4.968124,4.846196,4.997902,4.970702,6.169151,6.330242,4.943527,6.44802,6.408799,6.298719,5.029668,6.074697,6.764309,4.955496,6.080928,6.642391,4.991812,6.655077,4.973225,6.002208,4.765483,4.947005,4.872625,5.144602,5.031794,5.089739,4.942311,5.133462
"""TOR1A""",6.121443,6.058982,6.344896,6.602623,6.311452,5.929669,5.899855,6.332064,5.975099,6.775418,7.148073,6.148816,7.113099,6.806075,6.828931,6.210185,6.552282,7.2246,6.190214,6.813814,6.719083,6.139074,6.79052,5.978004,6.786083,6.091691,6.073293,6.067115,6.010002,6.16686,6.022811,6.093656,6.122676


In [15]:
# GSE95065_merged.write_parquet(file="../data/adex-database/samples/SSc/GSE95065_merged.tsv.parquet", use_pyarrow=True)

In [4]:
# validate 
# pl.read_parquet("../data/adex-database/samples/SSc/GSE95065_merged.tsv.parquet")

gene,GSM2495594,GSM2495595,GSM2495596,GSM2495597,GSM2495598,GSM2495599,GSM2495600,GSM2495601,GSM2495602,GSM2495603,GSM2495604,GSM2495605,GSM2495606,GSM2495607,GSM2495608,GSM2495609,GSM2495610,GSM2495611,GSM2495612,GSM2495613,GSM2495614,GSM2495615,GSM2495616,GSM2495617,GSM2495618,GSM2495619,GSM2495620,GSM2495621,GSM2495622,GSM2495623,GSM2495624,GSM2495625,GSM2495626
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""CEBPE""",7.430844,7.424512,7.380993,7.495515,7.424263,7.24761,7.358499,7.352183,7.373819,7.574839,7.341107,7.483675,7.384654,7.317921,7.326927,7.497311,7.343509,7.384384,7.439529,7.171636,7.541954,7.175672,7.52662,7.103011,7.094538,7.276562,7.277203,7.154872,7.350882,7.365737,7.076975,7.140006,7.11732
"""ADCYAP1R1""",5.94925,5.912904,5.952583,6.136321,5.660838,4.507445,5.995402,5.392767,6.156154,5.868257,5.963205,6.043992,5.441751,5.91522,5.651909,5.814725,6.128093,5.859722,5.756369,5.602133,5.668191,6.470656,5.523302,5.818856,5.381958,5.952577,6.267259,5.528522,6.896724,5.5735,6.109171,5.765169,5.834804
"""COL7A1""",6.533028,6.441243,6.26865,6.402277,6.406926,6.124735,6.913235,6.183706,6.143744,6.092093,6.323905,6.448042,6.687022,6.449526,6.58893,6.058564,6.496673,6.376913,6.206048,6.574422,6.464814,6.64254,6.144032,6.473642,6.567916,6.610596,6.442491,6.794229,6.435983,6.277183,6.302032,6.382896,6.893414
"""KLF6""",6.90694,6.48762,6.229516,6.134647,6.217019,6.197873,5.779039,6.464217,6.489212,6.001884,6.226221,6.309929,5.870903,5.843094,5.924252,6.168541,6.366553,6.147186,5.792084,6.085894,5.959874,5.963343,6.404039,6.674311,6.161022,6.13027,6.000803,5.985383,5.773169,5.903811,6.093964,5.95448,6.172006
"""CSF2RA""",5.482237,5.547813,6.167729,5.801479,5.7241,6.074025,5.030489,5.985279,5.512681,7.077504,6.963825,5.931991,7.247194,7.027666,7.610069,6.462201,7.18134,7.206354,6.125501,7.399473,7.054716,6.050846,7.134491,6.054555,7.198266,6.117478,5.848947,5.850248,5.67489,5.840013,6.061546,6.261048,6.09999
"""CTBP1""",7.096109,7.070467,7.344943,7.401909,7.352289,7.293874,6.940814,7.268688,7.075635,7.617213,7.378066,7.332296,7.393262,7.464938,7.756849,7.379056,7.383231,7.848183,7.146071,7.513861,7.4345,7.320879,7.648898,7.124524,7.703255,7.059668,6.957033,7.352952,7.183082,7.296272,7.286571,7.572689,7.418754
"""DIAPH1""",8.14523,8.312656,7.992397,8.272858,8.040403,7.828828,8.157008,8.061688,8.004696,7.385955,7.887343,8.037436,8.048556,7.902219,7.907266,8.116818,7.760369,7.803573,8.014856,7.750942,7.780054,8.198553,8.027947,7.939627,7.718404,8.141551,8.181171,8.108116,8.095633,7.884621,7.977461,7.938129,8.151805
"""AGER""",6.496517,6.508898,6.538657,6.435531,6.282903,5.896218,6.146781,6.536278,6.593639,6.228704,6.463922,6.752157,6.412501,6.361711,6.306303,6.793278,6.331338,6.189678,6.208045,6.186679,6.070997,6.663842,6.381444,6.595394,5.902664,6.686002,6.480337,5.997034,6.535427,6.661929,6.405839,6.227995,6.342333
"""DNASE1""",4.872944,4.847915,5.046468,5.162639,5.052784,4.968124,4.846196,4.997902,4.970702,6.169151,6.330242,4.943527,6.44802,6.408799,6.298719,5.029668,6.074697,6.764309,4.955496,6.080928,6.642391,4.991812,6.655077,4.973225,6.002208,4.765483,4.947005,4.872625,5.144602,5.031794,5.089739,4.942311,5.133462
"""TOR1A""",6.121443,6.058982,6.344896,6.602623,6.311452,5.929669,5.899855,6.332064,5.975099,6.775418,7.148073,6.148816,7.113099,6.806075,6.828931,6.210185,6.552282,7.2246,6.190214,6.813814,6.719083,6.139074,6.79052,5.978004,6.786083,6.091691,6.073293,6.067115,6.010002,6.16686,6.022811,6.093656,6.122676


### GSE55098

The second file seems to only have 5 samples. Easier to just discard it. This whole dataset is too small anyway. 

In [3]:
GSE55098_one = pl.read_parquet("../data/adex-database/samples/GSE55098.tsv.parquet")
GSE55098_two = pl.read_parquet("../data/adex-database/samples/GSE55098_2.tsv.parquet")

In [4]:
GSE55098_one

gene,GSM1329616,GSM1329617,GSM1329618,GSM1329619,GSM1329620,GSM1329621,GSM1329622,GSM1329623,GSM1329624,GSM1329625,GSM1329626,GSM1329627,GSM1329628,GSM1329629,GSM1329630,GSM1329631,GSM1329632,GSM1329633,GSM1329634,GSM1329635,GSM1329636,GSM1329637
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""RFC2""",7.29312,7.478815,7.163901,7.19965,7.201116,7.000996,7.605478,7.084733,7.274616,7.623437,7.695536,7.628405,7.628396,7.376713,7.502768,7.783859,7.16911,7.128845,7.331175,7.392728,7.518944,7.957457
"""HSPA6""",8.646147,9.134429,7.398783,7.617116,8.546272,8.837147,7.904668,9.397619,8.595635,9.133088,9.955381,8.704306,9.249976,8.592631,8.990663,9.506727,8.826066,10.290093,8.566601,10.476805,8.968847,9.544259
"""PAX8""",4.955674,4.633809,4.779538,4.769782,5.189039,4.969808,5.008537,5.134186,5.157049,4.728086,5.346386,4.71582,4.804079,5.088407,5.040982,4.851637,4.964125,5.303836,4.845228,4.768793,4.982103,5.013744
"""GUCA1A""",4.566657,4.454742,4.474081,4.542757,4.448803,4.056051,4.630339,4.546007,4.39032,4.147015,4.410766,4.453085,4.434004,4.438081,4.488301,4.478591,4.473089,4.383461,4.466288,4.467433,4.790837,4.408654
"""THRA""",5.744708,5.691902,5.289194,5.564487,5.790481,5.947639,5.609385,5.785431,5.950154,5.576511,5.39011,6.25262,5.490247,5.58748,5.606161,5.543975,5.974371,5.8293,6.018458,5.316998,5.420747,5.404102
"""PTPN21""",3.814043,3.634538,3.75437,4.113093,3.846746,3.789836,3.69543,3.964155,3.947626,3.931912,4.303271,4.125382,4.002965,4.243301,4.006246,4.026725,3.859124,4.029854,3.833951,3.973107,3.881679,3.737949
"""CCL5""",13.212147,13.164316,12.342226,12.964645,13.270208,13.030897,13.628946,13.347582,12.718301,13.200598,11.686295,12.914284,12.833934,12.759561,12.773942,12.006663,12.629261,12.564126,12.049704,12.413792,12.310641,12.695485
"""CYP2E1""",4.264835,4.350058,4.279817,4.497782,4.224286,4.247999,4.378694,4.466011,4.180108,4.334538,4.267884,4.295343,4.096116,4.578787,4.088221,4.24095,4.37154,4.109611,4.11845,4.183501,4.150002,4.532212
"""EPHB3""",5.521879,5.418001,5.315546,5.550489,5.666342,5.401517,5.435024,5.579085,5.631932,5.334814,6.090561,5.428605,5.513184,5.924601,5.569855,5.799532,5.658137,6.12622,5.126061,5.530971,5.530454,5.541074
"""ESRRA""",8.094513,7.87557,7.825837,7.932706,8.143573,7.908348,7.910776,8.310848,8.157666,7.79066,8.156676,7.763692,7.916473,8.305422,7.957523,8.055218,8.141473,8.212826,7.820936,7.888282,7.955044,8.058493


In [5]:
GSE55098_two

gene,GSM1329617,GSM1329622,GSM1329627,GSM1329632,GSM1329637
str,f64,f64,f64,f64,f64
"""RFC2""",7.737012,7.836727,7.794087,7.432892,8.124732
"""HSPA6""",9.286261,8.037874,8.922395,8.983894,9.658149
"""PAX8""",4.728237,5.001011,4.880428,4.856147,4.956531
"""GUCA1A""",4.445629,4.519743,4.519743,4.524788,4.519743
"""THRA""",5.96914,5.784274,6.317857,6.025319,5.573626
"""PTPN21""",3.674374,3.846698,3.886408,3.886408,3.886408
"""CCL5""",13.148091,13.637502,12.916867,12.593212,12.699659
"""CYP2E1""",4.549542,4.557339,4.464209,4.559979,4.650939
"""EPHB3""",5.426595,5.335683,5.426595,5.499359,5.465223
"""ESRRA""",8.117016,8.069888,7.824364,8.130327,8.145904


# Determine common genes per condition

The datasets for each condition come from different sources. 
To use all the samples of a condition together, we have to see which are the common genes.

In [10]:
ra_common_genes = gene_intersection(ra_data)
t1d_common_genes = gene_intersection(t1d_data)
ssc_common_genes = gene_intersection(ssc_data)
sle_common_genes = gene_intersection(sle_data)
sjs_common_genes = gene_intersection(sjs_data)

print(f"""
    Common Genes:  
    
    RA: {len(ra_common_genes)}
    T1D: {len(t1d_common_genes)}
    SSc: {len(ssc_common_genes)}
    SLE: {len(sle_common_genes)}
    SjS: {len(sjs_common_genes)}
""")


    Common Genes:  
    
    RA: 2602
    T1D: 2504
    SSc: 6582
    SLE: 2050
    SjS: 7683


In [11]:
ra_cleaned_df = common_genes_dataframe(ra_data)
t1d_cleaned_df = common_genes_dataframe(t1d_data)
ssc_cleaned_df = common_genes_dataframe(ssc_data)
sle_cleaned_df = common_genes_dataframe(sle_data)
sjs_cleaned_df = common_genes_dataframe(sjs_data)

print(f"""
    Shape (including only common genes): 
    
    RA: {ra_cleaned_df.shape}
    T1D: {t1d_cleaned_df.shape}
    SSc: {ssc_cleaned_df.shape}
    SLE: {sle_cleaned_df.shape}
    SjS: {sjs_cleaned_df.shape}
""")



    Shape (including only common genes): 
    
    RA: (2602, 1123)
    T1D: (2504, 177)
    SSc: (6582, 230)
    SLE: (2050, 2054)
    SjS: (7683, 401)


In [12]:
ra_cleaned_df

gene,GSM2397368,GSM2397369,GSM2397370,GSM2397371,GSM2397372,GSM2397373,GSM2397374,GSM2397375,GSM2397376,GSM2397377,GSM2397378,GSM2397379,GSM2397380,GSM2397381,GSM2397382,GSM2397383,GSM2397384,GSM2397385,GSM2397386,GSM2397387,GSM2397388,GSM2397389,GSM2397390,GSM2397391,GSM940464,GSM940465,GSM940466,GSM940467,GSM940468,GSM940469,GSM940470,GSM940471,GSM940472,GSM940473,GSM940474,GSM940475,…,GSM2981270,GSM2981271,GSM2981273,GSM2981274,GSM2981275,GSM2981277,GSM2981278,GSM2981282,GSM2981284,GSM2981285,GSM2981287,GSM2981288,GSM2981292,GSM2981299,GSM1337304,GSM1337305,GSM1337306,GSM1337307,GSM1337308,GSM1337309,GSM1337310,GSM1337311,GSM1337312,GSM1337313,GSM1337314,GSM1337315,GSM1337316,GSM1337317,GSM1337318,GSM1337319,GSM1337320,GSM1337321,GSM1337322,GSM1337323,GSM1337324,GSM1337325,GSM1337326
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""PRPF8""",10.300174,10.035596,9.946043,9.754634,9.548489,10.454243,10.422701,10.466462,10.416991,10.269101,10.612026,10.432169,9.842217,10.598199,10.527711,10.446505,10.275231,10.303516,10.38749,9.981355,10.248172,10.222183,10.127072,10.109676,10.526841,10.42496,10.264395,10.059819,10.117711,10.260505,10.054314,10.008526,10.546508,10.054489,9.877455,10.23425,…,7.119925,7.882404,7.301386,7.44894,7.511872,6.967906,7.246508,7.3634,7.217893,6.834777,7.061777,7.446133,7.080927,6.934464,10.160033,10.176519,10.342486,10.043809,10.224774,10.281766,10.439942,10.568807,10.433375,10.323734,10.307302,10.5377,10.414889,10.362321,10.83282,10.207693,10.780509,10.533173,10.453166,10.431685,10.290099,10.265424,10.20004
"""CAPNS1""",10.798511,10.615747,10.336886,10.826442,10.006123,10.589632,11.054638,11.031265,11.001215,11.037475,11.048611,11.171153,10.602799,11.299048,11.129337,10.60026,10.783247,10.942931,10.374393,10.97013,10.516086,11.131564,11.341242,10.356657,12.520915,12.526763,12.514427,12.46295,12.598133,12.619007,12.417359,12.270408,12.392827,12.552039,12.661467,12.532013,…,9.598248,10.306487,9.81535,9.87711,10.033491,9.606278,9.306791,10.156029,9.738847,9.302198,9.392385,9.941688,10.051251,9.177352,12.181145,12.523872,12.090685,12.01898,12.264899,12.510742,12.640521,12.670836,12.542486,12.337569,12.349254,12.443957,12.136351,12.23419,12.518501,12.498801,12.293609,12.442576,12.344191,12.097445,12.521696,12.38552,12.347517
"""RPL35""",9.626852,9.906102,9.301532,9.480503,9.220034,9.430944,9.658827,9.151758,9.545525,8.675993,8.941811,9.16816,9.998175,9.165326,9.39243,9.398868,8.567068,9.719124,7.98824,9.128049,9.692231,10.226649,10.243644,8.760298,12.291249,12.603199,12.487063,12.708696,12.332445,12.546499,12.245904,12.309157,12.778988,12.545242,12.512011,12.398779,…,10.159542,10.425637,10.358654,9.926475,10.391847,9.641357,10.863536,9.9132,9.654068,10.248569,10.609523,10.430188,9.670378,10.386452,12.872269,12.784121,13.475936,12.596575,12.725428,12.964772,12.439125,12.365391,12.16772,12.452918,12.541509,13.012694,12.903703,12.760299,12.835132,12.714942,13.017329,12.83234,12.645567,12.842306,12.761665,12.597323,12.039211
"""EIF4G2""",10.906887,11.034909,11.479914,10.822965,11.240902,11.124181,10.867936,10.813486,10.870958,10.997624,10.778594,10.618579,11.078633,10.451386,10.761373,11.121743,10.993374,10.995757,11.7326,11.416099,10.938314,10.927226,10.939713,11.756742,8.856647,8.978837,8.974795,8.97402,8.828827,8.895059,8.846426,8.926981,8.980289,8.92339,8.815299,9.033444,…,10.27754,10.487264,10.434842,10.411115,10.49043,10.362595,10.400059,10.519746,10.417034,10.354741,10.187258,10.323499,10.423264,10.367952,8.891035,8.731318,8.803197,9.004742,8.977229,8.796911,8.945796,8.784818,8.957387,8.834992,8.651417,8.749622,8.820685,8.797222,8.662574,8.697811,8.694772,8.734672,8.878856,8.729406,8.821131,8.823763,8.914625
"""EIF3D""",10.012536,10.060014,9.672631,9.932947,9.755336,9.988787,9.898523,9.872938,10.117764,9.904473,9.950294,9.997951,9.924814,10.279242,10.026822,9.985138,9.712185,10.047398,9.44174,9.846407,10.033734,10.034972,10.256607,9.566959,10.791821,11.08345,10.846693,11.377584,11.0261,11.105787,10.978005,11.039116,11.174745,10.975038,11.218587,11.078547,…,8.55959,9.685738,9.453625,9.302068,9.167857,9.040265,8.834986,9.386986,9.194231,8.834719,8.813779,9.462258,9.232393,8.849218,11.399661,11.55592,11.576193,11.435128,11.095736,11.582461,11.505535,11.358083,11.319633,11.370875,11.118983,11.506252,11.378489,11.36639,11.51997,11.490385,11.798981,11.69424,11.544409,11.175461,11.318008,11.130285,11.049415
"""PARK7""",8.85143,8.786299,8.57732,9.003627,8.297613,8.464054,8.745617,8.811398,8.981532,8.343865,8.802272,8.752875,8.876582,8.981856,8.865111,8.481434,8.555783,8.867129,8.11962,8.993999,8.765954,9.064063,9.297216,8.151186,12.046291,11.864447,11.945154,12.095167,11.745806,11.874272,11.859361,11.74176,11.963679,11.719312,12.073048,11.934192,…,9.096651,9.32621,9.36752,9.429297,9.383394,9.487606,9.16719,9.507132,8.906874,9.211261,9.385746,9.657476,9.625323,8.990624,12.229251,12.179475,12.649769,12.091069,12.183379,12.269187,12.122227,11.931608,12.011842,11.919578,12.232339,12.301144,12.480563,12.437889,12.352416,12.293238,12.524465,12.332405,12.214804,12.272112,12.384915,12.232339,11.855027
"""SRP14""",8.69211,8.60685,9.048156,8.695237,8.567081,8.208721,8.511745,8.389417,8.599481,7.994985,8.19122,8.291987,9.143423,8.08203,8.42594,8.426995,8.092118,8.876174,8.406378,8.928691,8.396433,9.113133,9.176903,8.755166,12.240869,12.351838,12.174052,12.300465,12.277837,12.223097,12.042883,12.063994,12.224776,12.143667,12.229909,12.221393,…,9.956946,9.707894,9.889438,9.835597,9.695191,9.267774,10.245553,9.546475,9.233547,10.013431,9.964908,9.822841,9.689907,10.096109,12.044708,12.365502,12.080553,11.945214,12.044708,11.980851,12.086691,11.946552,11.94325,11.945004,11.79429,12.239176,12.044708,11.98261,12.004288,12.097725,11.998379,12.060758,12.075946,11.860915,12.240921,11.95315,12.00175
"""GDI2""",9.719654,9.97872,10.248648,9.79068,9.957437,9.737455,9.654324,9.715187,9.675309,9.893657,9.913467,9.576051,9.961961,9.38235,9.584085,9.728028,9.694285,9.984663,10.071992,10.179648,9.877713,9.969862,10.13692,10.292576,11.553411,11.670403,11.593286,12.002958,11.750887,11.656724,11.036002,11.206752,11.709237,11.476786,11.556055,11.736227,…,9.486755,9.628617,9.834869,9.806216,9.904071,9.749044,9.639119,9.958226,9.701109,9.811951,9.70284,9.653827,9.736155,9.600132,11.5803,11.555362,11.643939,11.454903,11.526532,11.74651,11.51739,11.396804,11.218069,11.545416,11.551904,11.629795,11.468176,11.485629,11.824352,11.765904,11.84613,12.014944,11.731434,11.693267,11.735996,11.693198,11.652111
"""RPL11""",10.401248,10.504498,10.397176,10.370343,10.056174,10.800289,10.637548,10.179824,10.536041,10.000852,10.187513,9.983243,11.173004,9.90344,10.49582,10.271078,9.493041,10.787019,9.200629,10.286474,10.350378,11.376818,11.203428,9.42554,13.273766,13.166214,13.19703,13.209769,12.973446,13.040458,13.117403,13.016013,13.324036,13.058688,13.255725,13.278277,…,10.812888,11.219187,11.20299,10.727971,11.092273,10.50693,11.281495,10.65244,10.602707,11.305551,11.526104,11.039519,10.97389,11.036077,13.062139,13.275264,13.625257,12.883479,12.935967,13.14794,12.795815,13.028481,12.851096,12.890598,12.591412,13.150466,12.832518,12.865604,13.113446,12.737766,13.015187,12.77205,12.766829,12.936842,13.085008,12.832331,12.599019
"""ARF3""",10.626713,10.124766,10.073174,10.3627,10.102642,10.285412,10.664667,10.572896,10.583315,10.618567,10.545751,10.683704,10.430637,10.506613,10.678565,10.446505,10.935773,10.506458,11.04198,10.868442,10.28888,10.437837,10.815563,10.348162,10.666029,10.306862,10.607278,10.490324,10.780977,10.543412,10.502342,10.542252,10.513902,10.427317,10.414394,10.295966,…,9.75628,10.160415,9.848731,10.055628,9.898772,10.264194,9.405399,9.973769,10.058273,9.611752,9.531126,9.827528,10.164994,9.540439,10.539795,10.812486,10.264111,10.35838,10.296497,10.503848,10.772021,10.67565,10.589931,10.664448,10.982541,10.115003,10.510399,10.933369,10.159218,10.309276,10.368843,10.70756,10.279289,9.833679,10.118108,9.958111,10.683074


In [13]:
total_samples = ra_cleaned_df.shape[1] + t1d_cleaned_df.shape[1] + ssc_cleaned_df.shape[1] + sle_cleaned_df.shape[1] + sjs_cleaned_df.shape[1]
total_samples

3985

Samples seem to be a bit more than the unique ones that appear in the metadata file which is `3854`.
Possibly we need to filter out the extras because for these we have no metadata.  

# Determine frequent genes per group of datasets

The datasets for each condition come from different sources. 
Explore using samples of a condition together, but keeping only the genes that appear in 80% of the samples. 
This is the same logic as the above, but relaxes the constraint and keeping the high-frequency genes instead of just the common ones. 
So it allows for a certain percent of null values.  

In [14]:
# Experiment 
filtered = (
    high_frequency_genes_dataframe(
        dataframes=load_data_per_condition(Condition.SLE, "../data/adex-database/samples"),
        allowed_null_percentage=1,  # will filter later, no filtering here
        drop_frequencies_column=False
    )
    .with_columns(
        pl.sum_horizontal(pl.all().is_null() / pl.all().count())
        .alias("Null-Percentage")
    )
    .filter(pl.col("Null-Percentage") <= 0.01)
)

filtered

gene,GSM2159829_GSM2186668,GSM2159830_GSM2186669,GSM2159831_GSM2186670,GSM2159832_GSM2186671,GSM2159833_GSM2186672,GSM2159834_GSM2186673,GSM2159835_GSM2186674,GSM2159836_GSM2186675,GSM2159837_GSM2186676,GSM2159838_GSM2186677,GSM2159839_GSM2186678,GSM2159840_GSM2186679,GSM2159841_GSM2186680,GSM2159842_GSM2186681,GSM2159843_GSM2186682,GSM2159844_GSM2186683,GSM2159845_GSM2186684,GSM2159846_GSM2186685,GSM2159847_GSM2186686,GSM2159848_GSM2186687,GSM2159849_GSM2186688,GSM2159850_GSM2186689,GSM2159851_GSM2186690,GSM2159852_GSM2186691,GSM2159853_GSM2186692,GSM2159854_GSM2186693,GSM2159855_GSM2186694,GSM2159856_GSM2186695,GSM2159857_GSM2186696,GSM2159858_GSM2186697,GSM2159859_GSM2186698,GSM2159860_GSM2186699,GSM2159861_GSM2186700,GSM2159862_GSM2186701,GSM2159863_GSM2186702,GSM2159864_GSM2186703,…,GSM1863714,GSM1863715,GSM1863716,GSM1863717,GSM1863718,GSM1863719,GSM1863720,GSM1863721,GSM1863722,GSM1863723,GSM1863724,GSM1863725,GSM1863726,GSM1863727,GSM1863728,GSM1863729,GSM1863730,GSM1863731,GSM1863732,GSM1863733,GSM1863734,GSM1863735,GSM1863736,GSM1863737,GSM1863738,GSM1863739,GSM1863740,GSM1863741,GSM1863742,GSM1863743,GSM1863744,GSM1863745,GSM1863746,GSM1863747,GSM1863748,GSM1863749,Null-Percentage
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""EEF1A1""",14.727188,14.795797,14.817517,14.761388,14.747556,14.802662,14.772612,14.817517,14.57622,14.206841,14.825847,14.752155,14.825847,14.825847,14.763379,14.740996,14.825847,14.719846,14.775031,14.729078,14.802662,14.731354,14.769717,14.780421,14.817517,14.745334,14.706219,14.825847,14.621576,14.780941,14.73751,14.698936,14.718909,14.785099,14.786732,14.825847,…,16.545027,17.820075,18.493469,19.329275,18.632252,18.347851,18.349918,18.629831,18.358292,18.022207,18.144812,17.562118,19.170342,18.67002,17.344387,18.655748,18.426595,18.30768,18.340836,18.602275,18.011078,17.815928,17.729573,18.011895,18.307461,18.200168,18.686206,18.273625,18.626904,18.144917,17.896148,18.058938,17.940948,17.825161,17.79348,18.881821,0.0
"""GAPDH""",11.708472,11.65505,12.340323,11.941374,11.870529,12.131457,12.000567,12.299219,12.514883,12.106213,11.916356,12.748576,12.146813,12.20529,12.11357,12.038266,11.873617,12.499638,12.203603,13.301918,12.27173,12.332191,12.352086,12.142406,11.986138,12.923782,12.838461,12.760554,13.111994,12.82303,12.662301,12.556676,12.627911,12.466696,12.263751,12.532706,…,16.888712,15.930099,15.45689,15.527558,15.638986,15.710878,15.638994,16.084079,15.949591,16.806161,16.73309,16.381736,16.590709,15.765959,16.493879,16.080627,16.313399,16.451434,16.016775,15.810099,16.23323,16.339804,16.135183,16.195359,16.318268,16.161138,16.028579,16.129284,15.78345,16.532804,15.99334,16.475451,15.910279,16.026341,16.415398,15.953825,0.0
"""RPS28""",12.334125,12.684471,12.404128,12.224485,12.733298,12.91151,12.646244,12.87063,13.375317,13.530421,13.190688,13.205547,13.171742,13.257166,13.277356,13.080459,13.190688,12.926514,13.317718,12.727908,13.517933,13.305823,13.257166,12.946158,13.465968,12.684471,12.730683,13.446004,13.012509,12.670374,13.313472,13.26406,13.183479,13.313472,13.0869,13.324486,…,12.683078,13.490543,13.4597,13.131236,13.639205,13.771978,13.663596,13.498154,12.943121,13.525818,13.341848,12.876988,13.591069,13.002251,13.073573,13.402047,13.245743,13.884617,13.423776,13.703595,13.545636,13.785906,13.457111,13.955813,13.916188,13.757117,13.563497,14.160874,13.89219,13.431731,13.988091,13.998939,13.9012,13.947265,13.731703,14.274837,0.005341
"""LPP""",10.270518,10.157473,9.924002,10.261215,9.889741,9.69115,10.381211,9.688544,10.905979,10.712206,10.187569,10.034461,10.071536,10.108694,10.276722,10.376196,10.389154,10.063151,10.436256,9.424904,9.724364,10.197448,9.944325,10.315013,10.12907,9.359814,9.160111,9.944919,10.002705,9.718596,9.759793,10.136964,10.113799,10.286929,10.153587,9.66128,…,12.126212,12.051303,11.627417,11.61385,11.855932,11.756698,11.645619,11.616652,12.455816,11.736708,11.547894,12.087966,11.96243,11.80319,11.94949,11.564793,11.832042,11.132743,11.824847,11.448698,11.960393,11.754716,11.351935,11.657794,11.544498,11.775555,11.643457,11.11855,11.353353,11.580846,11.250777,11.404802,11.585813,11.650111,11.814043,11.186168,0.0
"""HNRNPAB""",9.428744,9.626779,9.24285,9.433464,10.176767,9.773827,9.520845,10.020666,9.121851,9.188771,9.310849,9.297978,9.326892,9.666602,9.645398,9.665245,9.62976,9.892971,9.739031,10.036795,9.591478,9.130558,9.471012,9.356092,9.185511,9.800695,9.93442,9.292868,10.117626,9.879411,9.555725,9.868713,9.632567,9.932392,8.987525,9.740472,…,11.127357,11.477925,11.305248,11.255968,11.110342,10.945784,11.179983,11.421711,11.279876,11.757476,11.571169,11.576563,11.06747,11.234738,11.693711,11.588477,11.52488,11.634679,11.377567,11.43241,11.511677,11.602576,11.478515,11.89774,11.609385,11.586355,11.713901,11.821577,11.664829,11.680946,12.044327,11.815843,11.536858,11.662398,11.68726,11.419868,0.005341
"""SERTAD2""",10.878395,10.857395,10.912532,10.91132,10.809722,10.986331,10.488425,10.539112,10.488425,11.004702,10.575976,10.213475,10.258917,10.577801,10.437753,10.552431,10.572944,10.384419,10.696193,10.410467,10.697694,10.761469,10.483742,10.679585,10.972121,11.052792,10.712454,10.169482,10.265448,10.620629,10.373682,10.640492,10.713255,10.346277,10.246524,10.135808,…,11.429776,11.204257,11.116279,11.544809,11.205092,10.86251,11.238046,11.320641,11.490739,11.152489,10.376964,11.518998,11.191118,11.438303,11.183986,10.654101,10.957441,10.947329,11.721783,11.638309,11.870332,11.831164,11.924983,11.490729,11.305936,11.604441,11.811594,11.559135,11.507401,11.45468,11.59823,11.529927,11.6681,11.643928,11.685324,11.191419,0.005341
"""AUP1""",6.682339,6.523731,6.681478,6.799754,6.729421,6.647916,6.770136,6.712803,6.68624,6.811994,6.852654,6.753911,6.964732,6.76706,6.851436,6.908582,6.823081,6.604561,6.813653,7.46521,6.989322,7.024059,6.940406,7.123805,7.02377,6.786023,6.804855,6.668906,6.536248,6.735259,7.018288,7.224904,7.218323,7.006858,7.024333,6.895222,…,12.720095,12.544937,12.889238,12.853719,12.802536,13.05958,12.97211,12.745483,12.718997,13.032622,13.155288,12.573123,12.981549,12.630884,12.981575,13.091263,12.763966,13.047703,12.922448,12.840821,12.847879,12.978594,12.822917,13.015279,12.860932,13.035074,12.936268,12.847449,12.97288,12.996055,13.040369,13.121185,12.814101,12.707347,12.887257,13.144932,0.005341
"""MFN2""",8.3612,9.2768,8.55826,8.436035,8.389405,8.495487,8.232539,8.275898,8.436035,8.627318,8.125586,8.856823,8.210976,8.66317,8.391306,8.381474,8.453367,9.192616,8.562998,8.763644,8.317722,8.511268,8.236602,8.054847,7.831402,8.697788,8.870129,9.04143,8.719067,8.770902,8.121642,8.226159,8.690885,8.268908,8.333067,8.434697,…,13.47017,12.980965,11.698055,11.607766,12.787303,11.753656,12.441578,12.020068,12.55211,12.199759,12.690182,12.601419,12.968764,12.868905,13.318573,11.873715,12.825695,12.13447,12.98986,12.560735,13.348827,13.584113,13.185934,13.042194,12.294371,12.693686,12.11024,12.661628,12.481293,13.005253,12.602113,12.753935,12.896002,13.162255,12.920142,12.895012,0.005341
"""BRD9""",11.123672,11.248276,11.205528,11.194139,10.881058,10.815154,11.077084,10.66156,10.123497,10.723772,10.429994,10.452714,10.294966,10.342431,10.601959,10.535865,10.587269,10.680437,10.492315,10.734742,10.599683,10.579981,10.307659,10.470758,10.347372,10.874624,10.940121,9.945358,10.305112,10.588596,10.670696,10.605209,10.474997,10.802198,10.501856,10.408323,…,11.088167,11.116641,11.563862,11.39446,11.332326,11.490871,11.556275,11.337509,11.104349,11.209293,11.329327,10.959484,11.062725,11.245509,11.122778,11.381965,11.204298,11.558059,11.233425,11.349307,11.129729,11.193376,11.234239,11.245336,11.270786,11.188424,11.302253,11.37671,11.507401,11.224893,11.577473,11.27572,11.376071,11.213389,11.082061,11.375956,0.005341
"""DCK""",10.497371,9.998956,10.189478,9.86767,10.093687,9.469722,10.402428,10.071455,10.40603,9.985641,9.662888,9.473138,9.856085,10.232526,10.277532,10.021891,10.22684,9.689802,9.583005,9.262831,9.807789,9.730286,10.077456,10.097452,10.167003,9.87774,10.069427,10.383796,10.873611,9.465091,9.5481,9.061194,10.220148,9.926984,10.674176,10.418742,…,10.508176,11.484227,11.563565,12.148371,11.496589,11.544969,11.551292,11.560639,11.857245,11.209804,11.020661,11.399772,11.13345,11.789762,11.370884,11.225942,11.741384,11.314793,11.686282,11.657012,11.608291,10.86653,11.3405,11.125418,11.221103,11.512172,11.493717,11.464792,11.720961,11.25605,11.216281,11.194905,11.344282,11.206951,11.357196,11.290448,0.005341


In [15]:
print(f"""
    Shape (all data): 
    RA: {high_frequency_genes_dataframe(ra_data, allowed_null_percentage=1, drop_frequencies_column=True).shape}
    T1D: {high_frequency_genes_dataframe(t1d_data, allowed_null_percentage=1, drop_frequencies_column=True).shape}
    SSc: {high_frequency_genes_dataframe(ssc_data, allowed_null_percentage=1, drop_frequencies_column=True).shape}
    SLE: {high_frequency_genes_dataframe(sle_data, allowed_null_percentage=1, drop_frequencies_column=True).shape}
    SjS: {high_frequency_genes_dataframe(sjs_data, allowed_null_percentage=1, drop_frequencies_column=True).shape}
""")

print(f"""
    Shape (only commom genes): 
    RA: {high_frequency_genes_dataframe(ra_data, allowed_null_percentage=0, drop_frequencies_column=True).shape}
    T1D: {high_frequency_genes_dataframe(t1d_data, allowed_null_percentage=0, drop_frequencies_column=True).shape}
    SSc: {high_frequency_genes_dataframe(ssc_data, allowed_null_percentage=0, drop_frequencies_column=True).shape}
    SLE: {high_frequency_genes_dataframe(sle_data, allowed_null_percentage=0, drop_frequencies_column=True).shape}
    SjS: {high_frequency_genes_dataframe(sjs_data, allowed_null_percentage=0, drop_frequencies_column=True).shape}
""")

allowed_null_percentage = 0.2

print(f"""
    Shape (including only high-frequency genes (genes that appear in 80% of the samples)): 
    RA: {high_frequency_genes_dataframe(ra_data, allowed_null_percentage=allowed_null_percentage, drop_frequencies_column=True).shape}
    T1D: {high_frequency_genes_dataframe(t1d_data, allowed_null_percentage=allowed_null_percentage, drop_frequencies_column=True).shape}
    SSc: {high_frequency_genes_dataframe(ssc_data, allowed_null_percentage=allowed_null_percentage, drop_frequencies_column=True).shape}
    SLE: {high_frequency_genes_dataframe(sle_data, allowed_null_percentage=allowed_null_percentage, drop_frequencies_column=True).shape}
    SjS: {high_frequency_genes_dataframe(sjs_data, allowed_null_percentage=allowed_null_percentage, drop_frequencies_column=True).shape}
""")


    Shape (all data): 
    RA: (28661, 1123)
    T1D: (27581, 177)
    SSc: (20411, 230)
    SLE: (27803, 2054)
    SjS: (23144, 401)

    Shape (only commom genes): 
    RA: (2602, 1123)
    T1D: (2504, 177)
    SSc: (6582, 230)
    SLE: (2050, 2054)
    SjS: (7683, 401)

    Shape (including only high-frequency genes (genes that appear in 80% of the samples)): 
    RA: (28661, 1123)
    T1D: (27581, 177)
    SSc: (20411, 230)
    SLE: (27803, 2054)
    SjS: (23144, 401)


> Observation: Allowing 20% of null values is keeping the whole dataset  

# Available tissues per condition

In [3]:
for condition in Condition:
    dataset = get_pre_processed_dataset(ConditionDataLoader(condition), files_path, metadata_path, datasets_info_path)
    print(f"{condition.name}: {dataset.select('Tissue').unique()}")

RA: shape: (3, 1)
┌───────────────────┐
│ Tissue            │
│ ---               │
│ str               │
╞═══════════════════╡
│ Peripheral blood  │
│ Whole blood       │
│ Synovial membrane │
└───────────────────┘
T1D: shape: (2, 1)
┌──────────────────┐
│ Tissue           │
│ ---              │
│ str              │
╞══════════════════╡
│ Whole blood      │
│ Peripheral blood │
└──────────────────┘
SSc: shape: (3, 1)
┌──────────────────┐
│ Tissue           │
│ ---              │
│ str              │
╞══════════════════╡
│ Whole blood      │
│ Peripheral blood │
│ Skin             │
└──────────────────┘
SLE: shape: (3, 1)
┌──────────────────┐
│ Tissue           │
│ ---              │
│ str              │
╞══════════════════╡
│ Peripheral blood │
│ Skin             │
│ Whole blood      │
└──────────────────┘
SjS: shape: (5, 1)
┌──────────────────┐
│ Tissue           │
│ ---              │
│ str              │
╞══════════════════╡
│ Salivary gland   │
│ Saliva           │
│ Whole blood  

# Sequencing Technique (Microarrays vs RNA-Seq) 

In [17]:
datasets_info = pl.read_csv(datasets_info_path)
datasets_info.shape

(41, 5)

In [18]:
datasets_info

Dataset,Disease,Method,Title,Samples
str,str,str,str,i64
"""GSE10325""","""SLE""","""Expression pro…","""Combined defic…",67
"""GSE104174""","""SSc""","""Expression pro…","""Changes in mac…",72
"""GSE108497""","""SLE""","""Expression pro…","""Illumina Human…",512
"""GSE110169""","""SLE RA""","""Expression pro…","""[HG-U219] Affy…",234
"""GSE110174""","""SLE""","""Expression pro…","""[HT_HG-U133_Pl…",154
"""GSE110914""","""T1D""","""Expression pro…","""Abnormal neutr…",42
"""GSE112341""","""T1D""","""Expression pro…","""Risk variants …",22
"""GSE117931""","""SSc""","""Expression pro…","""Illumina Human…",74
"""GSE11907""","""SLE""","""Expression pro…","""A modular anal…",546
"""GSE12021""","""RA""","""Expression pro…","""Identification…",57


In [20]:
datasets_info.group_by("Method").len()

Method,len
str,u32
"""Expression pro…",31
"""Expression pro…",10


# Per Condition/Sequencing/Tissue stats 

## Testing: 

In [4]:
# Test
sample_df = (
    get_pre_processed_dataset(
        ConditionSequencingTissueDataLoader(
            Condition.SjS, 
            SequencingTechnique.MICROARRAYS, 
            TissueEnum.SALIVA
        ), 
        files_path, 
        metadata_path, 
        datasets_info_path, 
        return_metadata=False
    )
)

sample_df.shape

(20, 20400)

In [6]:
sample_df.head(20)

Sample,RFC2,HSPA6,PAX8,GUCA1A,THRA,PTPN21,CCL5,CYP2E1,EPHB3,ESRRA,CYP2A6,SCARB1,TTLL12,WFDC2,MAPK1,PXK,VPS18,MSANTD3,SLC46A1,TIMD4,SLC39A5,ATP6V1E2,AFG3L1P,CILP2,PIGX,SLC39A13,BEST4,AK9,CORO6,TMEM106A,ALG10,TTC39C,C15orf40,RAX2,MFAP3,EYA3,…,LOC101930453,LOC101928631,MIR186,LOC102723709,LOC102723678,SNORA28,LOC101930071,KRTAP21-1,LOC101930288,ERVK3-1,LOC100128361,CENPP,FAM226B,FAM226A,LOC102724112,LOC100287834,LOC101928042,LOC101927579,LOC643406,GGT8P,LOC102724426,MIR29B1,MIR29A,LOC101927382,LOC102723773,LOC101060596,SMG1P7,LOC400499,LOC388210,MICOS10-NBL1,MROH7-TTC4,LOC102725263,LOC100129924,FAM86DP,FAM86B2,FAM86FP,FAM86B1
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""GSM180372""","""1.949048437782…","""2.055909620746…","""1.869243037720…","""1.785642790690…","""1.851159517929…","""2.184876836274…","""1.741172045361…","""2.009413279487…","""2.072968041506…","""2.162993776504…","""2.122132254169…","""1.871083978684…","""1.781954902030…","""2.046971629173…","""1.930857990728…","""1.760058178112…","""1.924358728474…","""1.937760715467…","""2.051349119450…","""1.909611297415…","""2.017249960249…","""1.976423891424…","""1.844740256154…","""1.899289577035…","""1.635043590172…","""1.794340319554…","""1.981746891549…","""1.803580327060…","""2.154714525056…","""2.103953724699…","""1.963874548764…","""1.855089581235…","""1.833179759581…","""2.155136868107…","""1.826123164784…","""1.823842252751…",…,"""1.813608677876…","""1.921655505833…","""1.798229187562…","""2.218773628512…","""2.218773628512…","""1.711817859705…","""2.104366287992…","""2.005624943411…","""1.824722602007…","""2.062926547876…","""2.096667115234…","""2.096667115234…","""2.019836567193…","""2.019836567193…","""2.103793866979…","""1.928321626884…","""1.940042866848…","""1.940042866848…","""2.249400091479…","""1.842140845867…","""1.950085648200…","""1.802357902962…","""1.802357902962…","""1.664396301913…","""2.132915504032…","""2.132915504032…","""2.132915504032…","""2.263099988435…","""2.263099988435…","""2.129892952028…","""1.796742330937…","""2.074755064941…","""2.097057027474…","""1.725456731611…","""1.725456731611…","""1.725456731611…","""1.725456731611…"
"""GSM180380""","""2.206475937901…","""2.309584841567…","""1.855599162464…","""1.808566476497…","""1.848871881098…","""1.840072669482…","""1.713262578641…","""1.795640081168…","""2.232213608204…","""2.282061304375…","""2.001845666792…","""1.829596687251…","""2.081400859397…","""2.064617444711…","""2.396895148521…","""1.990694282259…","""1.996010387405…","""1.988987221182…","""1.908961045909…","""2.051461247087…","""2.032249214766…","""1.941117494096…","""1.890595672870…","""1.811870582743…","""1.703669798139…","""1.867514424853…","""1.841042859779…","""2.006448903681…","""1.876239471914…","""2.109977180284…","""1.649519461306…","""2.004367685104…","""2.213269330067…","""2.123659383531…","""2.010087184777…","""1.808089709968…",…,"""1.725307374942…","""1.880993208594…","""1.727622841906…","""2.218773628512…","""2.218773628512…","""1.837242993182…","""2.292011720870…","""2.081731824526…","""1.780862936500…","""2.160596409490…","""1.847374262962…","""1.847374262962…","""1.897603702079…","""1.897603702079…","""2.180596538059…","""1.678627420800…","""1.649212028546…","""1.649212028546…","""1.796692323209…","""1.806395776029…","""2.044879462316…","""1.795018021148…","""1.795018021148…","""1.776649276645…","""2.032102606353…","""2.032102606353…","""2.032102606353…","""2.131370375768…","""2.131370375768…","""2.246609200498…","""1.832641369777…","""1.993724911255…","""2.004616345019…","""1.900851354439…","""1.900851354439…","""1.900851354439…","""1.900851354439…"
"""GSM180383""","""1.969827997843…","""3.523030724797…","""1.912279750221…","""2.014404098839…","""1.961961517902…","""2.164415012875…","""1.735512570599…","""1.928883469625…","""1.898976957992…","""2.305233667870…","""2.040542986379…","""1.812109023060…","""1.944027365441…","""1.802885785480…","""2.199263129277…","""1.879696276411…","""2.027874642340…","""1.921128350863…","""1.944882857485…","""2.045311377931…","""2.053576587906…","""2.075205473191…","""1.823740422508…","""1.871294582156…","""2.204485641706…","""2.019286751267…","""1.808364927739…","""1.902840925411…","""1.954458424389…","""1.954705106301…","""1.804237255725…","""1.948191707900…","""2.088991292222…","""2.429050902276…","""1.963911503059…","""1.995886352990…",…,"""1.879627312460…","""1.947766254642…","""1.706894253914…","""2.125308563982…","""2.125308563982…","""1.717888037372…","""1.992118886127…","""1.907117637365…","""1.770439738658…","""1.977829280715…","""1.924022327499…","""1.924022327499…","""1.848143682878…","""1.848143682878…","""2.116645483617…","""1.766533531090…","""1.799411861897…","""1.799411861897…","""1.941336803206…","""1.898740474084…","""1.849288392249…","""1.878399047024…","""1.878399047024…","""1.762392139884…","""2.194237694228…","""2.194237694228…","""2.194237694228…","""3.052432449775…","""3.052432449775…","""2.074799266641…","""1.998560821801…","""1.992247857952…","""2.253311068479…","""1.831480943273…","""1.831480943273…","""1.831480943273…","""1.831480943273…"
"""GSM180378""","""1.866911858033…","""2.224675636034…","""1.933101262972…","""1.952825515855…","""1.848743765876…","""2.195985549914…","""1.710959700907…","""1.885751666167…","""1.885066200829…","""2.031566647697…","""2.010087184777…","""1.992589789398…","""1.749026906515…","""1.985133255282…","""2.096297807990…","""2.100365521791…","""2.074463818781…","""1.681699436187…","""1.966641419062…","""2.055666133517…","""1.877409162011…","""2.091869996862…","""1.795278068520…","""1.874294596143…","""1.668439119384…","""2.214019766781…","""1.816627411113…","""2.000287724831…","""1.922269399842…","""2.010168340752…","""1.877725443623…","""1.960472595289…","""1.687491191581…","""2.081261504934…","""1.929654708710…","""1.981671124036…",…,"""1.838519670156…","""1.741427103832…","""2.124488920706…","""1.987446768204…","""1.987446768204…","""1.817677701682…","""2.061113168923…","""2.216025867648…","""1.745519681742…","""1.864454061157…","""1.934430584538…","""1.934430584538…","""2.420966011180…","""2.420966011180…","""1.929608739999…","""1.649981432051…","""1.673713136757…","""1.673713136757…","""1.831473769853…","""1.867172497112…","""2.266586250365…","""1.950812427093…","""1.950812427093…","""1.882990436167…","""2.196868266257…","""2.196868266257…","""2.196868266257…","""3.057680848995…","""3.057680848995…","""1.711042914007…","""2.287729775360…","""1.877296719397…","""2.176662231093…","""1.862007973429…","""1.862007973429…","""1.862007973429…","""1.862007973429…"
"""GSM180379""","""2.056513077278…","""3.612067166936…","""1.925153002890…","""1.935306448203…","""1.853160282599…","""1.898305634194…","""1.547257237944…","""2.133465378109…","""2.104005384462…","""2.137998280282…","""1.994123576649…","""1.769818805375…","""2.092674905861…","""1.857537708662…","""2.023142516905…","""1.839725885966…","""2.078074205602…","""1.915502299005…","""2.048788784668…","""1.734535675302…","""1.896549413193…","""1.854295266382…","""1.933722687966…","""1.845355689866…","""2.235544896215…","""2.041959108318…","""2.111831588574…","""1.719145236165…","""1.990385809652…","""1.875664591335…","""1.837266682156…","""1.790176701850…","""1.915487497320…","""2.015055234900…","""2.055419697733…","""2.069537396157…",…,"""1.953283999076…","""1.703789559686…","""1.836771878903…","""2.136579799873…","""2.136579799873…","""2.079178638021…","""2.326102503225…","""2.092909818559…","""1.734871382062…","""2.201025159479…","""1.948769911061…","""1.948769911061…","""1.879156350676…","""1.879156350676…","""2.505212311101…","""1.776505338787…","""1.951879358058…","""1.951879358058…","""1.751412992091…","""2.046794330041…","""1.915447135570…","""1.773825249797…","""1.773825249797…","""1.688263894374…","""2.104298429753…","""2.104298429753…","""2.104298429753…","""2.725481954267…","""2.725481954267…","""1.889441401849…","""1.591859584629…","""1.854689260271…","""1.956459734408…","""1.769621038141…","""1.769621038141…","""1.769621038141…","""1.769621038141…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""GSM180382""","""1.915952681658…","""2.800924676641…","""1.908498220873…","""1.925441004137…","""2.079875394119…","""2.109446571420…","""1.697585462151…","""1.953942777951…","""1.949219051121…","""2.055928835875…","""1.834725407761…","""1.848138406361…","""1.899646947697…","""1.938830010803…","""2.017633231816…","""2.042925546572…","""2.126687240761…","""1.706873661762…","""1.959279336872…","""1.986793746302…","""1.837797955028…","""1.914441039069…","""1.954071780198…","""1.839421606303…","""1.817617308649…","""2.006055978730…","""1.888290049830…","""1.777350099690…","""1.808180430763…","""1.866764248178…","""1.742515042220…","""1.909190259718…","""2.012016330335…","""2.109502557059…","""2.008909707527…","""2.153405921685…",…,"""1.868490148564…","""1.918716433160…","""1.777507409992…","""2.171456642931…","""2.171456642931…","""1.798334229023…","""2.255548973513…","""1.852453478490…","""1.854795785907…","""2.185007126876…","""2.065417282555…","""2.065417282555…","""2.110006988663…","""2.110006988663…","""2.084997694457…","""1.832536662104…","""2.042925546572…","""2.042925546572…","""1.616995836626…","""2.160854781133…","""1.931233881487…","""2.117952088491…","""2.117952088491…","""1.746976622286…","""2.148714453823…","""2.148714453823…","""2.148714453823…","""2.227067934766…","""2.227067934766…","""1.891286238632…","""2.316168080276…","""1.727546126232…","""2.094401929614…","""1.767413206701…","""1.767413206701…","""1.767413206701…","""1.767413206701…"
"""GSM180386""","""1.828565704728…","""2.892420679685…","""1.930624679696…","""1.814272693848…","""2.066937324926…","""1.745976412481…","""1.786800624574…","""1.933559106906…","""1.959501554424…","""2.021031703523…","""2.061072645245…","""1.972539148378…","""1.871964928417…","""1.903295233585…","""2.009097712026…","""1.875165948067…","""1.924364754565…","""1.924485964289…","""2.037121879271…","""2.080631184462…","""1.820349872728…","""1.893832110741…","""2.063767437844…","""1.928025097084…","""1.759378286336…","""2.217688122010…","""1.687579941655…","""1.855870158210…","""1.934115597286…","""1.802693627093…","""1.723446747621…","""1.923155038479…","""1.966551610720…","""2.712044461936…","""1.962940240695…","""2.124230087786…",…,"""1.832635314956…","""1.877110840617…","""1.883253464234…","""1.962140522308…","""1.962140522308…","""1.911369487857…","""1.927620321362…","""1.847995477363…","""1.709436442365…","""1.898621938748…","""1.792265150611…","""1.792265150611…","""1.912266246063…","""1.912266246063…","""1.757837875650…","""1.868417737996…","""1.904450535214…","""1.904450535214…","""2.017583490487…","""2.021562388634…","""1.773713016714…","""1.807347316681…","""1.807347316681…","""1.770532826011…","""2.226030792266…","""2.226030792266…","""2.226030792266…","""2.629838037063…","""2.629838037063…","""1.868635534811…","""1.770196816739…","""1.909424094520…","""1.917891443909…","""1.793317478343…","""1.793317478343…","""1.793317478343…","""1.793317478343…"
"""GSM180387""","""1.883230090210…","""2.329898086869…","""1.901396428021…","""1.968273609601…","""1.932557544064…","""2.145556066328…","""2.149681945608…","""2.057173098605…","""1.934898869271…","""1.987388230896…","""2.124042242333…","""1.959109602033…","""1.690659072750…","""1.900467012720…","""2.073263033232…","""1.935766321950…","""2.000104718376…","""1.753312582476…","""2.030343554625…","""2.012243030440…","""1.989943979019…","""1.804325934079…","""1.876660889907…","""2.229680239721…","""1.639599083462…","""2.154495759421…","""1.770663862869…","""1.815248893697…","""2.001950653605…","""2.115467925855…","""2.036416664297…","""2.013206529173…","""1.925447364505…","""2.510076797754…","""2.078359869328…","""2.202916732404…",…,"""1.952237872849…","""2.216489166123…","""1.640208208758…","""1.786085913146…","""1.786085913146…","""1.798757422106…","""2.084283698013…","""2.011661192670…","""1.693148936022…","""2.107540596079…","""2.197926709941…","""2.197926709941…","""1.928680110624…","""1.928680110624…","""1.755044165505…","""2.009920101984…","""1.800759883530…","""1.800759883530…","""1.623054870985…","""1.817224496439…","""1.955905897037…","""1.849373915672…","""1.849373915672…","""2.038561696926…","""2.290380417638…","""2.290380417638…","""2.290380417638…","""1.908441763478…","""1.908441763478…","""2.179762888332…","""1.890067434375…","""1.990085077049…","""2.097954697904…","""1.832045643302…","""1.832045643302…","""1.832045643302…","""1.832045643302…"
"""GSM180373""","""1.921302531524…","""2.173117962900…","""1.876878758148…","""1.853039670928…","""2.043405891732…","""2.098449383551…","""1.648606225793…","""1.924898567260…","""1.948846621102…","""2.016814192869…","""2.049102736808…","""1.842188528313…","""1.760740691964…","""1.964399393038…","""2.144478260373…","""1.988756739304…","""1.959023181508…","""1.783660991673…","""2.079737135710…","""1.925230344216…","""1.961946683676…","""2.015400466225…","""1.976793282323…","""2.262170118352…","""1.776642415682…","""2.541211694778…","""2.096982491880…","""2.074942273483…","""2.083757171516…","""2.181862340130…","""1.710287112638…","""1.998195734177…","""2.127115335466…","""2.175148811800…","""1.814126306612…","""1.853917170319…",…,"""1.958543120629…","""1.702999727915…","""2.297787986125…","""2.368930577892…","""2.368930577892…","""1.782663130325…","""2.095315690188…","""1.944740753666…","""1.857384246301…","""2.141893538697…","""1.879627312460…","""1.879627312460…","""1.636740554051…","""1.636740554051…","""1.873282512243…","""1.570967408530…","""1.563428052550…","""1.563428052550…","""1.846934338143…","""2.051785395369…","""2.098860709402…","""1.96352788152""","""1.96352788152""","""2.147787577474…","""2.171331171118…","""2.171331171118…","""2.171331171118…","""1.801682749254…","""1.801682749254…","""1.857501472251…","""1.918487754457…","""1.794860107965…","""1.860685941713…","""1.860129050361…","""1.860129050361…","""1.860129050361…","""1.860129050361…"


## Stats

In [7]:
for sequencing_technique in SequencingTechnique: 
    for condition in Condition:
        for tissue in TissueEnum: 
            dataset = get_pre_processed_dataset(
                ConditionSequencingTissueDataLoader(
                    condition, 
                    sequencing_technique, 
                    tissue
                ), 
                files_path, 
                metadata_path, 
                datasets_info_path,
                return_metadata=False
            )
            
            if dataset is not None:
                shape = dataset.shape
                print(f"{sequencing_technique.name}|{condition.name}|{tissue.name}: Samples: {shape[0]}, Genes: {shape[1] - 1}")  # -1 in genes because first column is the Sample 

MICROARRAYS|RA|PERIPHERAL_BLOOD: Samples: 302, Genes: 9667
MICROARRAYS|RA|WHOLE_BLOOD: Samples: 513, Genes: 13774
MICROARRAYS|RA|SYNOVIAL_MEMBRANE: Samples: 103, Genes: 13895
MICROARRAYS|T1D|PERIPHERAL_BLOOD: Samples: 79, Genes: 4977
MICROARRAYS|SSc|PERIPHERAL_BLOOD: Samples: 14, Genes: 11853
MICROARRAYS|SSc|WHOLE_BLOOD: Samples: 37, Genes: 14761
MICROARRAYS|SSc|SKIN: Samples: 33, Genes: 10477
MICROARRAYS|SLE|PERIPHERAL_BLOOD: Samples: 812, Genes: 6521
MICROARRAYS|SLE|WHOLE_BLOOD: Samples: 1094, Genes: 8444
MICROARRAYS|SjS|PERIPHERAL_BLOOD: Samples: 222, Genes: 12987
MICROARRAYS|SjS|WHOLE_BLOOD: Samples: 108, Genes: 9683
MICROARRAYS|SjS|PAROTIC_GLAND: Samples: 35, Genes: 22803
MICROARRAYS|SjS|SALIVARY_GLAND: Samples: 15, Genes: 22803
MICROARRAYS|SjS|SALIVA: Samples: 20, Genes: 20399
RNA_SEQ|RA|WHOLE_BLOOD: Samples: 24, Genes: 13635
RNA_SEQ|RA|SYNOVIAL_MEMBRANE: Samples: 180, Genes: 19366
RNA_SEQ|T1D|PERIPHERAL_BLOOD: Samples: 43, Genes: 7433
RNA_SEQ|T1D|WHOLE_BLOOD: Samples: 54, Genes: