In [110]:
import polars as pl

from adex.models import Condition, ConditionDataLoader, ConditionSequencingTissueDataLoader, \
    ConditionSequencingDataLoader, SequencingTechnique, TissueEnum, METADATA_COLUMNS, FileDataLoader, DataLoader
from adex.helpers import get_pre_processed_dataset
from adex.models import DATASET_INFO_COLUMNS
from sklearn import model_selection

# Relative locations of the ADEx database inputs and of the results folder.
adex_database_dir = "../data/adex-database"
data_path = f"{adex_database_dir}/samples"
metadata_path = f"{adex_database_dir}/metadata.csv"
datasets_info_path = f"{adex_database_dir}/datasets_info.csv"
results_path = "../results"

# Strategy: All Differentially expressed genes of the GSE89408 dataset

## Prepare Data

In [111]:
# Load the edgeR differentially-expressed gene list produced for RA / GSE89408.
# `to_series()` takes the first column of the CSV, which is assumed to hold
# the gene identifiers (TODO confirm against the file layout).
de_genes_csv = f"{results_path}/RA_GSE89408/edgeR_de_genes.csv"
de_genes_frame = pl.read_csv(de_genes_csv, has_header=True)
de_genes_ra_GSE89408_synovial = de_genes_frame.to_series().to_list()
len(de_genes_ra_GSE89408_synovial)

148

In [112]:
# Build the modelling table for GSE89408: the "Sample" identifier, one column
# per differentially expressed gene, and a binary `class_label` target.
de_GSE89408 = (
    get_pre_processed_dataset(
        data_loader=FileDataLoader(
            condition=Condition.RA,
            file_name="GSE89408.tsv.parquet",
            genes=de_genes_ra_GSE89408_synovial,
        ),
        data_path=data_path,
        metadata_path=metadata_path,
        datasets_info_path=datasets_info_path,
        return_metadata=True,
    )
    .fill_null(value=0)
    # Drop every metadata column except "Condition", which the label is derived from.
    .drop([column for column in METADATA_COLUMNS if column != "Condition"])
    .drop(DATASET_INFO_COLUMNS)
    .with_columns(
        # 0 = "Healthy", 1 = any other condition. A native comparison + cast is
        # evaluated column-wise by polars instead of calling a Python lambda
        # once per row via `map_elements`; null conditions stay null, as before.
        (pl.col("Condition") != "Healthy").cast(pl.Int8).alias("class_label")
    )
    .drop("Condition")
    .sort("Sample")  # Fix the row order so downstream steps are deterministic
)
de_GSE89408

Sample,AL136295.1,RNF208,AC008993.2,AC090971.1,AC022973.3,AP001273.2,HOXC-AS1,AC007546.1,AC023825.1,TCF4-AS2,B3GALNT1P1,AP003696.1,MTND1P23,AC022079.1,TRGC1,AL035078.4,AC090617.3,AC037198.2,AC006027.1,TAS2R43,AL512504.1,UBE2FP1,RPL26P19,AC133065.1,HIST1H2BB,AL034418.1,C4orf45,AC005224.3,GPR19,RBM43P1,AC099560.2,NUP62CL,AC144521.1,TAS2R64P,HIST1H2BO,KLRC1,…,AC103591.3,AC132217.1,COQ10BP2,HSPE1-MOB4,SPATA20P1,LINC02577,XCL1,AL139099.1,KIF14,MELK,AC008833.1,AC112496.1,E2F7,AC013444.2,AC091185.2,AC005306.1,AC096667.1,LINC02605,AC015911.3,LINC01619,AC211429.1,AL513475.2,C12orf74,AC073896.1,DM1-AS,AC130895.1,AC020909.4,AC113208.2,AD000671.1,PLCH1,AC126283.2,STYK1,AC025259.1,AC005224.2,AP000462.2,AC006272.1,class_label
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8
"""GSM2370970""",0.0,5.706065,1.149989,0.0,1.149989,0.0,7.776255,1.149989,1.149989,1.149989,0.0,0.0,5.184064,1.149989,0.0,0.0,0.0,3.721507,0.0,3.849033,0.0,1.149989,0.0,2.554955,2.826927,3.581605,0.0,3.849033,3.05567,4.175342,1.149989,3.05567,0.0,4.594747,3.25306,3.966197,…,0.0,0.0,0.0,0.0,0.0,1.781672,0.0,0.0,5.083907,4.175342,3.721507,1.781672,4.73344,3.05567,1.149989,6.431933,1.781672,4.798087,3.05567,0.0,6.472111,3.25306,0.0,8.464858,6.622488,1.149989,0.0,5.407698,0.0,3.849033,4.73344,1.781672,0.0,2.826927,2.554955,0.0,0
"""GSM2370971""",0.0,4.13637,7.947064,4.13637,4.13637,6.775631,5.827561,4.651071,3.094096,3.328693,5.274107,5.029652,5.865367,3.707378,2.465583,4.889904,5.707811,4.735153,4.96147,0.0,2.465583,3.530425,4.889904,1.326442,3.864979,5.48308,3.328693,3.530425,2.813799,1.326442,4.735153,3.094096,5.329256,2.00564,1.326442,2.813799,…,2.813799,7.718286,3.707378,6.562024,0.0,0.0,1.326442,7.078922,4.466606,3.094096,7.577331,3.094096,1.326442,1.326442,3.530425,0.0,0.0,4.466606,0.0,3.864979,3.094096,3.864979,2.813799,3.530425,4.364701,5.865367,5.274107,2.813799,5.827561,5.382374,5.094758,4.96147,6.073492,3.328693,5.48308,2.813799,0
"""GSM2370972""",0.0,4.359198,7.657699,4.240043,3.056805,3.314439,5.136107,4.30085,2.90839,4.17656,4.110154,4.521291,4.756554,3.427834,3.630947,0.0,2.742939,3.630947,6.046098,1.782587,3.722698,4.521291,3.808962,3.056805,4.040544,3.808962,3.532962,0.858757,2.742939,3.056805,2.742939,3.630947,3.890357,1.782587,1.393389,2.088786,…,3.314439,6.587614,3.427834,0.0,0.0,2.088786,1.782587,0.0,3.056805,1.393389,3.056805,3.722698,0.0,3.427834,3.056805,1.393389,2.556025,2.556025,4.469258,3.722698,3.056805,4.469258,3.808962,5.381067,3.808962,5.323615,3.808962,4.30085,3.056805,3.427834,7.138709,1.782587,0.0,2.088786,2.90839,2.341239,0
"""GSM2370973""",0.0,3.825534,6.282976,0.0,3.11884,0.0,5.965672,3.58271,3.290575,2.102451,3.290575,4.521211,4.652992,3.933163,0.0,4.214905,0.0,4.521211,3.709225,5.417139,2.923865,4.033316,2.430957,3.11884,1.676383,3.58271,3.58271,3.11884,2.69836,4.033316,2.430957,3.709225,3.11884,6.199209,2.69836,2.430957,…,1.069102,0.0,3.444024,0.0,6.875307,2.923865,0.0,3.11884,4.376172,3.825534,3.290575,2.102451,2.69836,4.297791,3.290575,4.773736,3.709225,4.937789,1.676383,2.430957,5.131011,2.69836,3.933163,7.353349,4.714627,3.58271,0.0,4.521211,0.0,3.11884,5.341011,2.102451,0.0,2.430957,3.58271,2.923865,0
"""GSM2370974""",5.161323,4.537568,7.466484,3.860753,3.277464,4.274953,6.059763,2.543839,3.414511,3.654817,3.860753,4.537568,4.952082,3.860753,4.537568,0.0,3.414511,2.765039,5.758763,4.412228,3.126018,2.956789,2.543839,2.282472,3.654817,3.860753,3.654817,3.761456,1.552158,1.552158,2.282472,1.963042,4.040931,5.623922,3.414511,2.765039,…,3.126018,5.475165,2.956789,0.0,0.0,0.0,0.0,0.0,2.765039,3.654817,3.277464,1.963042,0.975467,1.963042,3.277464,5.905518,2.543839,2.765039,4.040931,3.953653,4.70726,3.761456,3.539662,5.706329,4.652884,5.039452,3.860753,3.953653,4.596378,2.543839,0.0,2.543839,0.0,2.282472,3.761456,0.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""GSM2371183""",7.223622,3.245415,6.934441,5.649734,4.024164,5.353052,4.167273,3.946908,4.527134,3.946908,4.024164,4.579263,2.794609,4.167273,3.778756,6.289089,5.832659,3.86528,5.649734,4.939797,3.686708,3.369022,4.167273,2.794609,6.572382,4.527134,3.946908,4.024164,3.86528,4.473051,4.527134,4.678188,3.482869,6.350373,5.673909,4.024164,…,5.697685,5.956079,4.297458,6.852084,3.588386,1.431192,4.899359,8.181333,6.273352,5.38269,3.369022,3.110216,4.725218,5.468117,2.606461,2.961026,5.054726,5.649734,3.369022,4.024164,2.606461,4.097492,4.899359,4.297458,3.110216,5.017425,4.979133,2.390044,5.522382,4.167273,7.182086,2.794609,5.721076,2.606461,3.369022,4.024164,1
"""GSM2371184""",9.765952,1.352949,6.896388,2.038673,4.968532,5.665017,1.352949,2.288607,2.501569,4.048181,4.556913,4.735877,2.851469,3.747892,4.352562,7.756727,6.284286,3.747892,6.408274,3.472902,4.296607,3.829024,5.398827,3.829024,5.831779,3.661924,4.406426,3.255327,2.501569,4.932269,4.777353,4.238395,0.0,2.038673,5.288405,2.999011,…,5.344672,5.288405,3.905835,8.125291,4.968532,5.137315,2.851469,10.372098,6.100424,6.434437,0.0,0.82958,3.132854,5.642748,2.501569,1.736174,4.932269,4.895071,0.0,2.851469,2.288607,2.288607,3.747892,0.0,1.736174,4.856889,7.544912,0.82958,0.0,4.932269,7.99071,2.288607,4.777353,2.288607,2.038673,4.458352,1
"""GSM2371185""",8.656909,3.476277,7.174895,5.948955,4.017369,5.541681,3.772043,3.476277,3.940138,4.718251,4.090675,5.22096,4.090675,4.932793,3.940138,7.319137,5.404661,3.680031,6.49944,5.690584,4.807931,4.290586,4.226978,4.090675,4.892361,4.892361,2.384187,1.42664,3.476277,5.119522,4.409961,4.622625,3.238939,6.986079,3.858537,3.103816,…,5.515299,6.328143,4.090675,6.937231,3.680031,1.42664,3.103816,9.312345,5.713973,4.572325,4.017369,3.772043,3.476277,5.541681,3.858537,0.882917,4.466137,5.084059,3.772043,3.476277,3.772043,3.362483,3.680031,4.160436,3.238939,4.932793,6.416332,1.820607,5.375623,3.581748,7.393448,3.103816,5.047704,0.882917,3.940138,4.520208,1
"""GSM2371186""",9.90724,0.801585,7.767922,5.361867,3.309785,7.579028,3.19742,4.235142,4.869604,3.845417,3.602317,4.832463,1.313844,4.177068,3.987348,8.65187,6.205851,4.630899,6.65197,4.79434,3.602317,3.987348,4.755183,3.075558,6.036554,3.602317,3.511242,3.768847,2.942446,2.942446,4.941135,3.075558,3.602317,5.986713,5.986713,2.448396,…,3.687983,7.443959,3.602317,6.764702,5.225265,5.623398,3.309785,10.422702,5.687201,6.330964,4.116559,2.795792,5.196321,4.177068,3.768847,1.691108,4.290968,4.586973,2.942446,2.632525,0.0,3.414027,4.586973,3.075558,2.448396,4.586973,6.813169,0.801585,4.975613,2.795792,7.550722,1.691108,5.986713,3.602317,3.987348,0.0,1


In [113]:
# Count the samples per class label: the data is imbalanced between the two classes.
de_GSE89408.select(pl.col("class_label").value_counts())

class_label
struct[2]
"{1,152}"
"{0,28}"


In [114]:
# Feature matrix: every column except the sample identifier and the target
# label, converted to pandas for scikit-learn.
x_df = (
    de_GSE89408
    .select(pl.exclude("Sample", "class_label"))
    .to_pandas()
)
x_df

Unnamed: 0,AL136295.1,RNF208,AC008993.2,AC090971.1,AC022973.3,AP001273.2,HOXC-AS1,AC007546.1,AC023825.1,TCF4-AS2,...,AC020909.4,AC113208.2,AD000671.1,PLCH1,AC126283.2,STYK1,AC025259.1,AC005224.2,AP000462.2,AC006272.1
0,0.000000,5.706065,1.149989,0.000000,1.149989,0.000000,7.776255,1.149989,1.149989,1.149989,...,0.000000,5.407698,0.000000,3.849033,4.733440,1.781672,0.000000,2.826927,2.554955,0.000000
1,0.000000,4.136370,7.947064,4.136370,4.136370,6.775631,5.827561,4.651071,3.094096,3.328693,...,5.274107,2.813799,5.827561,5.382374,5.094758,4.961470,6.073492,3.328693,5.483080,2.813799
2,0.000000,4.359198,7.657699,4.240043,3.056805,3.314439,5.136107,4.300850,2.908390,4.176560,...,3.808962,4.300850,3.056805,3.427834,7.138709,1.782587,0.000000,2.088786,2.908390,2.341239
3,0.000000,3.825534,6.282976,0.000000,3.118840,0.000000,5.965672,3.582710,3.290575,2.102451,...,0.000000,4.521211,0.000000,3.118840,5.341011,2.102451,0.000000,2.430957,3.582710,2.923865
4,5.161323,4.537568,7.466484,3.860753,3.277464,4.274953,6.059763,2.543839,3.414511,3.654817,...,3.860753,3.953653,4.596378,2.543839,0.000000,2.543839,0.000000,2.282472,3.761456,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,7.223622,3.245415,6.934441,5.649734,4.024164,5.353052,4.167273,3.946908,4.527134,3.946908,...,4.979133,2.390044,5.522382,4.167273,7.182086,2.794609,5.721076,2.606461,3.369022,4.024164
176,9.765952,1.352949,6.896388,2.038673,4.968532,5.665017,1.352949,2.288607,2.501569,4.048181,...,7.544912,0.829580,0.000000,4.932269,7.990710,2.288607,4.777353,2.288607,2.038673,4.458352
177,8.656909,3.476277,7.174895,5.948955,4.017369,5.541681,3.772043,3.476277,3.940138,4.718251,...,6.416332,1.820607,5.375623,3.581748,7.393448,3.103816,5.047704,0.882917,3.940138,4.520208
178,9.907240,0.801585,7.767922,5.361867,3.309785,7.579028,3.197420,4.235142,4.869604,3.845417,...,6.813169,0.801585,4.975613,2.795792,7.550722,1.691108,5.986713,3.602317,3.987348,0.000000


In [115]:
# Target frame: the binary class label, exposed as a single pandas column "y".
y_df = (
    de_GSE89408
    .select("class_label")
    .rename({"class_label": "y"})
    .to_pandas()
)
y_df

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0
...,...
175,1
176,1
177,1
178,1


In [116]:
# Stratified 70/30 hold-out split. The fixed seed makes the shuffled split
# reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_df,
    y_df,
    stratify=y_df,  # Keep the class ratio of y in both partitions
    shuffle=True,
    random_state=42,
    train_size=0.7,
)

In [117]:
# Training features; the row labels are the original row positions in x_df.
x_train

Unnamed: 0,AL136295.1,RNF208,AC008993.2,AC090971.1,AC022973.3,AP001273.2,HOXC-AS1,AC007546.1,AC023825.1,TCF4-AS2,...,AC020909.4,AC113208.2,AD000671.1,PLCH1,AC126283.2,STYK1,AC025259.1,AC005224.2,AP000462.2,AC006272.1
13,5.374885,5.069209,4.729543,0.000000,1.465022,0.000000,6.532242,2.840437,3.157536,1.864341,...,0.000000,5.819191,0.000000,2.651231,2.840437,2.176755,0.000000,2.433411,3.007681,0.910992
73,8.482869,1.949256,7.061717,3.109585,4.689403,0.000000,2.267737,4.327569,5.181697,3.936286,...,6.487267,0.966370,0.000000,5.791119,7.366968,4.257345,6.062215,4.689403,5.457013,4.105728
4,5.161323,4.537568,7.466484,3.860753,3.277464,4.274953,6.059763,2.543839,3.414511,3.654817,...,3.860753,3.953653,4.596378,2.543839,0.000000,2.543839,0.000000,2.282472,3.761456,0.000000
5,7.763535,4.355941,7.582487,4.033571,4.033571,5.079104,4.619271,4.355941,5.893153,3.201774,...,3.617796,3.617796,4.619271,4.426415,0.000000,4.281848,0.000000,4.678223,3.491997,5.079104
57,0.000000,2.938921,7.779540,6.340215,4.719174,5.728283,3.544659,4.298032,5.691525,4.719174,...,5.534341,3.841646,0.000000,6.981459,0.000000,4.298032,3.841646,4.087811,3.841646,4.790434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,7.249282,2.201206,7.825369,6.292242,4.981323,4.852470,0.925741,4.608285,3.445760,4.939638,...,6.189286,2.201206,5.435761,5.732818,6.388338,3.765028,3.321436,5.061246,3.185380,5.435761
64,7.378151,2.426147,7.744690,5.832748,5.022228,5.426124,3.820045,4.569717,3.523430,4.621913,...,5.764730,2.169813,5.426124,5.097892,0.000000,1.459340,4.857842,2.832766,4.672287,3.906756
176,9.765952,1.352949,6.896388,2.038673,4.968532,5.665017,1.352949,2.288607,2.501569,4.048181,...,7.544912,0.829580,0.000000,4.932269,7.990710,2.288607,4.777353,2.288607,2.038673,4.458352
89,6.843731,1.474597,8.542349,6.340877,5.449477,6.003526,0.918027,5.193031,5.615729,3.651211,...,6.062321,2.188439,5.449477,4.298668,6.476762,4.161974,5.788198,4.791080,3.749901,5.227692


In [118]:
# Training labels, carrying the same row labels as x_train.
y_train

Unnamed: 0,y
13,0
73,1
4,0
5,0
57,1
...,...
53,1
64,1
176,1
89,1


In [119]:
# Class counts in the training split (the split was stratified on y).
y_train.value_counts()

y
1    106
0     19
Name: count, dtype: int64

In [120]:
# Held-out test features; the row labels are the original row positions in x_df.
x_test

Unnamed: 0,AL136295.1,RNF208,AC008993.2,AC090971.1,AC022973.3,AP001273.2,HOXC-AS1,AC007546.1,AC023825.1,TCF4-AS2,...,AC020909.4,AC113208.2,AD000671.1,PLCH1,AC126283.2,STYK1,AC025259.1,AC005224.2,AP000462.2,AC006272.1
18,7.82782,3.219739,8.101201,5.927814,3.997214,0.0,4.331076,4.2702,5.063206,4.551743,...,5.412135,0.873119,5.966702,4.551743,7.227468,2.582486,4.206642,2.366837,0.873119,3.997214
98,6.288057,3.35857,7.959354,5.465989,6.04003,0.0,0.0,5.361582,5.168794,3.035637,...,2.618922,5.249026,0.0,6.189839,5.531619,4.286473,5.287529,4.360579,3.944872,0.0
166,8.757958,2.998015,6.667347,6.118779,4.78441,6.706492,3.763433,5.326911,4.151664,4.78441,...,5.757994,5.167719,5.167719,4.855791,6.38319,3.904773,4.151664,5.515057,4.546265,3.230634
140,9.647547,1.02594,6.574435,3.747971,3.506254,3.855164,2.038738,4.856972,4.90766,2.627941,...,6.590038,4.749918,0.0,3.954941,6.260193,4.296689,5.778389,5.049843,3.506254,3.632169
29,7.88212,3.040566,7.416682,4.812782,5.027822,4.666471,2.20604,4.50363,4.812782,4.320042,...,5.471475,2.682821,6.159354,4.253259,7.014573,3.326994,5.316553,4.253259,3.190877,5.249605
90,7.532159,2.823497,7.957059,5.73074,4.559386,5.865772,2.417373,4.199114,4.505247,5.355656,...,5.528463,2.16143,5.682767,4.661924,6.656288,2.634679,4.661924,3.89669,5.799835,3.140051
162,9.252759,2.982486,7.413785,6.221819,4.985595,5.852045,3.238416,4.33459,4.675007,2.67115,...,7.111682,2.024394,5.24102,4.159883,7.913412,3.351151,5.380375,1.341476,0.821345,4.388421
105,0.0,3.322967,7.821328,5.210576,4.249021,5.045513,0.0,4.001082,5.427392,3.586021,...,5.524691,3.701505,3.586021,5.673557,0.0,4.171015,0.0,5.171047,3.586021,4.460533
74,9.560818,3.846907,4.16668,0.0,3.025182,3.025182,1.477789,3.549829,3.846907,3.754515,...,4.64964,3.311224,6.645003,5.594197,7.551171,0.0,5.838809,5.511924,0.0,2.449704
45,8.50304,0.0,7.601662,5.082836,4.681919,0.0,3.205202,3.205202,6.147585,4.430082,...,4.124789,2.353716,3.737057,4.681919,7.827353,2.353716,6.578365,4.124789,3.205202,5.248023


In [121]:
# Held-out test labels, carrying the same row labels as x_test.
y_test

Unnamed: 0,y
18,0
98,1
166,1
140,1
29,1
90,1
162,1
105,1
74,1
45,1


In [122]:
# Class counts in the test split (the split was stratified on y).
y_test.value_counts()

y
1    46
0     9
Name: count, dtype: int64