# Create pre-processed datasets for R

> The datasets generated here will not be included in the repository (they will be added to `.gitignore`), since they are derived from the initial parquet files. 
> They can be re-created by running this notebook.  

In [1]:
from adex.models import Condition, ConditionDataLoader, ConditionSequencingTissueDataLoader, \
    ConditionSequencingDataLoader, SequencingTechnique, TissueEnum, METADATA_COLUMNS, FileDataLoader, DataLoader
from adex.helpers import get_pre_processed_dataset

# Input locations inside the ADEx database folder (relative paths).
_adex_database_dir = "../data/adex-database"
data_path = f"{_adex_database_dir}/samples"
metadata_path = f"{_adex_database_dir}/metadata.csv"
datasets_info_path = f"{_adex_database_dir}/datasets_info.csv"


In [7]:
from adex.models import DATASET_INFO_COLUMNS


def build_r_dataset(
    data_loader: DataLoader
) -> None:
    transposed = (get_pre_processed_dataset(
        data_loader=data_loader,
        data_path=data_path,
        metadata_path=metadata_path,
        datasets_info_path=datasets_info_path,
        return_metadata=True
    ).fill_null(value=0)
    .drop(filter(lambda item: item != 'Condition',METADATA_COLUMNS)) # We need the condition for R analysis 
    .drop(DATASET_INFO_COLUMNS)
    .transpose(include_header=True))

    dataset = (
        transposed
        .rename(transposed.head(1).to_dicts().pop())    # add header
        .slice(1,)                                      # remove first row because it is the header duplicated
        .rename({"Sample": "gene"})                     # fix header
    )
    
    match data_loader: 
        case ConditionSequencingTissueDataLoader(condition, sequencing_technique, tissue, _):
            dataset.write_csv(f"../data/r_pre_processed_datasets/{condition.name}_{sequencing_technique.name}_{tissue.name}.csv")
        case FileDataLoader(condition, file_name, _, _):
            dataset.write_csv(f"../data/r_pre_processed_datasets/{condition.name}_{file_name.strip('.tsv.parquet')}.csv")
        case _:
            raise ValueError(f"DataLoader '{data_loader}' not handled in Create_Datasets_For_R")
    

# RNA SEQ datasets

In [12]:
# RA, RNA_SEQ, SYNOVIAL_MEMBRANE
ra_synovial_membrane_loader = ConditionSequencingTissueDataLoader(
    condition=Condition.RA,
    sequencing_technique=SequencingTechnique.RNA_SEQ,
    tissue=TissueEnum.SYNOVIAL_MEMBRANE,
)
build_r_dataset(ra_synovial_membrane_loader)

# Same RA export, but restricted to a single file so that the bad dataset
# (i.e. GSE90081) is left out.
ra_gse89408_loader = FileDataLoader(
    condition=Condition.RA,
    file_name="GSE89408.tsv.parquet",
)
build_r_dataset(ra_gse89408_loader)


In [13]:
# SLE, RNA_SEQ, WHOLE_BLOOD
sle_whole_blood_loader = ConditionSequencingTissueDataLoader(
    condition=Condition.SLE,
    sequencing_technique=SequencingTechnique.RNA_SEQ,
    tissue=TissueEnum.WHOLE_BLOOD,
)
build_r_dataset(sle_whole_blood_loader)

In [4]:
# SSc, RNA_SEQ, PERIPHERAL_BLOOD
ssc_peripheral_blood_loader = ConditionSequencingTissueDataLoader(
    condition=Condition.SSc,
    sequencing_technique=SequencingTechnique.RNA_SEQ,
    tissue=TissueEnum.PERIPHERAL_BLOOD,
)
build_r_dataset(ssc_peripheral_blood_loader)

In [5]:
# SSc, RNA_SEQ, WHOLE_BLOOD
ssc_whole_blood_loader = ConditionSequencingTissueDataLoader(
    condition=Condition.SSc,
    sequencing_technique=SequencingTechnique.RNA_SEQ,
    tissue=TissueEnum.WHOLE_BLOOD,
)
build_r_dataset(ssc_whole_blood_loader)