# Create pre-processed datasets for R

> The generated datasets will not be included in the repository (they will be added to `.gitignore`), since they are data derived from the initial parquet files.
> They can be re-created at any time by running this notebook.

In [5]:
from adex.models import Condition, ConditionDataLoader, ConditionSequencingTissueDataLoader, \
    ConditionSequencingDataLoader, SequencingTechnique, TissueEnum, METADATA_COLUMNS
from adex.helpers import get_pre_processed_dataset

# Input locations, relative to this notebook. build_r_dataset forwards all three
# to get_pre_processed_dataset.
# Presumably the directory holding the initial per-sample parquet files - confirm.
data_path = "../data/adex-database/samples"
# CSV passed as `metadata_path`; presumably per-sample metadata (incl. 'Condition') - confirm.
metadata_path = "../data/adex-database/metadata.csv"
# CSV passed as `datasets_info_path`; presumably per-dataset information - confirm.
datasets_info_path = "../data/adex-database/datasets_info.csv"


In [11]:
from adex.models import DATASET_INFO_COLUMNS


def build_r_dataset(
    condition: Condition, 
    sequencing_technique: SequencingTechnique,
    tissue: TissueEnum
) -> None:
    """Export one condition/technique/tissue subset as a CSV file for analysis in R.

    The pre-processed dataset is loaded with its metadata, every metadata and
    dataset-info column except 'Condition' is dropped, and the table is
    transposed. The first row of the transposed table is then promoted to the
    header and the resulting 'Sample' column is renamed to 'gene'.

    The result is written to
    ``../data/r_pre_processed_datasets/<condition>_<technique>_<tissue>.csv``
    (using each argument's ``.name``). The output directory is created when it
    does not exist yet, so the notebook also works on a fresh checkout where
    the derived data is not present.

    Args:
        condition: Condition whose samples are exported.
        sequencing_technique: Sequencing technique to restrict the samples to.
        tissue: Tissue to restrict the samples to.

    NOTE(review): assumes get_pre_processed_dataset returns a polars DataFrame
    whose first column is 'Sample' - confirm against adex.helpers.
    """
    # Local import: keeps this notebook cell self-contained.
    from pathlib import Path

    # We need the condition for R analysis, so it is the one metadata column kept.
    # Built as a concrete list so `drop` does not receive a one-shot iterator.
    metadata_to_drop = [column for column in METADATA_COLUMNS if column != 'Condition']

    transposed = (
        get_pre_processed_dataset(
            data_loader=ConditionSequencingTissueDataLoader(
                condition=condition,
                sequencing_technique=sequencing_technique,
                tissue=tissue
            ),
            data_path=data_path,
            metadata_path=metadata_path,
            datasets_info_path=datasets_info_path,
            return_metadata=True
        )
        .drop(metadata_to_drop)
        .drop(DATASET_INFO_COLUMNS)
        .transpose(include_header=True)
    )

    dataset = (
        transposed
        .rename(transposed.head(1).to_dicts().pop())    # add header
        .slice(1,)                                      # remove first row because it is the header duplicated
        .rename({"Sample": "gene"})                     # fix header
    )

    output_path = (
        Path("../data/r_pre_processed_datasets")
        / f"{condition.name}_{sequencing_technique.name}_{tissue.name}.csv"
    )
    # The output directory is not tracked in the repository, so it may be missing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    dataset.write_csv(output_path)

In [12]:
# Export the RA / RNA-Seq / synovial membrane subset as a CSV for R.
build_r_dataset(
    condition=Condition.RA,
    sequencing_technique=SequencingTechnique.RNA_SEQ,
    tissue=TissueEnum.SYNOVIAL_MEMBRANE,
)

In [13]:
# Export the SLE / RNA-Seq / whole blood subset as a CSV for R.
build_r_dataset(
    condition=Condition.SLE,
    sequencing_technique=SequencingTechnique.RNA_SEQ,
    tissue=TissueEnum.WHOLE_BLOOD,
)