# Creates a dataset for R from the GSE89408 file, restricted to the train-set samples

In [5]:
from typing import List

import polars as pl

from adex.models import Condition, ConditionDataLoader, ConditionSequencingTissueDataLoader, \
    ConditionSequencingDataLoader, SequencingTechnique, TissueEnum, METADATA_COLUMNS, FileDataLoader, DataLoader
from adex.helpers import get_pre_processed_dataset

# Relative locations of the ADEx database inputs used by the cells below.
datasets_info_path: str = "../data/adex-database/datasets_info.csv"
metadata_path: str = "../data/adex-database/metadata.csv"
data_path: str = "../data/adex-database/samples"

In [6]:
# Read the train-split sample identifiers for GSE89408.
# The CSV is read without a header row, and its first column is converted
# into a plain Python list.
train_samples_frame = pl.read_csv(
    "../data/ml/GSE89408_train_set_samples.csv",
    has_header=False,
)
train_set_samples: List[str] = train_samples_frame.to_series().to_list()
len(train_set_samples)  # last expression of the cell: shown as the cell output

125

In [7]:
from adex.models import DATASET_INFO_COLUMNS

# Build the R input for GSE89408 using only the train-split samples:
#   1. load the pre-processed dataset together with its metadata,
#   2. keep 'Condition' but drop every other metadata / dataset-info column,
#   3. transpose the frame and promote the first transposed row to the header,
#   4. write the result as CSV.
data_loader = FileDataLoader(
    condition=Condition.RA,
    file_name="GSE89408.tsv.parquet",
    samples=train_set_samples,
)

# We need the condition for R analysis, so every metadata column except
# 'Condition' is dropped. A concrete list is built here instead of handing a
# lazy, one-shot `filter` iterator to `DataFrame.drop`.
metadata_columns_to_drop = [
    column for column in METADATA_COLUMNS if column != 'Condition'
]

transposed = (
    get_pre_processed_dataset(
        data_loader=data_loader,
        data_path=data_path,
        metadata_path=metadata_path,
        datasets_info_path=datasets_info_path,
        return_metadata=True,
    )
    .fill_null(value=0)                 # replace missing values with 0
    .drop(metadata_columns_to_drop)
    .drop(DATASET_INFO_COLUMNS)
    .transpose(include_header=True)     # original column names become the first column
)

# After the transpose the first row holds the values that should act as column
# names; map the current column names onto them.
header_mapping = transposed.head(1).to_dicts().pop()

dataset = (
    transposed
    .rename(header_mapping)             # add header
    .slice(1)                           # remove first row because it is the header duplicated
    .rename({"Sample": "gene"})         # fix header
)

dataset.write_csv("../data/r_pre_processed_datasets/GSE89408_only_train_samples.csv")
