# Suo Dataset

In [1]:
# NOTE(review): placeholder cell; it evaluates the literal 0 and has no other effect.
0

0

In [2]:
import copy
import gc
import os
import sys
import warnings

import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
from scanpy.preprocessing._highly_variable_genes import highly_variable_genes

# NOTE(review): hardcoded absolute path to a local `tardis` checkout; presumably required
# for the `import tardis` below to resolve — confirm, and consider installing the package.
sys.path.append("/home/icb/kemal.inecik/work/codes/tardis")
import tardis
from tardis._utils.preprocessing import (  # noqa
    NA_CELL_TYPE_PLACEHOLDER,
    RANK_GENES_GROUPS_KEY,
    calculate_de_genes,
    deep_memory_usage,
    select_hvgs,
)
# Configuration object; the loading cell reads `config.io_directories["processed"]` from it.
config = tardis.config_server

# Set scanpy's logging verbosity to level 3.
sc.settings.verbosity = 3

In [3]:
# Load the Suo subset, using an on-disk cache (`Suo_raw.h5ad`) when it already exists.
#
# The cache stores the subset with all of its `uns` / `var` content. The in-memory
# object is trimmed afterwards in BOTH branches, so `adata` looks the same whether the
# cell built the cache or read it (previously only the build branch was trimmed).
suo_raw_dir = os.path.join(config.io_directories["processed"], "Suo_raw.h5ad")
GENE_NAME_COLUMN = "hgnc"  # the only `var` column kept in memory

if os.path.isfile(suo_raw_dir):
    print('reading h5ad')
    adata = ad.read_h5ad(suo_raw_dir)
else:
    print('getting h5ad')
    adata_file_path = (
        "/lustre/groups/ml01/workspace/kemal.inecik/hdca/"
        "temp/preprocessing/unification_union_20240330_hvg.h5ad"
    )
    assert os.path.isfile(adata_file_path), f"File does not exist: `{adata_file_path}`"
    adata = ad.read_h5ad(adata_file_path)

    # Keep only observations whose `handle_anndata` equals the Suo handle.
    handle = "Suo"
    adata = adata[adata.obs["handle_anndata"] == handle]
    # Drop variables whose handle-specific gene-name column holds the NA placeholder.
    var_column = f"{GENE_NAME_COLUMN}_{handle}"
    adata = adata[:, adata.var[var_column] != NA_CELL_TYPE_PLACEHOLDER].copy()
    adata.write_h5ad(suo_raw_dir)

# Trim the in-memory object: drop `uns` and every `var` column except the gene name.
del adata.uns
for var_key in sorted(adata.var.keys()):
    if var_key != GENE_NAME_COLUMN:
        del adata.var[var_key]
gc.collect();
adata

getting h5ad


AnnData object with n_obs × n_vars = 841966 × 31789
    obs: 'handle_anndata', 'study', 'sample_ID', 'organ', 'age', 'cell_type', 'lane_ID', 'author_batch', 'institute', 'study_PI', 'doi', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'anatomical_region', 'anatomical_region_level_2', 'sex', 'sex_inferred', 'subject_type', 'sample_status', 'sample_cultured', 'protocol_tissue_dissociation', 'cell_enrichment', 'library_platform', 'strand_sequence', 'sequencing_platform', 'reads_processing', 'biological_unit', 'reference_genome', 'reference_genome_ensembl_release', 'concatenated_integration_covariates'
    var: 'hgnc'

In [4]:
# Cell counts per donor together with age, sex and sample-level metadata.
# observed=True limits the result to key combinations that actually occur, instead of
# expanding every combination of categories (presumably these columns are categorical,
# given the zero-size groups filtered below) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        [
            "integration_donor",
            "age",
            "sex",
            "sex_inferred",
            "integration_biological_unit",
            "integration_sample_status",
        ],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_donor  age   sex  sex_inferred  integration_biological_unit  integration_sample_status
F19_Suo_et_al      10.0  F    F             Cell                         Fresh                         5272
F21_Suo_et_al      16.0  M    M             Cell                         Fresh                        32836
F22_Suo_et_al      9.0   F    F             Cell                         Fresh                        15126
F23_Suo_et_al      11.0  M    M             Cell                         Fresh                        22418
F29_Suo_et_al      17.0  F    F             Cell                         Fresh                        45934
F30_Suo_et_al      14.0  M    M             Cell                         Fresh                        54522
F32_Suo_et_al      7.0   F    F             Cell                         Fresh                        28110
F33_Suo_et_al      9.0   F    F             Cell                         Fresh                        38506
F34_Suo_et_al      8.0   F    F      

In [5]:
# Cell counts per (donor, study) pair.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(["integration_donor", "study"], observed=True).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_donor  study                                
F19_Suo_et_al      Suo_et_al_2022_Science_Popescu2019        5209
                   Suo_et_al_2022_Science_Suo2022              63
F21_Suo_et_al      Suo_et_al_2022_Science_Jardine2021        6064
                   Suo_et_al_2022_Science_Park2020           8291
                   Suo_et_al_2022_Science_Popescu2019        9268
                   Suo_et_al_2022_Science_Suo2022            9213
F22_Suo_et_al      Suo_et_al_2022_Science_Park2020             14
                   Suo_et_al_2022_Science_Popescu2019       14721
                   Suo_et_al_2022_Science_Suo2022             391
F23_Suo_et_al      Suo_et_al_2022_Science_Park2020           5509
                   Suo_et_al_2022_Science_Popescu2019        7464
                   Suo_et_al_2022_Science_Suo2022            9445
F29_Suo_et_al      Suo_et_al_2022_Science_Jardine2021       14838
                   Suo_et_al_2022_Science_Park2020          10164
                   

## Library Platform

In [58]:
# Cell counts per (donor, coarse library platform) pair.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        ["integration_donor", "integration_library_platform_coarse"],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_donor  integration_library_platform_coarse
F19_Suo_et_al      3GEX                                    5272
F21_Suo_et_al      3GEX                                   32836
F22_Suo_et_al      3GEX                                   15126
F23_Suo_et_al      3GEX                                   22418
F29_Suo_et_al      3GEX                                   27037
                   5GEX                                   18897
F30_Suo_et_al      3GEX                                   36642
                   5GEX                                   17880
F32_Suo_et_al      3GEX                                   24209
                   5GEX                                    3901
F33_Suo_et_al      3GEX                                   38506
F34_Suo_et_al      3GEX                                   34374
                   5GEX                                    4762
F35_Suo_et_al      3GEX                                   20452
F37_Suo_et_al      3GEX                          

In [59]:
# Same counts as (donor, platform), but grouped platform-first.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        ["integration_library_platform_coarse", "integration_donor"],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_library_platform_coarse  integration_donor
3GEX                                 F19_Suo_et_al         5272
                                     F21_Suo_et_al        32836
                                     F22_Suo_et_al        15126
                                     F23_Suo_et_al        22418
                                     F29_Suo_et_al        27037
                                     F30_Suo_et_al        36642
                                     F32_Suo_et_al        24209
                                     F33_Suo_et_al        38506
                                     F34_Suo_et_al        34374
                                     F35_Suo_et_al        20452
                                     F37_Suo_et_al        12729
                                     F38_Suo_et_al        37872
                                     F41_Suo_et_al        28460
                                     F45_Suo_et_al        38684
                                     F66_Suo_et_a

## Organ

In [60]:
# Cell counts per (donor, organ) pair.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(["integration_donor", "organ"], observed=True).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_donor  organ                
F19_Suo_et_al      Liver                       63
                   Skin                      5209
F21_Suo_et_al      Bone_marrow               6064
                   Liver                     9268
                   Spleen                    9213
                   Thymus                    8291
F22_Suo_et_al      Liver                    14721
                   Spleen                     350
                   Thymus                      14
                   Yolk_sac                    41
F23_Suo_et_al      Liver                     7464
                   Spleen                    9445
                   Thymus                    5509
F29_Suo_et_al      Bone_marrow              14838
                   Liver                    10221
                   Spleen                   10711
                   Thymus                   10164
F30_Suo_et_al      Bone_marrow              12130
                   Liver                    18642
         

In [61]:
# Same counts as (donor, organ), but grouped organ-first.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(["organ", "integration_donor"], observed=True).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

organ                  integration_donor
Bone_marrow            F21_Suo_et_al         6064
                       F29_Suo_et_al        14838
                       F30_Suo_et_al        12130
                       F38_Suo_et_al         3137
                       F41_Suo_et_al        12176
                       F45_Suo_et_al        11538
                       F50_Suo_et_al        12317
                       F51_Suo_et_al        21471
Gut                    F66_Suo_et_al        13745
                       F67_Suo_et_al         8706
                       F72_Suo_et_al        19029
                       F73_Suo_et_al        26093
                       F78_Suo_et_al         5332
Kidney                 F35_Suo_et_al         5106
                       F38_Suo_et_al         4092
                       F41_Suo_et_al         6310
                       F45_Suo_et_al         3546
Liver                  F19_Suo_et_al           63
                       F21_Suo_et_al         9268
         

## Organ and Library Platform

In [62]:
# Cell counts per (organ, coarse library platform) pair.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        ["organ", "integration_library_platform_coarse"],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

organ                  integration_library_platform_coarse
Bone_marrow            3GEX                                    38319
                       5GEX                                    55352
Gut                    3GEX                                    22451
                       5GEX                                    50454
Kidney                 3GEX                                    16092
                       5GEX                                     2962
Liver                  3GEX                                   136339
                       5GEX                                    69835
Mesenteric_lymph_node  5GEX                                     5902
Skin                   3GEX                                    69360
                       5GEX                                    95409
Spleen                 3GEX                                    47270
                       5GEX                                    79097
Thymus                 3GEX                 

In [63]:
# Same counts as (organ, platform), but grouped platform-first.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        ["integration_library_platform_coarse", "organ"],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_library_platform_coarse  organ                
3GEX                                 Bone_marrow               38319
                                     Gut                       22451
                                     Kidney                    16092
                                     Liver                    136339
                                     Skin                      69360
                                     Spleen                    47270
                                     Thymus                    53030
                                     Yolk_sac                  14207
5GEX                                 Bone_marrow               55352
                                     Gut                       50454
                                     Kidney                     2962
                                     Liver                     69835
                                     Mesenteric_lymph_node      5902
                                     Skin   

In [64]:
# Cell counts per (donor, organ, coarse library platform) triple.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        ["integration_donor", "organ", "integration_library_platform_coarse"],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

integration_donor  organ                  integration_library_platform_coarse
F19_Suo_et_al      Liver                  3GEX                                      63
                   Skin                   3GEX                                    5209
F21_Suo_et_al      Bone_marrow            3GEX                                    6064
                   Liver                  3GEX                                    9268
                   Spleen                 3GEX                                    9213
                   Thymus                 3GEX                                    8291
F22_Suo_et_al      Liver                  3GEX                                   14721
                   Spleen                 3GEX                                     350
                   Thymus                 3GEX                                      14
                   Yolk_sac               3GEX                                      41
F23_Suo_et_al      Liver                  3GEX      

In [65]:
# Cell counts per (organ, coarse library platform, donor) triple.
# observed=True avoids materialising unobserved category combinations (presumably the
# keys are categorical) and silences pandas' `observed` FutureWarning.
with pd.option_context('display.max_rows', None):
    k = adata.obs.groupby(
        ["organ", "integration_library_platform_coarse", "integration_donor"],
        observed=True,
    ).size()
    display(k[k != 0])  # filter kept as a safeguard; empty groups are never displayed

organ                  integration_library_platform_coarse  integration_donor
Bone_marrow            3GEX                                 F21_Suo_et_al         6064
                                                            F29_Suo_et_al         7925
                                                            F30_Suo_et_al        10259
                                                            F38_Suo_et_al         3137
                                                            F41_Suo_et_al         5804
                                                            F45_Suo_et_al         5130
                       5GEX                                 F29_Suo_et_al         6913
                                                            F30_Suo_et_al         1871
                                                            F41_Suo_et_al         6372
                                                            F45_Suo_et_al         6408
                                                    

## Decision

- Training v01_01:
    - integration_library_platform_coarse
    - integration_donor
    - organ

- Training v01_02:
    - __(covariate)__ integration_library_platform_coarse
    - __(covariate)__ integration_donor
    - organ

- Training v01_03:
    - __(covariate)__ integration_library_platform_coarse
    - integration_donor
    - organ
 
- Training v01_04:
    - integration_library_platform_coarse
    - __(covariate)__ integration_donor
    - organ
 
- Training v01_05:
    - __(covariate)__ integration_library_platform_coarse
    - __(covariate)__ integration_donor
    - __(covariate)__ organ

- Training v02_01:
    - integration_library_platform_coarse
    - __(covariate)__ integration_donor

- Training v02_02:
    - integration_library_platform_coarse
    - integration_donor

- Training v02_03:
    - __(covariate)__ integration_library_platform_coarse
    - integration_donor

- Training v02_04:
    - __(covariate)__ integration_library_platform_coarse
    - __(covariate)__ integration_donor

Use the `dataset_complete_Suo.h5ad` AnnData object.