# Suo

In [1]:
import warnings
import os
import sys
import gc
import warnings

In [2]:
import anndata as ad
import scanpy as sc
import copy
import torch
from pathlib import Path
import networkx as nx
from sklearn.neighbors import kneighbors_graph
import numpy as np
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.stats

# Make the local `tardis` checkout importable. The membership guard keeps
# `sys.path` free of duplicate entries when this cell is executed again.
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine where this checkout exists.
_tardis_repo = "/home/icb/kemal.inecik/work/codes/tardis"
if _tardis_repo not in sys.path:
    sys.path.append(_tardis_repo)
import tardis

# Point the package-level `config` attribute at `config_server`; later cells
# read `tardis.config.io_directories` to locate input and output files.
tardis.config = tardis.config_server

In [3]:
# Report whether PyTorch can see a CUDA device (this is an availability check).
cuda_available = torch.cuda.is_available()
print("CUDA used: {}".format(cuda_available))

CUDA used: False


In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
# Restore the plotting style from a pickled object and feed it to
# `plt.rcParams.update`.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
_rcparams_path = "/home/icb/kemal.inecik/work/codes/tardis/training/local/figures/rcparams.pickle"
with open(_rcparams_path, mode='rb') as rcparams_handle:
    _rcparams = pickle.load(rcparams_handle)

plt.rcParams.update(_rcparams)

In [5]:
# Load the complete Suo dataset from the configured "processed" directory.
adata_file_path = os.path.join(tardis.config.io_directories["processed"], "dataset_complete_Suo.h5ad")

# Fail early with an explicit exception: unlike `assert`, this check is not
# stripped when Python runs with `-O`, and the message states the actual problem.
if not os.path.isfile(adata_file_path):
    raise FileNotFoundError(f"File does not exist: `{adata_file_path}`")

adata = ad.read_h5ad(adata_file_path)

# Echo the resolved path and the AnnData summary so the cell output records
# exactly which file was read.
print(adata_file_path, flush=True)
print(adata, flush=True)

/lustre/groups/ml01/workspace/kemal.inecik/tardis_data/processed/dataset_complete_Suo.h5ad


AnnData object with n_obs × n_vars = 841922 × 8192
    obs: 'sample_ID', 'organ', 'age', 'cell_type', 'sex', 'sex_inferred', 'concatenated_integration_covariates', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'n_genes'
    uns: 'rank_genes_groups'
    obsm: 'Unintegrated', 'X_pca', 'harmony'


In [6]:
# Number of cells for every organ x library-platform combination.
# `observed=False` is passed explicitly so that combinations of categorical
# groupers with no cells keep showing up as zero rows regardless of the
# pandas default (this also silences the FutureWarning about that default).
organ_platform_counts = adata.obs.groupby(
    ["organ", "integration_library_platform_coarse"],
    observed=False,
).size()

# Lift the row limit only while rendering, so the full table is visible.
with pd.option_context('display.max_rows', None):
    display(organ_platform_counts)

organ                  integration_library_platform_coarse
Bone_marrow            3GEX                                    38316
                       5GEX                                    55351
Gut                    3GEX                                    22450
                       5GEX                                    50454
Kidney                 3GEX                                    16092
                       5GEX                                     2962
Liver                  3GEX                                   136327
                       5GEX                                    69829
Mesenteric_lymph_node  3GEX                                        0
                       5GEX                                     5902
Skin                   3GEX                                    69359
                       5GEX                                    95406
Spleen                 3GEX                                    47264
                       5GEX                 

In [7]:
# Number of cells per (organ, age, donor); only combinations that occur are shown.
# `observed=True` avoids building the full product of categories just to
# filter the empty groups out again, and silences the pandas FutureWarning
# about the changing default. The `!= 0` filter is kept as a cheap guard.
organ_age_donor_counts = adata.obs.groupby(
    ["organ", "age", "integration_donor"],
    observed=True,
).size()

with pd.option_context('display.max_rows', None):
    display(organ_age_donor_counts[organ_age_donor_counts != 0])

organ                  age   integration_donor
Bone_marrow            12.0  F45_Suo_et_al        11538
                       13.0  F38_Suo_et_al         3135
                       14.0  F30_Suo_et_al        12129
                             F51_Suo_et_al        21471
                       15.0  F50_Suo_et_al        12317
                       16.0  F21_Suo_et_al         6064
                             F41_Suo_et_al        12175
                       17.0  F29_Suo_et_al        14838
Gut                    12.0  F67_Suo_et_al         8706
                       15.0  F66_Suo_et_al        13744
                             F73_Suo_et_al        26093
                       16.0  F72_Suo_et_al        19029
                       17.0  F78_Suo_et_al         5332
Kidney                 7.0   F35_Suo_et_al         5106
                       12.0  F45_Suo_et_al         3546
                       13.0  F38_Suo_et_al         4092
                       16.0  F41_Suo_et_al         6310
L

In [8]:
# Keep only cells with organ "Liver ", library platform "3GEX" and sex "F".
# NOTE(review): the organ label contains a trailing space. The count table
# displayed right after this cell is non-empty, so the literal does match
# stored values; confirm against the data before normalising it.
is_liver = adata.obs["organ"] == "Liver "
is_3gex = adata.obs["integration_library_platform_coarse"] == "3GEX"
is_sex_f = adata.obs["sex"] == "F"

# Boolean indexing rebinds `adata` to the selected subset.
adata = adata[is_liver & is_3gex & is_sex_f]

In [9]:
# Number of cells per (age, sex, donor) in the current subset.
# `observed=True` skips category combinations without cells instead of
# creating them and filtering them out, and silences the pandas FutureWarning
# about the changing default. The `!= 0` filter is kept as a cheap guard.
age_sex_donor_counts = adata.obs.groupby(
    ["age", "sex", "integration_donor"],
    observed=True,
).size()

with pd.option_context('display.max_rows', None):
    display(age_sex_donor_counts[age_sex_donor_counts != 0])

age   sex  integration_donor
7.0   F    F32_Suo_et_al         8483
           F35_Suo_et_al         6667
8.0   F    F34_Suo_et_al        19269
9.0   F    F22_Suo_et_al        14719
           F33_Suo_et_al        23815
10.0  F    F19_Suo_et_al           63
12.0  F    F45_Suo_et_al         6255
16.0  F    F41_Suo_et_al         9700
17.0  F    F29_Suo_et_al         7630
dtype: int64

In [10]:
# Keep one donor per (age, sex) pair. `drop_duplicates` retains the first row
# it encounters for each pair, so the chosen donor depends on the row order
# of `adata.obs`.
first_row_per_pair = adata.obs.drop_duplicates(subset=["age", "sex"], keep="first")
donors = first_row_per_pair["integration_donor"].to_list()

# Also drop age 10 entirely.
# NOTE(review): the preceding count table lists only 63 cells for age 10,
# which is presumably why it is excluded; confirm.
from_selected_donor = adata.obs["integration_donor"].isin(donors)
not_age_10 = adata.obs["age"] != 10
adata = adata[from_selected_donor & not_age_10]

In [11]:
# Sanity check: count cells per combination of all selection covariates, to
# confirm which donor / platform / organ combinations remain.
# With six groupers the product of unobserved categories can be very large;
# `observed=True` builds only the combinations that occur and silences the
# pandas FutureWarning about the changing default. The `!= 0` filter is kept
# as a cheap guard.
covariate_columns = [
    "age",
    "sex",
    "integration_donor",
    "integration_library_platform_coarse",
    "organ",
    "concatenated_integration_covariates",
]
covariate_counts = adata.obs.groupby(covariate_columns, observed=True).size()

with pd.option_context('display.max_rows', None):
    display(covariate_counts[covariate_counts != 0])

age   sex  integration_donor  integration_library_platform_coarse  organ   concatenated_integration_covariates
7.0   F    F32_Suo_et_al      3GEX                                 Liver   F32_Suo_et_al_Cell_Fresh_3GEX           8483
8.0   F    F34_Suo_et_al      3GEX                                 Liver   F34_Suo_et_al_Cell_Fresh_3GEX          19269
9.0   F    F33_Suo_et_al      3GEX                                 Liver   F33_Suo_et_al_Cell_Fresh_3GEX          23815
12.0  F    F45_Suo_et_al      3GEX                                 Liver   F45_Suo_et_al_Cell_Fresh_3GEX           6255
16.0  F    F41_Suo_et_al      3GEX                                 Liver   F41_Suo_et_al_Cell_Fresh_3GEX           9700
17.0  F    F29_Suo_et_al      3GEX                                 Liver   F29_Suo_et_al_Cell_Fresh_3GEX           7630
dtype: int64

In [12]:
# Drop cells that belong to a sparsely populated (cell_type, age) group.
group_keys = ["cell_type", "age"]
min_cells_exclusive = 30  # a group is kept only if it has strictly more cells than this

df = adata.obs.copy()

# `observed=True` counts only the combinations that occur; unobserved
# combinations have size 0 and could never pass the threshold anyway. It also
# silences the pandas FutureWarning about the changing default.
group_sizes = df.groupby(group_keys, observed=True).size()
valid_groups = group_sizes[group_sizes > min_cells_exclusive].index

# Row-wise mask: True where the row's (cell_type, age) pair is a kept group.
filtered_bool = df.set_index(group_keys).index.isin(valid_groups)
print(filtered_bool.sum())
adata = adata[filtered_bool]

73697


In [13]:
# Number of cells per (cell_type, age) group that survived the size filter.
# `observed=True` skips combinations without cells instead of creating them
# and filtering them out, and silences the pandas FutureWarning about the
# changing default. The `> 0` filter is kept as a cheap guard.
celltype_age_counts = adata.obs.groupby(["cell_type", "age"], observed=True).size()

with pd.option_context('display.max_rows', None):
    display(celltype_age_counts[celltype_age_counts > 0])

cell_type                              age 
B1 cells                               9.0       37
                                       16.0      34
CD4 T                                  16.0     173
                                       17.0      88
CD5- mature B                          9.0       59
                                       16.0     111
                                       17.0     156
CD8 T                                  16.0      46
CMP                                    8.0       53
                                       9.0       48
                                       12.0      38
Cycling DC                             7.0       44
                                       8.0       96
                                       9.0       41
                                       12.0      39
                                       16.0      49
                                       17.0      32
Cycling ILC                            8.0       69
                    

In [14]:
# Show the summary of the filtered object; the bare expression renders the
# AnnData repr as the cell output (here a view with the remaining cells).
adata

View of AnnData object with n_obs × n_vars = 73697 × 8192
    obs: 'sample_ID', 'organ', 'age', 'cell_type', 'sex', 'sex_inferred', 'concatenated_integration_covariates', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'n_genes'
    uns: 'rank_genes_groups'
    obsm: 'Unintegrated', 'X_pca', 'harmony'

In [15]:
# Persist the filtered subset in the configured "processed" directory under a
# separate file name, so the complete input dataset is left untouched.
processed_dir = tardis.config.io_directories["processed"]
adata_write_path = os.path.join(processed_dir, "dataset_complete_Suo_age.h5ad")

adata.write_h5ad(adata_write_path)