In [1]:
import scanpy as sc
from sklearn.model_selection import train_test_split
# Root directory for every .h5ad file this notebook reads or writes.
# Keep the trailing slash: paths are built by plain string concatenation
# (DATA_DIR + '<file>.h5ad'), so dropping it would silently change the paths.
DATA_DIR = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/till-data/'


In [None]:
# Load the source AnnData file that the next cell splits into
# train / val / test, and show its summary as the cell output.
raw_h5ad_path = DATA_DIR + 'local.h5ad'
adata_raw = sc.read_h5ad(raw_h5ad_path)
adata_raw

In [None]:
# Split the cells 80/10/10 into train/val/test by observation name and write
# each split, still unnormalized, to its own .h5ad file.

# Split membership is tested by *name* below, so a duplicated obs name would
# pull every cell carrying that name into more than one split (leakage).
assert adata_raw.obs_names.is_unique, "obs_names must be unique to split cells by name"

train, test = train_test_split(adata_raw.obs_names, test_size=0.2, random_state=0)
# Halve the 20% hold-out into validation and test (fixed seed for reproducibility).
val, test = train_test_split(test, test_size=0.5, random_state=0)

for split_name, split_obs in (("train", train), ("val", val), ("test", test)):
    # Boolean mask keeps the cells in their original order within each split.
    split_adata = adata_raw[adata_raw.obs_names.isin(split_obs), :]
    split_adata.write(DATA_DIR + split_name + '_adata_unnormalized.h5ad')
    print(split_name, split_adata.shape)

# Free the full matrix; the cells below reload each split from disk.
del adata_raw, split_adata

### Train adata

In [2]:
# Preprocess the training split: drop near-empty genes and cells, keep the
# un-normalized matrix in a layer, normalize + log-transform X, then reduce
# to the top highly variable genes.
train_in_path = DATA_DIR + 'train_adata_unnormalized.h5ad'
train_adata = sc.read_h5ad(train_in_path)

# Genes first, then cells: each must carry at least 3 counts in total.
sc.pp.filter_genes(train_adata, min_counts=3)
sc.pp.filter_cells(train_adata, min_counts=3)

# Snapshot X before it is normalized; the seurat_v3 HVG call below reads
# this layer instead of the log-normalized X.
train_adata.layers["counts"] = train_adata.X.copy()
sc.pp.normalize_total(train_adata, target_sum=10000)
sc.pp.log1p(train_adata)

# Keep the log-normalized matrix with all remaining genes in .raw before
# the gene axis is cut down to the HVGs.
train_adata.raw = train_adata

hvg_options = dict(
    flavor="seurat_v3",
    n_top_genes=2000,
    layer="counts",
    batch_key="dataset",
    subset=True,
)
sc.pp.highly_variable_genes(train_adata, **hvg_options)
train_adata



AnnData object with n_obs × n_vars = 467955 × 2000
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'dataset', 'entropy_dataset_leiden_3', 'entropy_original_ann_level_1_leiden_3', 'entropy_original_ann_level_2_clean_leiden_3', 'entropy_original_ann_level_3_clean_leiden_3', 'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'n_genes_detected', 'original_ann_highest_res', 'original_ann_level_1', 'o

In [3]:
# Persist the preprocessed training split, then release it from memory.
train_out_path = DATA_DIR + 'train_adata.h5ad'
train_adata.write_h5ad(train_out_path)
del train_adata

### Val adata

In [4]:
# Preprocess the validation split in the feature space of the training split.
#
# Gene filtering and HVG selection are deliberately NOT re-fit here: fitting
# them on this split would give it its own 2000 genes (different columns, in
# a different order, than train) and would select features using held-out
# data. Instead the gene sets are taken from train_adata.h5ad, which the
# training cells above must have written before this cell runs.
train_ref = sc.read_h5ad(DATA_DIR + 'train_adata.h5ad', backed='r')
train_filtered_genes = train_ref.raw.var_names.copy()  # genes that survived train's filter_genes
train_hvg_genes = train_ref.var_names.copy()           # train's HVGs, in train's column order
train_ref.file.close()
del train_ref

val_adata = sc.read_h5ad(DATA_DIR + 'val_adata_unnormalized.h5ad')
# Mirror the training pipeline step by step, with train's gene sets
# standing in for filter_genes / highly_variable_genes.
val_adata = val_adata[:, train_filtered_genes].copy()
sc.pp.filter_cells(val_adata, min_counts=3)
# Snapshot X before it is normalized.
val_adata.layers["counts"] = val_adata.X.copy()
sc.pp.normalize_total(val_adata, target_sum=10000)
sc.pp.log1p(val_adata)
# Keep the log-normalized matrix over all filtered genes in .raw.
val_adata.raw = val_adata
val_adata = val_adata[:, train_hvg_genes].copy()

assert list(val_adata.var_names) == list(train_hvg_genes), "val genes must match train HVGs"
val_adata



In [5]:
# Persist the preprocessed validation split, then release it from memory.
val_out_path = DATA_DIR + 'val_adata.h5ad'
val_adata.write_h5ad(val_out_path)
del val_adata

### Test adata

In [6]:
# Preprocess the test split in the feature space of the training split.
#
# Gene filtering and HVG selection are deliberately NOT re-fit here: fitting
# them on this split would give it its own 2000 genes (different columns, in
# a different order, than train) and would select features using held-out
# data. Instead the gene sets are taken from train_adata.h5ad, which the
# training cells above must have written before this cell runs.
train_ref = sc.read_h5ad(DATA_DIR + 'train_adata.h5ad', backed='r')
train_filtered_genes = train_ref.raw.var_names.copy()  # genes that survived train's filter_genes
train_hvg_genes = train_ref.var_names.copy()           # train's HVGs, in train's column order
train_ref.file.close()
del train_ref

test_adata = sc.read_h5ad(DATA_DIR + 'test_adata_unnormalized.h5ad')
# Mirror the training pipeline step by step, with train's gene sets
# standing in for filter_genes / highly_variable_genes.
test_adata = test_adata[:, train_filtered_genes].copy()
sc.pp.filter_cells(test_adata, min_counts=3)
# Snapshot X before it is normalized.
test_adata.layers["counts"] = test_adata.X.copy()
sc.pp.normalize_total(test_adata, target_sum=10000)
sc.pp.log1p(test_adata)
# Keep the log-normalized matrix over all filtered genes in .raw.
test_adata.raw = test_adata
test_adata = test_adata[:, train_hvg_genes].copy()

assert list(test_adata.var_names) == list(train_hvg_genes), "test genes must match train HVGs"
test_adata



AnnData object with n_obs × n_vars = 58495 × 2000
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'dataset', 'entropy_dataset_leiden_3', 'entropy_original_ann_level_1_leiden_3', 'entropy_original_ann_level_2_clean_leiden_3', 'entropy_original_ann_level_3_clean_leiden_3', 'entropy_subject_ID_leiden_3', 'fresh_or_frozen', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_5', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'n_genes_detected', 'original_ann_highest_res', 'original_ann_level_1', 'or

In [7]:
# Persist the preprocessed test split, then release it from memory.
test_out_path = DATA_DIR + 'test_adata.h5ad'
test_adata.write_h5ad(test_out_path)
del test_adata