### 1. General info of dataset GSE140819

This is the Jupyter Notebook for dataset GSE140819. Its dataset includes a single h5 files for each sample.
. 
In total, there are 40 samples from various origins processed with different technologies. Within the pediatric tumours are:

Neuroblastoma (NB): HTAPP-312-SMP-901, HTAPP-312-SMP-902, HTAPP-656-SMP-3481, HTAPP-244-SMP-451 (nuclei), HTAPP-656-SMP-3481 (nuclei)

Glioblastoma (GB): HTAPP-443-SMP-5491

Sarcoma: HTAPP-951-SMP-4652 (nuclei), HTAPP-975-SMP-4771 (nuclei)


In [None]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

from pathlib import Path

In [17]:
# inspect the provided h5ad file
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE140819')

adata_path = data_directory / 'GSM4186961_HTAPP-312-SMP-901_fresh-T1_channel1_raw_gene_bc_matrices_h5.h5'
adata = sc.read_10x_h5(adata_path)

adata

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    var: 'gene_ids'

In [18]:
adata.var

Unnamed: 0,gene_ids
RP11-34P13.3,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945
...,...
AC233755.2,ENSG00000277856
AC233755.1,ENSG00000275063
AC240274.1,ENSG00000271254
AC213203.1,ENSG00000277475


In [19]:
adata.obs

AAACCTGAGAAACCAT-1
AAACCTGAGAAACCGC-1
AAACCTGAGAAACCTA-1
AAACCTGAGAAACGAG-1
AAACCTGAGAAACGCC-1
...
TTTGTCATCTTTACAC-1
TTTGTCATCTTTACGT-1
TTTGTCATCTTTAGGG-1
TTTGTCATCTTTAGTC-1
TTTGTCATCTTTCCTC-1


In [20]:
np.max(adata.X[:100000, :])

1298.0

In [28]:
np.max(adata.X[:500, :])

162.0

As such, even the values of the count matrix are float, they are still 'raw'

In [43]:
# inspect the metadata
meta_path = data_directory / 'GSM4186992_metadata_HTAPP-951-SMP-4652_TST-V2_channel1.csv'
meta = pd.read_csv(meta_path, sep=',', index_col=0)

meta

Unnamed: 0,annotate,nUMI,nGene,percent_mito,doublet
HTAPP-951-SMP-4652TST-V2channel1_AAACCTGAGACCTTTG,Skeletal myoblast,1467,1045,0.038173,False
HTAPP-951-SMP-4652TST-V2channel1_AAACCTGAGTCAAGCG,Skeletal myoblast,2021,1349,0.067293,False
HTAPP-951-SMP-4652TST-V2channel1_AAACCTGCAGACGCAA,Skeletal myoblast,720,590,0.025000,False
HTAPP-951-SMP-4652TST-V2channel1_AAACCTGCAGGTCCAC,Fibroblast,1453,912,0.021335,False
HTAPP-951-SMP-4652TST-V2channel1_AAACCTGGTCATGCCG,Skeletal myoblast,1332,879,0.126126,False
...,...,...,...,...,...
HTAPP-951-SMP-4652TST-V2channel1_TTTGTCACAATCGGTT,Skeletal myoblast,1433,1082,0.023029,False
HTAPP-951-SMP-4652TST-V2channel1_TTTGTCAGTACTTCTT,Skeletal myoblast,3602,1836,0.035536,False
HTAPP-951-SMP-4652TST-V2channel1_TTTGTCAGTCAGAGGT,Skeletal myoblast,2129,1267,0.169563,False
HTAPP-951-SMP-4652TST-V2channel1_TTTGTCATCCAGAAGG,Skeletal myoblast,661,582,0.006051,False


### 2. AnnData object of each sample

Neuroblastoma (NB): HTAPP-312-SMP-901, HTAPP-312-SMP-902, HTAPP-656-SMP-3481, HTAPP-244-SMP-451 (nuclei), HTAPP-656-SMP-3481 (nuclei)

Glioblastoma (GB): HTAPP-443-SMP-5491

Sarcoma: HTAPP-951-SMP-4652 (nuclei), HTAPP-975-SMP-4771 (nuclei)

In [44]:
# Load the metadata from Figure 1b
cancer_tissue_dict = {
    'HTAPP-312-SMP-901': ['neuroblastoma', 'neuroendocrine'],
    'HTAPP-312-SMP-902': ['neuroblastoma', 'neuroendocrine'],
    'HTAPP-656-SMP-3481': ['neuroblastoma', 'neuroendocrine'],
    'HTAPP-244-SMP-451': ['neuroblastoma', 'neuroendocrine'],
    'HTAPP-443-SMP-5491': ['glioblastoma', 'neuronal'],
    'HTAPP-951-SMP-4652': ['sarcoma', 'mesenchymal'],
    'HTAPP-975-SMP-4771': ['sarcoma', 'mesenchymal'],
}


In [50]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE140819')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE140819')

for sample in data_directory.iterdir():
    if '.h5' in sample.name:
        # Get the gsm and patient id
        gsm = sample.stem.split("_")[0]
        gsm_patient_id = sample.stem.split("_raw_")[0]
        patient_id = gsm_patient_id.split('_')[1]

        adata = sc.read_10x_h5(sample)

        adata.obs['cancer_type'] = cancer_tissue_dict[patient_id][0]
        adata.obs['dataset'] = 'GSE140819'
        adata.obs['tissue'] = cancer_tissue_dict[patient_id][1]
        adata.obs['sample_barcode'] = gsm_patient_id
        adata.obs['uni_barcode'] = adata.obs['dataset'] + '_' + adata.obs.index.astype(str)
        adata.obs.set_index("uni_barcode", drop=False, inplace=True)
        print(adata)

        # save the anndata object
        sample_h5ad = gsm_patient_id + '_uni.h5ad'
        output_path = write_directory / sample_h5ad
        adata.write_h5ad(output_path, compression="gzip")
        

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


### 3. Confirmation of created AnnData objects

In [52]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE140819')

# Loop through all files in the directory
for file in write_directory.iterdir():
    if '_uni.h5ad' in file.name:
        sample = anndata.read_h5ad(file)
        print(sample)

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


In [53]:
sample.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38-3.0.0_premrna
FAM138A,ENSG00000237613,Gene Expression,GRCh38-3.0.0_premrna
OR4F5,ENSG00000186092,Gene Expression,GRCh38-3.0.0_premrna
AL627309.1,ENSG00000238009,Gene Expression,GRCh38-3.0.0_premrna
AL627309.3,ENSG00000239945,Gene Expression,GRCh38-3.0.0_premrna
...,...,...,...
AC233755.2,ENSG00000277856,Gene Expression,GRCh38-3.0.0_premrna
AC233755.1,ENSG00000275063,Gene Expression,GRCh38-3.0.0_premrna
AC240274.1,ENSG00000271254,Gene Expression,GRCh38-3.0.0_premrna
AC213203.1,ENSG00000277475,Gene Expression,GRCh38-3.0.0_premrna


In [54]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE140819_AAACCCAAGAAACACT-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_AAACCCAAGAAACACT-1
GSE140819_AAACCCAAGAAACCAT-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_AAACCCAAGAAACCAT-1
GSE140819_AAACCCAAGAAACCCA-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_AAACCCAAGAAACCCA-1
GSE140819_AAACCCAAGAAACCCG-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_AAACCCAAGAAACCCG-1
GSE140819_AAACCCAAGAAACCTG-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_AAACCCAAGAAACCTG-1
...,...,...,...,...,...
GSE140819_TTTGTTGTCTTTGCTA-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_TTTGTTGTCTTTGCTA-1
GSE140819_TTTGTTGTCTTTGCTG-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_TTTGTTGTCTTTGCTG-1
GSE140819_TTTGTTGTCTTTGGAG-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_TTTGTTGTCTTTGGAG-1
GSE140819_TTTGTTGTCTTTGGCT-1,sarcoma,GSE140819,mesenchymal,GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1,GSE140819_TTTGTTGTCTTTGGCT-1


### 4. Convert AnnData objects to SingleCellExperiment objects

In [55]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE140819')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

  utils.warn_names_duplicates("var")
       'C11orf71', 'C17orf100', 'C17orf100', 'C1QTNF9B-AS1', 'C1QTNF9B-AS1',
       'CAPS', 'CAPS', 'CCDC7', 'CCDC7', 'CFAP99', 'CFAP99', 'CRHR1', 'CRHR1',
       'CYB561D2', 'CYB561D2', 'ECSCR', 'ECSCR', 'FAM231C', 'FAM231C', 'FBF1',
       'FBF1', 'IGHV2-70', 'IGHV2-70', 'LINC00864', 'LINC00864', 'LINC01297',
       'LINC01297', 'LINC01422', 'LINC01422', 'LINC01481', 'LINC01481',
       'MATR3', 'MATR3', 'PABPC1L2B-AS1', 'PABPC1L2B-AS1', 'PGM5-AS1',
       'PGM5-AS1', 'PIK3R3', 'PIK3R3', 'PRICKLE2-AS1', 'PRICKLE2-AS1',
       'PRICKLE4', 'PRICKLE4', 'RABL6', 'RABL6', 'RAET1E-AS1', 'RAET1E-AS1',
       'RGS5', 'RGS5', 'SERPINA3', 'SERPINA3', 'SPATA13', 'SPATA13', 'TBC1D26',
       'TBC1D26', 'TIMM10B', 'TIMM10B', 'TMBIM4', 'TMBIM4', 'TMEM256-PLSCR3',
       'TMEM256-PLSCR3'],
      dtype='object')
  return dispatch(args[0].__class__)(*args, **kw)
  utils.warn_names_duplicates("var")
       'C11orf71', 'C17orf100', 'C17orf100', 'C1QTNF9B-AS1', 'C1QT

In [31]:
print(sample_sce)

class: SingleCellExperiment 
dim: 33694 737280 
metadata(0):
assays(1): X
rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
rowData names(1): gene_ids
colnames(737280): GSE140819_AAACCTGAGAAACCAT-1
  GSE140819_AAACCTGAGAAACCGC-1 ... GSE140819_TTTGTCATCTTTAGTC-1
  GSE140819_TTTGTCATCTTTCCTC-1
colData names(5): cancer_type dataset tissue sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

