### 1. General info of dataset GSE140819

This is the Jupyter Notebook for dataset GSE140819. Its dataset includes a single h5 files for each sample.
. 
In total, there are 39 samples from various origins processed with different technologies. Within the pediatric tumours are:

Neuroblastoma (NB): HTAPP-312-SMP-901, HTAPP-312-SMP-902, HTAPP-656-SMP-3481

(Although there are other paediatric samples in the dataset, they are generated from snRNA-seq)

In [2]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

from pathlib import Path

In [7]:
# inspect the provided h5ad file
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE140819')

adata_path = data_directory / 'GSM4186961_HTAPP-312-SMP-901_fresh-T1_channel1_raw_gene_bc_matrices_h5.h5'
adata = sc.read_10x_h5(adata_path)

adata

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    var: 'gene_ids'

In [8]:
adata.var

Unnamed: 0,gene_ids
RP11-34P13.3,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945
...,...
AC233755.2,ENSG00000277856
AC233755.1,ENSG00000275063
AC240274.1,ENSG00000271254
AC213203.1,ENSG00000277475


In [9]:
adata.obs

AAACCTGAGAAACCAT-1
AAACCTGAGAAACCGC-1
AAACCTGAGAAACCTA-1
AAACCTGAGAAACGAG-1
AAACCTGAGAAACGCC-1
...
TTTGTCATCTTTACAC-1
TTTGTCATCTTTACGT-1
TTTGTCATCTTTAGGG-1
TTTGTCATCTTTAGTC-1
TTTGTCATCTTTCCTC-1


In [27]:
np.max(adata.X[:100000, :])

1298.0

As such, even the values of the count matrix are float, they are still 'raw'

In [29]:
# inspect the metadata
meta_path = data_directory / 'GSM4186961_metadata_HTAPP-312-SMP-901_fresh-T1_channel1.csv'
meta = pd.read_csv(meta_path, sep='\t', index_col=0)

meta

Unnamed: 0,annotate,nReads,nUMI,nGene,percent_mito,emptydrop,doublet
HTAPP-312-SMP-901_fresh-T1_channel1-AAACCTGAGACGCAAC,Neuroendocrine,2750,1219,761,0.127974,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-AAACCTGAGAGGACGG,Neuroendocrine,2682,1208,704,0.151490,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-AAACCTGAGAGTTGGC,Neuroendocrine,6495,2921,1494,0.050668,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-AAACCTGAGTTAACGA,Neuroendocrine,3205,1425,885,0.028070,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-AAACCTGCACCGCTAG,Neuroendocrine,4655,2017,1058,0.041646,False,False
...,...,...,...,...,...,...,...
HTAPP-312-SMP-901_fresh-T1_channel1-TTTGTCACAATCCAAC,Neuroendocrine,8096,3584,1717,0.042411,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-TTTGTCAGTAAAGTCA,Neuroendocrine,9181,4282,1785,0.076366,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-TTTGTCAGTCAGTGGA,Neuroendocrine,2919,1315,848,0.063878,False,False
HTAPP-312-SMP-901_fresh-T1_channel1-TTTGTCAGTCATGCAT,Neuroendocrine,4909,2275,1185,0.069451,False,False


### 2. AnnData object of each sample

In [5]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE140819_RAW')
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE140819')

def process_file(sample, sample_name, tissue_type, cancer_type):
    sample.obs['cancer_type'] = cancer_type
    sample.obs['dataset'] = 'GSE140819'
    sample.obs['tissue'] = tissue_type
    sample.obs['sample_barcode'] = sample_name
    sample.obs['uni_barcode'] = sample.obs['dataset'] + '_' + sample.obs.index.astype(str)
    sample.obs.set_index("uni_barcode", drop=False, inplace=True)
    print(sample)

    # save the anndata object
    sample_h5ad = sample_name + '_uni.h5ad'
    output_path = write_directory / sample_h5ad
    sample.write_h5ad(output_path, compression="gzip")

process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186965_HTAPP-244-SMP-451_CST_channel1_raw_gene_bc_matrices_h5.h5'),
            'GSM4186965_HTAPP-244-SMP-451_CST',
            'Neuroendocrine', 
            'Neuroblastoma'
        )
process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186963_HTAPP-656-SMP-3481_fresh-T1_channel1_raw_gene_bc_matrices_h5.h5'),
            'GSM4186963_HTAPP-656-SMP-3481_fresh-T1',
            'Neuroendocrine', 
            'Neuroblastoma'
        )
process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186962_HTAPP-312-SMP-902_fresh-C4-T2_channel1_raw_gene_bc_matrices_h5.h5'),
            'GSM4186962_HTAPP-312-SMP-902_fresh-C4-T2',
            'Neuroendocrine', 
            'Neuroblastoma'
        )
process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186961_HTAPP-312-SMP-901_fresh-T1_channel1_raw_gene_bc_matrices_h5.h5'),
            'GSM4186961_HTAPP-312-SMP-901_fresh-T1',
            'Neuroendocrine', 
            'Neuroblastoma'
        )
process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186982_HTAPP-443-SMP-5491_CST_channel1_raw_feature_bc_matrix.h5'),
            'GSM4186982_HTAPP-443-SMP-5491_CST',
            'Neuronal', 
            'Glioblastoma'
        )

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'xiaohan-john-project/data/GSE140819_RAW/GSM4186994_HTAPP-951-SMP-4652_TST-V3_channel1_raw_feature_bc_matrix.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [6]:
process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186993_HTAPP-951-SMP-4652_TST-V3_channel1_raw_feature_bc_matrix.h5'),
            'GSM4186994_HTAPP-951-SMP-4652_TST-V3', 
            'Mesenchymal', 
            'Sarcoma'
        )
process_file(sc.read_10x_h5('xiaohan-john-project/data/GSE140819_RAW/GSM4186995_HTAPP-975-SMP-4771_TST-V3_channel1_raw_feature_bc_matrix.h5'),
            'GSM4186995_HTAPP-975-SMP-4771_TST-V3', 
            'Mesenchymal', 
            'Sarcoma'
        )

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


### 3. Confirmation of created AnnData objects

In [7]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE140819')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample = anndata.read_h5ad(file)
    print(sample)

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6794880 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'
AnnData object with n_obs × n_vars = 737280 × 33694
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


In [8]:
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE140819')
sample = anndata.read_h5ad(list(write_directory.glob('*'))[0])
sample.obs.head()


  utils.warn_names_duplicates("var")


Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE140819_AAACCTGAGAAACCAT-1,Neuroblastoma,GSE140819,Neuroendocrine,GSM4186963_HTAPP-656-SMP-3481_fresh-T1,GSE140819_AAACCTGAGAAACCAT-1
GSE140819_AAACCTGAGAAACCGC-1,Neuroblastoma,GSE140819,Neuroendocrine,GSM4186963_HTAPP-656-SMP-3481_fresh-T1,GSE140819_AAACCTGAGAAACCGC-1
GSE140819_AAACCTGAGAAACCTA-1,Neuroblastoma,GSE140819,Neuroendocrine,GSM4186963_HTAPP-656-SMP-3481_fresh-T1,GSE140819_AAACCTGAGAAACCTA-1
GSE140819_AAACCTGAGAAACGAG-1,Neuroblastoma,GSE140819,Neuroendocrine,GSM4186963_HTAPP-656-SMP-3481_fresh-T1,GSE140819_AAACCTGAGAAACGAG-1
GSE140819_AAACCTGAGAAACGCC-1,Neuroblastoma,GSE140819,Neuroendocrine,GSM4186963_HTAPP-656-SMP-3481_fresh-T1,GSE140819_AAACCTGAGAAACGCC-1


In [9]:
sample.var.head()

Unnamed: 0,gene_ids
RP11-34P13.3,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945


### 4. Convert AnnData objects to SingleCellExperiment objects

In [30]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE140819')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

  utils.warn_names_duplicates("var")
       'C11orf71', 'C17orf100', 'C17orf100', 'C1QTNF9B-AS1', 'C1QTNF9B-AS1',
       'CAPS', 'CAPS', 'CCDC7', 'CCDC7', 'CFAP99', 'CFAP99', 'CRHR1', 'CRHR1',
       'CYB561D2', 'CYB561D2', 'ECSCR', 'ECSCR', 'FAM231C', 'FAM231C', 'FBF1',
       'FBF1', 'IGHV2-70', 'IGHV2-70', 'LINC00864', 'LINC00864', 'LINC01297',
       'LINC01297', 'LINC01422', 'LINC01422', 'LINC01481', 'LINC01481',
       'MATR3', 'MATR3', 'PABPC1L2B-AS1', 'PABPC1L2B-AS1', 'PGM5-AS1',
       'PGM5-AS1', 'PIK3R3', 'PIK3R3', 'PRICKLE2-AS1', 'PRICKLE2-AS1',
       'PRICKLE4', 'PRICKLE4', 'RABL6', 'RABL6', 'RAET1E-AS1', 'RAET1E-AS1',
       'RGS5', 'RGS5', 'SERPINA3', 'SERPINA3', 'SPATA13', 'SPATA13', 'TBC1D26',
       'TBC1D26', 'TIMM10B', 'TIMM10B', 'TMBIM4', 'TMBIM4', 'TMEM256-PLSCR3',
       'TMEM256-PLSCR3'],
      dtype='object')
  return dispatch(args[0].__class__)(*args, **kw)
  utils.warn_names_duplicates("var")
       'C11orf71', 'C17orf100', 'C17orf100', 'C1QTNF9B-AS1', 'C1QT

In [31]:
print(sample_sce)

class: SingleCellExperiment 
dim: 33694 737280 
metadata(0):
assays(1): X
rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
rowData names(1): gene_ids
colnames(737280): GSE140819_AAACCTGAGAAACCAT-1
  GSE140819_AAACCTGAGAAACCGC-1 ... GSE140819_TTTGTCATCTTTAGTC-1
  GSE140819_TTTGTCATCTTTCCTC-1
colData names(5): cancer_type dataset tissue sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

