### 1. General info of dataset GSE216176

This is the Jupyter Notebook for dataset GSE216176. Its dataset includes a single h5 files for each sample.
. 
In total, there are 17 Bone Marrow samples with and without neuroblastoma infiltration

<span style="color:green">C</span> Control bone marrow

<span style="color:green">M</span> MYCN-amplified neuroblastoma infiltrated

<span style="color:green">A</span> ATRX-amplified neuroblastoma infiltrated

<span style="color:green">S</span> Sporadic (lacking MYCN or ATRX) neuroblastoma infiltrated

In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

In [7]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE216176_RAW')
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE216176')

# Loop through all files in the directory
for sample_file in data_directory.iterdir():
    sample_name = sample_file.stem
    sample_h5ad = sample_name + '_uni.h5ad'

    sample = sc.read_10x_h5(
    sample_file,
    )

    # Create an observation metric info to store related features
    sample.obs['cancer_type'] = 'Neuroblastoma'
    sample.obs['dataset'] = 'GSE216176'
    sample.obs['tissue'] = 'bone_marrow'
    sample.obs['sample_barcode'] = sample_name
    sample.obs['uni_barcode'] = sample.obs['dataset'] + '_' + sample.obs.index.astype(str)
    sample.obs.set_index("uni_barcode", drop=False, inplace=True)
    print(sample)

    # save the anndata object
    output_path = write_directory / sample_h5ad
    sample.write_h5ad(output_path, compression="gzip")

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 4417 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6268 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5874 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1763 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6668 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6257 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6178 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 3468 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 12417 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 11074 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5832 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5539 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5195 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 4636 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 7903 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 7665 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5710 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


### 3. Confirmation of created AnnData objects

In [8]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE216176')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample = anndata.read_h5ad(file)
    print(sample)

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 4636 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6257 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 7903 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 12417 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 11074 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6668 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6178 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5195 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5832 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'
AnnData object with n_obs × n_vars = 1763 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5710 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 4417 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 3468 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5539 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 7665 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 6268 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'
AnnData object with n_obs × n_vars = 5874 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


In [13]:
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE216176')
sample = anndata.read_h5ad(list(write_directory.glob('*'))[0])
sample.obs.head()


  utils.warn_names_duplicates("var")


Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE216176_AAACCCAAGACGACTG-1,Neuroblastoma,GSE216176,bone_marrow,GSM6659425_A2_filtered_feature_bc_matrix,GSE216176_AAACCCAAGACGACTG-1
GSE216176_AAACCCAAGACTCATC-1,Neuroblastoma,GSE216176,bone_marrow,GSM6659425_A2_filtered_feature_bc_matrix,GSE216176_AAACCCAAGACTCATC-1
GSE216176_AAACCCACACGTCGTG-1,Neuroblastoma,GSE216176,bone_marrow,GSM6659425_A2_filtered_feature_bc_matrix,GSE216176_AAACCCACACGTCGTG-1
GSE216176_AAACCCAGTCCCGTGA-1,Neuroblastoma,GSE216176,bone_marrow,GSM6659425_A2_filtered_feature_bc_matrix,GSE216176_AAACCCAGTCCCGTGA-1
GSE216176_AAACGAAGTGGCAGAT-1,Neuroblastoma,GSE216176,bone_marrow,GSM6659425_A2_filtered_feature_bc_matrix,GSE216176_AAACGAAGTGGCAGAT-1


In [14]:
sample.var.head()

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
AL627309.1,ENSG00000238009,Gene Expression,GRCh38
AL627309.3,ENSG00000239945,Gene Expression,GRCh38


### 4. Convert AnnData objects to SingleCellExperiment objects

In [22]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE227122')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))