### 1. General info of dataset EGAD00001008345

This is the Jupyter Notebook for dataset EGAD00001008345. The raw sequencing data are not publicly available, but the processed data were downloaded from http://neuroblastomacellatlas.org/

In [82]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. Inspection of processed AnnData objects

In [83]:
from pathlib import Path

# Folder that holds the processed .h5ad files for this dataset
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/EGAD00001008345')

# Full paths of the three processed objects inspected below
adrenal_path = data_directory.joinpath('EGAD00001008345_processed_healthy_adrenal_10x.h5ad')
nb_GOSH_path = data_directory.joinpath('EGAD00001008345_processed_nb_GOSH_10x.h5ad')
nb_PMC_path = data_directory.joinpath('EGAD00001008345_processed_nb_PMC_CEL-seq2.h5ad')

# Read each file into its own AnnData variable; these names are reused by later cells
adrenal = anndata.read_h5ad(adrenal_path)
nb_GOSH = anndata.read_h5ad(nb_GOSH_path)
nb_PMC = anndata.read_h5ad(nb_PMC_path)

In [84]:
# Summary repr of the healthy-adrenal object: dimensions plus the obs / var / obsm keys it carries.
adrenal

AnnData object with n_obs × n_vars = 57972 × 33694
    obs: 'GestationalAge', 'SampleName', 'Annotation', 'nCount_RNA', 'nFeature_RNA', 'mtGenes', 'hspGenes', 'riboGenes'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_pca', 'X_umap'

In [85]:
# Gene-level table (.var) of the adrenal object, indexed by gene name.
adrenal.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RP11-34P13.3,0.000069,0.000069,0.000071,0.975314,False
FAM138A,0.000000,0.000000,0.000000,0.000000,False
OR4F5,0.000000,0.000000,0.000000,0.000000,False
RP11-34P13.7,0.000914,0.000913,0.001016,0.898955,False
RP11-34P13.8,0.000069,0.000069,0.000071,0.975314,False
...,...,...,...,...,...
AC233755.2,0.000000,0.000000,0.000000,0.000000,False
AC233755.1,0.000000,0.000000,0.000000,0.000000,False
AC240274.1,0.045211,0.052446,0.063945,0.820177,False
AC213203.1,0.000134,0.000130,0.000139,0.935009,False


In [86]:
# Per-observation metadata (.obs) of the adrenal object.
adrenal.obs

Unnamed: 0_level_0,GestationalAge,SampleName,Annotation,nCount_RNA,nFeature_RNA,mtGenes,hspGenes,riboGenes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
babyAdrenal1_AAACCTGAGAATTGTG,w8,w8,Cortex,9274.718256,2617,0.153302,0.011036,0.221010
babyAdrenal1_AAACCTGAGCGATGAC,w8,w8,Mesenchyme,4621.511225,1801,0.039960,0.014135,0.279912
babyAdrenal1_AAACCTGAGCTTCGCG,w8,w8,Cortex,10279.489787,2984,0.097298,0.023925,0.268133
babyAdrenal1_AAACCTGAGGCACATG,w8,w8,Cortex,12099.983510,3218,0.087512,0.014115,0.226316
babyAdrenal1_AAACCTGAGGCGCTCT,w8,w8,Cortex,3811.952597,1549,0.000641,0.016383,0.245097
...,...,...,...,...,...,...,...,...
5698STDY7839917_TTTGTCATCCAGTAGT,w11,w11,Cortex,21364.533076,4712,0.167259,0.021405,0.176720
5698STDY7839917_TTTGTCATCGACCAGC,w11,w11,Sympathoblastic,2066.896650,1404,0.065147,0.012963,0.084298
5698STDY7839917_TTTGTCATCGAGAGCA,w11,w11,Endothelium,10601.142843,3343,0.051403,0.025406,0.196625
5698STDY7839917_TTTGTCATCGCTGATA,w11,w11,Endothelium,6483.847399,2508,0.063000,0.010861,0.198891


In [87]:
# Distinct values of the SampleName column (renamed to 'sample_barcode' in section 3).
adrenal.obs['SampleName'].unique()

['w8', 'w8d6', 'w10d5_1', 'w10d5_2', 'w21_1', 'w21_2', 'w11']
Categories (7, object): ['w8', 'w8d6', 'w10d5_1', 'w10d5_2', 'w11', 'w21_1', 'w21_2']

In [88]:
# Densify only the first 20 rows x 20 columns of .X for a quick look at the stored values.
# NOTE(review): the non-integer entries in the output suggest .X is not raw counts — confirm before any count-based analysis.
adrenal.X[:20, 0:20].toarray()

array([[0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.596833, 0.      , 0.      , 0.      ,
        0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.      ,


In [89]:
# Summary repr of the nb_GOSH object: dimensions plus the obs / var / obsm keys it carries.
nb_GOSH

AnnData object with n_obs × n_vars = 6442 × 33428
    obs: 'SampleName', 'Annotation', 'nCount_RNA', 'nFeature_RNA', 'mtGenes', 'hspGenes', 'riboGenes'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_pca', 'X_umap'

In [90]:
# Gene-level table (.var) of the nb_GOSH object, indexed by gene name.
nb_GOSH.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RP11-34P13.3,0.000310,0.000310,0.000322,0.965392,False
FAM138A,0.000000,0.000000,0.000000,0.000000,False
OR4F5,0.000000,0.000000,0.000000,0.000000,False
RP11-34P13.7,0.004346,0.004328,0.005266,0.821859,False
RP11-34P13.8,0.000466,0.000466,0.000492,0.945943,False
...,...,...,...,...,...
AC233755.2,0.000466,0.000776,0.000492,1.315763,False
AC233755.1,0.000466,0.000776,0.000492,1.315763,False
AC240274.1,0.093604,0.134848,0.144404,0.933825,False
AC213203.1,0.000000,0.000000,0.000000,0.000000,False


In [91]:
# Per-observation metadata (.obs) of the nb_GOSH object.
nb_GOSH.obs

Unnamed: 0_level_0,SampleName,Annotation,nCount_RNA,nFeature_RNA,mtGenes,hspGenes,riboGenes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
STDY7685340_AAACCTGCACATCCGG,PD42184,Leukocytes,2483.0,1319,2.182771,0.407451,25.291036
STDY7685340_AAACGGGGTTGAACTC,PD42184,Mesenchyme,8944.0,3222,2.719135,0.819118,21.052187
STDY7685340_AAACGGGGTTGTCGCG,PD42184,Mesenchyme,5666.0,2161,2.162021,0.820526,23.417557
STDY7685340_AAACGGGTCCGCTGTT,PD42184,Mesenchyme,3541.0,1646,3.944368,0.433197,14.979480
STDY7685340_AACACGTAGCCCTAAT,PD42184,Endothelium,10206.0,3756,2.879938,0.486476,30.751768
...,...,...,...,...,...,...,...
STDY7787239_TTTATGCAGCGGCTTC,PD43255,Tumour cluster 3,38142.0,6608,2.124105,0.890592,49.290290
STDY7787239_TTTATGCAGCTAACTC,PD43255,Tumour cluster 3,20690.0,5138,2.848579,0.917772,48.363113
STDY7787239_TTTATGCAGCTAGGCA,PD43255,Tumour cluster 3,7237.0,2736,1.743827,0.949074,41.651235
STDY7787239_TTTCCTCAGATGGGTC,PD43255,Tumour cluster 3,2178.0,1091,6.433566,0.811189,32.055944


In [92]:
# Distinct values of the SampleName column (renamed to 'sample_barcode' in section 3).
nb_GOSH.obs['SampleName'].unique()

['PD42184', 'PD42752-1', 'PD42752-2', 'PD46693', 'PD43255']
Categories (5, object): ['PD42184', 'PD42752-1', 'PD42752-2', 'PD43255', 'PD46693']

In [93]:
# Densify only the first 20 rows x 20 columns of .X for a quick look at the stored values.
# NOTE(review): the non-integer entries in the output suggest .X is not raw counts — confirm before any count-based analysis.
nb_GOSH.X[:20, :20].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.0170094 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0

In [94]:
# Summary repr of the nb_PMC object: dimensions plus the obs / var / obsm keys it carries.
nb_PMC

AnnData object with n_obs × n_vars = 13281 × 32383
    obs: 'SampleName', 'Annotation', 'nCount_RNA', 'nFeature_RNA', 'mtGenes', 'hspGenes', 'riboGenes'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_pca', 'X_umap'

In [95]:
# Gene-level table (.var) of the nb_PMC object, indexed by gene name.
nb_PMC.var

Unnamed: 0_level_0,vst.mean,vst.variance,vst.variance.expected,vst.variance.standardized,vst.variable
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TSPAN6,0.146224,0.376358,0.333401,1.128845,False
TNMD,0.002492,0.034723,0.003406,1.354335,False
DPM1,0.241849,0.393462,0.623210,0.631348,False
SCYL3,0.079738,0.130614,0.165969,0.786979,False
C1orf112,0.023266,0.038992,0.042516,0.917107,False
...,...,...,...,...,...
RP11-157J13.1,0.000075,0.000075,0.000075,1.000151,False
LLNLR-245B6.1,0.098712,0.170752,0.211548,0.807157,False
CTD-2060L22.1,0.000753,0.000752,0.000922,0.816466,False
RP11-107E5.4,0.000376,0.000376,0.000434,0.867782,False


In [96]:
# Per-observation metadata (.obs) of the nb_PMC object.
nb_PMC.obs

Unnamed: 0_level_0,SampleName,Annotation,nCount_RNA,nFeature_RNA,mtGenes,hspGenes,riboGenes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TM36-C10,NB060,Leukocytes,1234.0,943,4.619125,0.324149,0.405186
TM36-F5,NB060,Leukocytes,4890.6,1796,8.739214,0.756553,11.129514
TM36-F8,NB060,Leukocytes,4256.2,1856,5.469668,0.493398,9.940792
TM36-G4,NB060,Leukocytes,5721.4,3861,2.925857,0.419478,0.489391
TM36-H18,NB060,Leukocytes,6044.5,2375,7.505997,0.545951,8.970138
...,...,...,...,...,...,...,...
TM231-P7,000CGH,Tumour cluster 3,60873.0,8472,13.374402,4.241125,7.341514
TM231-P9,000CGH,Tumour cluster 3,25145.8,6243,10.962865,4.709733,9.706989
TM231-P10,000CGH,Tumour cluster 3,7457.9,3043,10.956167,7.052924,7.483340
TM231-P11,000CGH,Tumour cluster 3,20376.1,5161,11.218535,5.544241,8.231212


In [97]:
# Distinct values of the SampleName column (renamed to 'sample_barcode' in section 3).
nb_PMC.obs['SampleName'].unique()

['NB060', 'NB086', 'NB098', 'NB106', 'NB107', ..., 'NB151', 'NB152', '000CGH', '000FQM', '000GGU']
Length: 16
Categories (16, object): ['000CGH', '000FQM', '000GGU', 'NB060', ..., 'NB132', 'NB138', 'NB151', 'NB152']

In [98]:
# Densify only the first 20 rows x 20 columns of .X for a quick look at the stored values.
# NOTE(review): the non-integer entries in the output suggest .X is not raw counts — confirm before any count-based analysis.
nb_PMC.X[:20, :20].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 2.208684  , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.1134151 , 1.1134151 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.2088151 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.5031098 ,
        1.0108093 , 0.        , 0.        , 0.        , 1.8315364 ,
        1.5031098 , 0.        , 0.        , 0

### 3. Rename the metadata in .obs for consistency

In [105]:
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/EGAD00001008345')
# Make sure the output folder exists so the write step does not depend on it having been created by hand.
write_directory.mkdir(parents=True, exist_ok=True)

adatas = (adrenal, nb_GOSH, nb_PMC)
paths = (adrenal_path, nb_GOSH_path, nb_PMC_path)

for adata, path in zip(adatas, paths):
    # Name the gene index so it is labelled the same way in every written file.
    adata.var.rename_axis('gene_symbols', inplace=True)

    # The relabelling below mutates the loaded objects in place. Only apply it when the
    # 'uni_barcode' column is not there yet; otherwise re-running this cell would prepend
    # the dataset ID and the sample label to the identifiers a second time.
    if 'uni_barcode' not in adata.obs.columns:
        adata.obs.rename(columns={
            'SampleName': 'sample_barcode',
            'Annotation': 'cell_type_from_paper'
        }, inplace=True
        )
        # Prefix the sample label with the dataset accession.
        adata.obs['sample_barcode'] = 'EGAD00001008345_' + adata.obs['sample_barcode'].astype(str)
        # Per-cell identifier: <dataset>_<sample>_<original obs index>.
        adata.obs['uni_barcode'] = adata.obs['sample_barcode'] + '_' + adata.obs.index
        # Use the new identifier as the obs index while also keeping it as a column.
        adata.obs.set_index("uni_barcode", drop=False, inplace=True)

    # Save the modified AnnData objects
    file_name = path.stem + '_uni.h5ad'
    file_path = write_directory / file_name
    adata.write_h5ad(file_path, compression="gzip")
    

/scratch/user/s4543064/xiaohan-john-project/write/EGAD00001008345/EGAD00001008345_processed_healthy_adrenal_10x_uni.h5ad


In [100]:
# Inspect the renamed cell metadata of the adrenal object. It is referenced by name rather than
# through the loop variable `adata`, which after the preceding loop is bound to the last object
# iterated (nb_PMC) and therefore depends on execution order.
adrenal.obs

Unnamed: 0_level_0,GestationalAge,sample_barcode,cell_type_from_paper,nCount_RNA,nFeature_RNA,mtGenes,hspGenes,riboGenes,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EGAD00001008345_w8_babyAdrenal1_AAACCTGAGAATTGTG,w8,EGAD00001008345_w8,Cortex,9274.718256,2617,0.153302,0.011036,0.221010,EGAD00001008345_w8_babyAdrenal1_AAACCTGAGAATTGTG
EGAD00001008345_w8_babyAdrenal1_AAACCTGAGCGATGAC,w8,EGAD00001008345_w8,Mesenchyme,4621.511225,1801,0.039960,0.014135,0.279912,EGAD00001008345_w8_babyAdrenal1_AAACCTGAGCGATGAC
EGAD00001008345_w8_babyAdrenal1_AAACCTGAGCTTCGCG,w8,EGAD00001008345_w8,Cortex,10279.489787,2984,0.097298,0.023925,0.268133,EGAD00001008345_w8_babyAdrenal1_AAACCTGAGCTTCGCG
EGAD00001008345_w8_babyAdrenal1_AAACCTGAGGCACATG,w8,EGAD00001008345_w8,Cortex,12099.983510,3218,0.087512,0.014115,0.226316,EGAD00001008345_w8_babyAdrenal1_AAACCTGAGGCACATG
EGAD00001008345_w8_babyAdrenal1_AAACCTGAGGCGCTCT,w8,EGAD00001008345_w8,Cortex,3811.952597,1549,0.000641,0.016383,0.245097,EGAD00001008345_w8_babyAdrenal1_AAACCTGAGGCGCTCT
...,...,...,...,...,...,...,...,...,...
EGAD00001008345_w11_5698STDY7839917_TTTGTCATCCAGTAGT,w11,EGAD00001008345_w11,Cortex,21364.533076,4712,0.167259,0.021405,0.176720,EGAD00001008345_w11_5698STDY7839917_TTTGTCATCC...
EGAD00001008345_w11_5698STDY7839917_TTTGTCATCGACCAGC,w11,EGAD00001008345_w11,Sympathoblastic,2066.896650,1404,0.065147,0.012963,0.084298,EGAD00001008345_w11_5698STDY7839917_TTTGTCATCG...
EGAD00001008345_w11_5698STDY7839917_TTTGTCATCGAGAGCA,w11,EGAD00001008345_w11,Endothelium,10601.142843,3343,0.051403,0.025406,0.196625,EGAD00001008345_w11_5698STDY7839917_TTTGTCATCG...
EGAD00001008345_w11_5698STDY7839917_TTTGTCATCGCTGATA,w11,EGAD00001008345_w11,Endothelium,6483.847399,2508,0.063000,0.010861,0.198891,EGAD00001008345_w11_5698STDY7839917_TTTGTCATCG...


In [22]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
# The *_uni.h5ad files for this dataset are written to the EGAD00001008345 write folder in
# section 3, so that is the folder to scan (this cell previously pointed at write/GSE227122).
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/EGAD00001008345')

# Loop through the renamed AnnData files in the directory.
# The glob matches on the file-name ending only (not any name merely containing "_uni.h5ad"),
# and sorting makes the processing order reproducible.
for file in sorted(write_directory.glob('*_uni.h5ad')):
    sample_name = file.stem
    sample_anndata = anndata.read_h5ad(file)
    # Output keeps the input's base name, e.g. foo_uni.h5ad -> foo_uni.rds
    sample_sce_file = sample_name + ".rds"

    # Convert the AnnData object to its R counterpart using the anndata2ri converter
    with localconverter(anndata2ri.converter):
        sample_sce = anndata2ri.py2rpy(sample_anndata)

    # Save the sce object in .rds file
    # Both the object and the target path are handed to R as variables, so the path is never
    # spliced into R source code (a quote or backslash in it cannot break the call).
    sample_sce_path = write_directory / sample_sce_file
    robjects.globalenv["sample_sce"] = sample_sce
    robjects.globalenv["sample_sce_path"] = str(sample_sce_path)
    robjects.r("saveRDS(sample_sce, file=sample_sce_path)")