### 1. General info of dataset GSE141460

This is the Jupyter Notebook for dataset GSE141460. Its dataset includes barcodes/genes/matrix files for each sample. There are 5/6 paediatric samples.

In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

<span style="color:red">**IMPORTANT:**</span> rename files to get rid of prefixes

1. `barcodes.tsv`: cell barcodes, which go into `.obs`
2. `genes.tsv`: gene names, `.var`
3. `matrix.mtx`: the expression matrix, `.X`

In [6]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE141460')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE141460')

tissue_subgroup_age_sex_progression_dict = {'MUV014': ['medulla_oblongata', 'PFA-1a', 1.9, 'male', 'recurrence'],
                                            'MUV013': ['posterior_fossa', 'PFA-2a', 2.2, 'female', 'diagnosis'],
                                            'MUV051': ['medulla_oblongata', 'PFA-2b', 2.8, 'male', 'recurrence'],
                                            'MUV056': ['temporal_right', 'ST-RELA', 8.6, 'male', 'recurrence'],
                                            'MUV006': ['parietal_left', 'ST-YAP1', 0.3, 'female', 'diagnosis']}

# Loop through all files in the directory
for sample_directory in data_directory.iterdir():
    sample_name = sample_directory.stem
    sample_name_muv = sample_name.split('_')[1]
    sample_h5ad = sample_name + '_uni.h5ad'

    if sample_name_muv in tissue_subgroup_age_sex_progression_dict:
        sample = sc.read_10x_mtx(
            sample_directory,
            var_names='gene_symbols',  
            cache=False
        )

        # Create an observation metric info to store related features
        obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers
        obs_metrics['cancer_type'] = tissue_subgroup_age_sex_progression_dict[sample_name_muv][1] + '_ependymoma'
        obs_metrics['dataset'] = 'GSE141460'
        obs_metrics['tissue'] = tissue_subgroup_age_sex_progression_dict[sample_name_muv][0]
        obs_metrics['age'] = tissue_subgroup_age_sex_progression_dict[sample_name_muv][2]
        obs_metrics['sex'] = tissue_subgroup_age_sex_progression_dict[sample_name_muv][3]
        obs_metrics['disease_progression'] = tissue_subgroup_age_sex_progression_dict[sample_name_muv][4]
        obs_metrics['sample_barcode'] = sample_name
        obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)
        
        sample.obs = obs_metrics
        sample.obs.set_index("uni_barcode", drop=False, inplace=True)
        print(sample)

        # save the anndata object
        output_path = write_directory.joinpath(sample_h5ad)
        sample.write_h5ad(output_path, compression="gzip")



AnnData object with n_obs × n_vars = 21968 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'




AnnData object with n_obs × n_vars = 3193 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'




AnnData object with n_obs × n_vars = 4015 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'




AnnData object with n_obs × n_vars = 15906 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'




AnnData object with n_obs × n_vars = 10529 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


In [7]:
sample.var

Unnamed: 0,gene_ids
DDX11L1,ENSG00000223972.5
WASH7P,ENSG00000227232.5
MIR6859-1,ENSG00000278267.1
MIR1302-2HG,ENSG00000243485.5
MIR1302-2,ENSG00000284332.1
...,...
AC007325.1,ENSG00000276017.1
AC007325.4,ENSG00000278817.1
AC007325.2,ENSG00000277196.4
U6-36,ENSG00000278625.1


In [8]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,age,sex,disease_progression,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GSE141460_AAACCCAAGCGACTAG-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_AAACCCAAGCGACTAG-1
GSE141460_AAACCCAAGGAGCTGT-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_AAACCCAAGGAGCTGT-1
GSE141460_AAACCCAAGTGGCGAT-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_AAACCCAAGTGGCGAT-1
GSE141460_AAACCCACACGTAGAG-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_AAACCCACACGTAGAG-1
GSE141460_AAACCCACAGGTGTGA-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_AAACCCACAGGTGTGA-1
...,...,...,...,...,...,...,...,...
GSE141460_TTTGTTGCATTCATCT-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_TTTGTTGCATTCATCT-1
GSE141460_TTTGTTGGTAGGCTGA-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_TTTGTTGGTAGGCTGA-1
GSE141460_TTTGTTGGTCCTACAA-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_TTTGTTGGTCCTACAA-1
GSE141460_TTTGTTGTCGCGCTGA-1,PFA-1a_ependymoma,GSE141460,medulla_oblongata,1.9,male,recurrence,GSM5273079_MUV014,GSE141460_TTTGTTGTCGCGCTGA-1


### 3. Confirmation of created AnnData objects

In [9]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE141460')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample = anndata.read_h5ad(file)
    print(sample)

AnnData object with n_obs × n_vars = 15906 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


AnnData object with n_obs × n_vars = 21968 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 3193 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 4015 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 10529 × 58347
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


### 4. Convert AnnData objects to SingleCellExperiment objects

In [12]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE141460')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        # print(sample_sce)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

In [13]:
print(sample_sce)

class: SingleCellExperiment 
dim: 58347 10529 
metadata(0):
assays(1): X
rownames(58347): DDX11L1 WASH7P ... U6-36 U1-15
rowData names(1): gene_ids
colnames(10529): GSE141460_AAACCCAAGCGACTAG-1
  GSE141460_AAACCCAAGGAGCTGT-1 ... GSE141460_TTTGTTGTCGCGCTGA-1
  GSE141460_TTTGTTGTCTGTCAGA-1
colData names(8): cancer_type dataset ... sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

