### 1. General info of dataset GSE174376

This is the Jupyter Notebook for dataset GSE174376. Its dataset includes barcodes/features/matrix files for each sample.

Thus, we need to simply incorparate these barcodes/features/matrix files and generate an AnnData object for each sample. 
In total, there are 16/18 paediatric Rhabdomyosarcoma samples.

In [13]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

<span style="color:red">**IMPORTANT:**</span> rename files to get rid of prefixes 

1. `barcodes.tsv.gz`: cell barcodes, which go into `.obs`
2. `features.tsv.gz`: gene names, `.var`
3. `matrix.mtx.gz`: the expression matrix, `.X`

In [24]:
# Load the metadata from the paper's Table S1
gsm_to_id = {
    'GSM5293229': 'SJRHB030680_R1',
    'GSM5390456': 'SJRHB031320_D1',
    'GSM5390457': 'SJRHB000026_R2',
    'GSM5390458': 'SJRHB000026_R3',
    'GSM5390459': 'SJRHB010927_D1',
    'GSM5390460': 'SJRHB010928_R1',
    'GSM5390461': 'SJRHB011_D',
    'GSM5390462': 'SJRHB012_R',
    'GSM5390463': 'SJRHB012_S',
    'GSM5390464': 'SJRHB012405_D1',
    'GSM5390465': 'SJRHB013758_D1',
    'GSM5390466': 'SJRHB013758_D2',
    'GSM5390467': 'SJRHB049189_D1',
    'GSM5390469': 'SJRHB010468_D1',
    'GSM5390470': 'SJRHB013757_D2',
    # 'GSM5390471': 'SJRHB013759_A1',
    # 'GSM5390472': 'SJRHB013759_A2',
    'GSM5390473': 'SJRHB046156_A1'
}

age_sex_tissue_metastatic_progress = {
    'SJRHB000026_R2': [4, 'female', 'pelvis', 'primary', 'recurrence'],
    'SJRHB000026_R3': [5, 'female', 'stomach', 'metastatic', 'recurrence'],
    'SJRHB010927_D1': [5, 'female', 'parapharyngeal', 'primary', 'diagnosis'],
    'SJRHB010928_R1': [9, 'male', 'prostate/bladder','primary', 'diagnosis'],
    'SJRHB011_D': [5, 'male', 'infratemporal_fossa','primary', 'recurrence'],
    'SJRHB012_R': [18, 'male', 'prostate','Primaleary   ', 'recurrence'],
    'SJRHB012_S': [18, 'male', 'prostate/bladder','primary', 'recurrence'],
    'SJRHB012405_D1': [8, 'female', 'abdomen/pelvis', 'primary', 'recurrence'],
    'SJRHB013758_D1': [4, 'female', 'pelvis', 'primary', 'diagnosis'],
    'SJRHB013758_D2': [5, 'female', 'abdomen/pelvis', 'primary', 'diagnosis'],
    'SJRHB049189_D1': [0.67, 'male', 'prostate','primary', 'diagnosis'],
    'SJRHB030680_R1': [1.33, 'male', 'lung','metastatic', 'recurrence'],
    'SJRHB010468_D1': [1.08, 'male', 'thigh','primary', 'diagnosis'],
    'SJRHB013757_D2': [3, 'female', 'calf','primary', 'diagnosis'],
    # 'SJRHB013759_A1': [19, 'male', 'chest','metastatic', 'recurrence'],
    # 'SJRHB013759_A2': [19, 'male', 'omentum','metastatic', 'recurrence'],
    'SJRHB046156_A1': [16, 'female', 'breast','metastatic', 'recurrence'],
    'SJRHB031320_D1': [17, 'male', 'abdominal wall','primary', 'diagnosis']
}

<span style="color:red">**PROBLEM:**</span> some of the patient IDs are mismatching between the table S1 and the GEO site

In [25]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE174376')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE174376')

# Loop through all files in the directory
for sample_directory in data_directory.iterdir():
    sample_name = sample_directory.stem
    sample_h5ad = sample_name + '_uni.h5ad'
    
    gsm = sample_name.split('_')[0]
    patient_id = gsm_to_id[gsm]

    sample = sc.read_10x_mtx(
    sample_directory,
    var_names='gene_symbols',  
    cache=False
    )

    # Create an observation metric info to store related features
    obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers
    obs_metrics['cancer_type'] = 'rhabdomyosarcoma'
    obs_metrics['dataset'] = 'GSE174376'
    obs_metrics['age'] = age_sex_tissue_metastatic_progress[patient_id][0]
    obs_metrics['sex'] = age_sex_tissue_metastatic_progress[patient_id][1]
    obs_metrics['tissue'] = age_sex_tissue_metastatic_progress[patient_id][2]
    obs_metrics['metastatic'] = age_sex_tissue_metastatic_progress[patient_id][3]
    obs_metrics['disease_progression'] = age_sex_tissue_metastatic_progress[patient_id][4]
    obs_metrics['sample_barcode'] = sample_name
    obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)
    
    sample.obs = obs_metrics
    sample.obs.set_index("uni_barcode", drop=False, inplace=True)
    print(sample)

    # save the anndata object
    output_path = write_directory / sample_h5ad
    sample.write_h5ad(output_path, compression="gzip")

AnnData object with n_obs × n_vars = 7328 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 7579 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6943 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6966 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 3947 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progress

In [26]:
sample.var

Unnamed: 0,gene_ids,feature_types
MIR1302-10,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
RP11-34P13.7,ENSG00000238009,Gene Expression
RP11-34P13.8,ENSG00000239945,Gene Expression
...,...,...
AC145205.1,ENSG00000215635,Gene Expression
BAGE5,ENSG00000268590,Gene Expression
CU459201.1,ENSG00000251180,Gene Expression
AC002321.2,ENSG00000215616,Gene Expression


In [27]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,age,sex,tissue,metastatic,disease_progression,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GSE174376_AAACCCAAGGGTGGGA-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_AAACCCAAGGGTGGGA-1
GSE174376_AAACCCAAGGTAAACT-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_AAACCCAAGGTAAACT-1
GSE174376_AAACCCACAGGTTTAC-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_AAACCCACAGGTTTAC-1
GSE174376_AAACCCAGTATGCGTT-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_AAACCCAGTATGCGTT-1
GSE174376_AAACCCAGTGTTAACC-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_AAACCCAGTGTTAACC-1
...,...,...,...,...,...,...,...,...,...
GSE174376_TTTGTTGAGTTGCCTA-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_TTTGTTGAGTTGCCTA-1
GSE174376_TTTGTTGAGTTGCGCC-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_TTTGTTGAGTTGCGCC-1
GSE174376_TTTGTTGCATATCGGT-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_TTTGTTGCATATCGGT-1
GSE174376_TTTGTTGGTCCGTTTC-1,rhabdomyosarcoma,GSE174376,18,male,prostate/bladder,primary,recurrence,GSM5390463_1782332_DYE2681,GSE174376_TTTGTTGGTCCGTTTC-1


### 3. Confirmation of created AnnData objects

In [28]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE174376')

# Loop through all files in the directory
for file in write_directory.iterdir():
    if '_uni.h5ad' in file.name:
        sample = anndata.read_h5ad(file)
        print(sample)

AnnData object with n_obs × n_vars = 4791 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6651 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 4754 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6966 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progression', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 7736 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'tissue', 'metastatic', 'disease_progress

In [29]:
sample.var

Unnamed: 0,gene_ids,feature_types
MIR1302-10,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
RP11-34P13.7,ENSG00000238009,Gene Expression
RP11-34P13.8,ENSG00000239945,Gene Expression
...,...,...
AC145205.1,ENSG00000215635,Gene Expression
BAGE5,ENSG00000268590,Gene Expression
CU459201.1,ENSG00000251180,Gene Expression
AC002321.2,ENSG00000215616,Gene Expression


In [30]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,age,sex,tissue,metastatic,disease_progression,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GSE174376_AAACCCAAGCCGTTAT-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_AAACCCAAGCCGTTAT-1
GSE174376_AAACCCAAGGTAGTAT-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_AAACCCAAGGTAGTAT-1
GSE174376_AAACCCAAGGTGCTGA-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_AAACCCAAGGTGCTGA-1
GSE174376_AAACCCAAGTGAGTGC-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_AAACCCAAGTGAGTGC-1
GSE174376_AAACCCACAGTTCTAG-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_AAACCCACAGTTCTAG-1
...,...,...,...,...,...,...,...,...,...
GSE174376_TTTGTTGGTGAACGGT-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_TTTGTTGGTGAACGGT-1
GSE174376_TTTGTTGGTTAGGCTT-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_TTTGTTGGTTAGGCTT-1
GSE174376_TTTGTTGTCAACGCTA-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_TTTGTTGTCAACGCTA-1
GSE174376_TTTGTTGTCCATACAG-1,rhabdomyosarcoma,GSE174376,5,female,abdomen/pelvis,primary,diagnosis,GSM5390466_1801662_DYE2837,GSE174376_TTTGTTGTCCATACAG-1


### 4. Convert AnnData objects to SingleCellExperiment objects

In [31]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE174376')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

In [32]:
print(sample_sce)

class: SingleCellExperiment 
dim: 32738 9124 
metadata(0):
assays(1): X
rownames(32738): MIR1302-10 FAM138A ... AC002321.2 AC002321.1
rowData names(2): gene_ids feature_types
colnames(9124): GSE174376_AAACCCAAGCCGTTAT-1
  GSE174376_AAACCCAAGGTAGTAT-1 ... GSE174376_TTTGTTGTCCATACAG-1
  GSE174376_TTTGTTGTCGCCAGTG-1
colData names(9): cancer_type dataset ... sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

