### 1. General info of dataset GSE195709

This is the Jupyter Notebook for dataset GSE195709. The dataset provides barcodes/features/matrix files for each sample.

Thus, we simply need to read these barcodes/features/matrix files and generate an AnnData object for each sample.
In total, there are 4 rhabdomyosarcoma samples.


In [3]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

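<span style="color:red">**IMPORTANT:**</span> before running the cells below, rename the files in each sample directory to remove their prefixes, so that they have exactly the names listed here: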

1. `barcodes.tsv.gz`: cell barcodes, which go into `.obs`
2. `features.tsv.gz`: gene names, which go into `.var`
3. `matrix.mtx.gz`: the expression matrix, which goes into `.X`


In [4]:
# Per-sample metadata transcribed from the paper's Supplementary Table 1.
# Keys are sample IDs (as strings); each value is a list ordered as
# [age, sex, recurrent, tissue].
age_sex_recurrent_tissue = dict([
    ('20696', [0.5, 'male', 'primary', 'lung']),
    ('21202', [1, 'male', 'recurrent', 'lung']),
    ('29806', [15, 'male', 'recurrent', 'unknown']),
    ('20082', [17, 'female', 'metastatic', 'unknown']),
])

In [5]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE195709')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE195709')

# Make sure the output directory exists before the first write
write_directory.mkdir(parents=True, exist_ok=True)

# Build one AnnData object per sample directory and save it as `<sample_name>_uni.h5ad`.
# Entries are visited in sorted order so that the run is deterministic, including which
# `sample` is left behind for the inspection cells that follow. Only directories are
# considered, because `sc.read_10x_mtx` reads a folder, not a single file.
for sample_directory in sorted(data_directory.iterdir()):
    if not sample_directory.is_dir() or '_hg19' not in sample_directory.name:
        continue

    sample_name = sample_directory.stem
    sample_h5ad = sample_name + '_uni.h5ad'
    # The second '_'-separated field of the directory name is the sample ID
    # used as key in `age_sex_recurrent_tissue`.
    sample_id = sample_name.split('_')[1]
    if sample_id not in age_sex_recurrent_tissue:
        raise KeyError(
            f"No metadata entry for sample ID {sample_id!r} (directory {sample_directory.name!r})"
        )
    age, sex, recurrent, tissue = age_sex_recurrent_tissue[sample_id]

    sample = sc.read_10x_mtx(
        sample_directory,
        var_names='gene_symbols',
        cache=False
    )

    # Per-cell annotation table, initially indexed by the raw cell barcodes
    obs_metrics = pd.DataFrame(index=sample.obs_names)
    obs_metrics['cancer_type'] = 'rhabdomyosarcoma'
    obs_metrics['dataset'] = 'GSE195709'
    obs_metrics['age'] = age
    obs_metrics['sex'] = sex
    obs_metrics['recurrent'] = recurrent
    obs_metrics['tissue'] = tissue
    obs_metrics['sample_barcode'] = sample_name
    # NOTE(review): the prefix is the dataset ID only, so this barcode is unique within a
    # sample but may repeat across samples of this dataset -- confirm this is intended
    # before concatenating samples.
    obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

    # Re-index by `uni_barcode` (the column is kept as well) before attaching the table,
    # instead of mutating `.obs` in place afterwards.
    sample.obs = obs_metrics.set_index("uni_barcode", drop=False)
    print(sample)

    # save the anndata object
    output_path = write_directory / sample_h5ad
    sample.write_h5ad(output_path, compression="gzip")

AnnData object with n_obs × n_vars = 3402 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 11183 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 17509 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 10871 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'


In [6]:
# Gene annotations of `sample`, i.e. the last sample processed by the loading loop above
sample.var

Unnamed: 0,gene_ids,feature_types
MIR1302-10,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
RP11-34P13.7,ENSG00000238009,Gene Expression
RP11-34P13.8,ENSG00000239945,Gene Expression
...,...,...
AC145205.1,ENSG00000215635,Gene Expression
BAGE5,ENSG00000268590,Gene Expression
CU459201.1,ENSG00000251180,Gene Expression
AC002321.2,ENSG00000215616,Gene Expression


In [7]:
# Cell annotations of `sample`, i.e. the last sample processed by the loading loop above
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,age,sex,recurrent,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GSE195709_AAACCCAAGAGGCCAT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGAGGCCAT-1
GSE195709_AAACCCAAGAGGTGCT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGAGGTGCT-1
GSE195709_AAACCCAAGCGGTATG-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGCGGTATG-1
GSE195709_AAACCCAAGCTATCTG-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGCTATCTG-1
GSE195709_AAACCCAAGTCTGGTT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGTCTGGTT-1
...,...,...,...,...,...,...,...,...
GSE195709_TTTGTTGTCAAGGAGC-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCAAGGAGC-1
GSE195709_TTTGTTGTCCTGTTAT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCCTGTTAT-1
GSE195709_TTTGTTGTCGAGCTGC-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCGAGCTGC-1
GSE195709_TTTGTTGTCTGCGATA-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCTGCGATA-1


### 3. Confirmation of created AnnData objects


In [8]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE195709')

# Reload every saved `*_uni.h5ad` file and print its summary to confirm it can be read back.
# Files are visited in sorted order so the output order, and the `sample` left over for the
# inspection cells that follow, do not depend on filesystem listing order. The name must
# END with `_uni.h5ad`, so files such as `x_uni.h5ad.tmp` are not picked up.
for file in sorted(write_directory.iterdir()):
    if file.name.endswith('_uni.h5ad'):
        sample = anndata.read_h5ad(file)
        print(sample)

AnnData object with n_obs × n_vars = 11183 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 17509 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 3402 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 10871 × 32738
    obs: 'cancer_type', 'dataset', 'age', 'sex', 'recurrent', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types'


In [9]:
# Gene annotations of `sample`, i.e. the last file reloaded by the confirmation loop above
sample.var

Unnamed: 0,gene_ids,feature_types
MIR1302-10,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
RP11-34P13.7,ENSG00000238009,Gene Expression
RP11-34P13.8,ENSG00000239945,Gene Expression
...,...,...
AC145205.1,ENSG00000215635,Gene Expression
BAGE5,ENSG00000268590,Gene Expression
CU459201.1,ENSG00000251180,Gene Expression
AC002321.2,ENSG00000215616,Gene Expression


In [10]:
# Cell annotations of `sample`, i.e. the last file reloaded by the confirmation loop above
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,age,sex,recurrent,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GSE195709_AAACCCAAGAGGCCAT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGAGGCCAT-1
GSE195709_AAACCCAAGAGGTGCT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGAGGTGCT-1
GSE195709_AAACCCAAGCGGTATG-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGCGGTATG-1
GSE195709_AAACCCAAGCTATCTG-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGCTATCTG-1
GSE195709_AAACCCAAGTCTGGTT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_AAACCCAAGTCTGGTT-1
...,...,...,...,...,...,...,...,...
GSE195709_TTTGTTGTCAAGGAGC-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCAAGGAGC-1
GSE195709_TTTGTTGTCCTGTTAT-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCCTGTTAT-1
GSE195709_TTTGTTGTCGAGCTGC-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCGAGCTGC-1
GSE195709_TTTGTTGTCTGCGATA-1,rhabdomyosarcoma,GSE195709,15,male,recurrent,unknown,GSM5848676_29806_hg19,GSE195709_TTTGTTGTCTGCGATA-1


### 4. Convert AnnData objects to SingleCellExperiment objects


In [11]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE195709')

# Convert every saved `*_uni.h5ad` file into an R object via anndata2ri and store it next
# to the input as `<name>_uni.rds`.
# `sorted(...)` takes a snapshot of the directory listing up front, so the `.rds` files
# written inside the loop cannot affect the iteration, and the order is deterministic.
for file in sorted(write_directory.iterdir()):
    if not file.name.endswith("_uni.h5ad"):
        continue

    sample_name = file.stem
    sample_anndata = anndata.read_h5ad(file)
    sample_sce_file = sample_name + ".rds"

    with localconverter(anndata2ri.converter):
        sample_sce = anndata2ri.py2rpy(sample_anndata)

    # Keep the object bound in the R global environment, as before
    robjects.globalenv["sample_sce"] = sample_sce

    # Save the sce object in .rds file. R's saveRDS is called directly with the path as an
    # argument rather than being pasted into R source text, so quotes or backslashes in
    # the path cannot break (or alter) the R code being evaluated.
    sample_sce_path = write_directory / sample_sce_file
    robjects.r["saveRDS"](sample_sce, file=str(sample_sce_path))

In [12]:
# Show the R-side summary of `sample_sce`, i.e. the last object converted by the loop above
print(sample_sce)

class: SingleCellExperiment 
dim: 32738 10871 
metadata(0):
assays(1): X
rownames(32738): MIR1302-10 FAM138A ... AC002321.2 AC002321.1
rowData names(2): gene_ids feature_types
colnames(10871): GSE195709_AAACCCAAGAGGCCAT-1
  GSE195709_AAACCCAAGAGGTGCT-1 ... GSE195709_TTTGTTGTCTGCGATA-1
  GSE195709_TTTGTTGTCTTCGACC-1
colData names(8): cancer_type dataset ... sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

