### 1. General info of dataset GSE227122

This notebook processes dataset GSE227122. The dataset provides a set of barcodes/features/matrix files for each sample.

Thus, we simply need to read these barcodes/features/matrix files and generate one AnnData object per sample.
In total, there are 16 T-cell acute lymphoblastic leukemia (T-ALL) samples.

<span style="color:green">**[Dx]**</span> patients at diagnosis

<span style="color:green">**[EOI]**</span> patients after therapy

<span style="color:green">**[Rel]**</span> patients at relapse

In [2]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

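<span style="color:red">**IMPORTANT:**</span> rename the files in each sample directory to remove their prefixes, so that every directory contains exactly the three file names below (the names `sc.read_10x_mtx` looks for by default):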

1. `barcodes.tsv.gz`: cell barcodes, which go into `.obs`
2. `features.tsv.gz`: gene names, `.var`
3. `matrix.mtx.gz`: the expression matrix, `.X`

In [None]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE227122')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE227122')

# Make sure the output directory exists before the first sample is written
write_directory.mkdir(parents=True, exist_ok=True)

# Build one AnnData object per sample directory and save it as <sample>_uni.h5ad.
# Entries are sorted so the processing order is reproducible between runs.
for sample_directory in sorted(data_directory.iterdir()):
    # Only real sample directories hold a barcodes/features/matrix triplet;
    # plain files and hidden folders would make read_10x_mtx fail.
    if not sample_directory.is_dir() or sample_directory.name.startswith('.'):
        continue

    # Use the full directory name: `.stem` would cut "S1.rep2" down to "S1",
    # letting two samples overwrite the same output file.
    sample_name = sample_directory.name
    sample_h5ad = sample_name + '_uni.h5ad'

    # Reads barcodes.tsv.gz / features.tsv.gz / matrix.mtx.gz from the directory
    sample = sc.read_10x_mtx(
        sample_directory,
        var_names='gene_symbols',
        cache=False,
    )

    # Per-cell metadata, indexed by the original cell barcodes
    obs_metrics = pd.DataFrame(index=sample.obs_names)
    obs_metrics['cancer_type'] = 'T-cell_acute_lymphoblastic_leukemia'
    obs_metrics['dataset'] = 'GSE227122'
    obs_metrics['tissue'] = 'bone_marrow'
    # NOTE(review): the prefix is the dataset id only, so the same barcode in two
    # different samples yields the same uni_barcode -- confirm this is acceptable
    # before concatenating samples.
    obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

    # Attach the metadata and make uni_barcode the cell identifier (.obs_names),
    # while also keeping it as a regular column.
    sample.obs = obs_metrics.set_index("uni_barcode", drop=False)
    print(sample)

    # Save the AnnData object
    output_path = write_directory / sample_h5ad
    sample.write_h5ad(output_path, compression="gzip")

### 3. Confirmation of created AnnData objects

In [8]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE227122')

# Read back every saved AnnData file and print its summary.
# Only *.h5ad files are visited: the same directory also receives .rds files
# from the SingleCellExperiment export, which read_h5ad cannot open.
for file in sorted(write_directory.glob('*.h5ad')):
    sample = anndata.read_h5ad(file)
    print(sample)

AnnData object with n_obs × n_vars = 3522 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 2894 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 3530 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 1073 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 949 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 1845 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 4034 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_i

### 4. Convert AnnData objects to SingleCellExperiment objects

In [10]:
import anndata2ri
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Convert one AnnData object to an R SingleCellExperiment as a quick check.
# `sample` is the last AnnData object loaded by the confirmation loop in
# section 3, so that cell must have been run first.
with localconverter(anndata2ri.converter):
    sce = anndata2ri.py2rpy(sample)
    print(sce)

class: SingleCellExperiment 
dim: 33538 3405 
metadata(0):
assays(1): X
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(2): gene_ids feature_types
colnames(3405): GSE227122_AAACCTGAGAGTCTGG-1
  GSE227122_AAACCTGCAGCCTGTG-1 ... GSE227122_TTTGTCATCGAGCCCA-1
  GSE227122_TTTGTCATCGATGAGG-1
colData names(4): cancer_type dataset tissue uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):



In [34]:
from pathlib import Path

import rpy2.robjects as robjects

# Expose the converted object in the R global environment as `sce`
robjects.globalenv["sce"] = sce

file_path = "write/test_rds/sce.rds"
# saveRDS does not create missing directories, so create the parent first
Path(file_path).parent.mkdir(parents=True, exist_ok=True)

# Call saveRDS as a function rather than formatting the path into R source,
# so quotes or backslashes in the path cannot break the R code.
# The trailing semicolon hides the NULL return value in the notebook.
robjects.r["saveRDS"](sce, file=file_path);

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f41cdb619d0> [0]

In [22]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE227122')

# Convert every saved AnnData file (<sample>_uni.h5ad) to a SingleCellExperiment
# and store it next to the input as <sample>_uni.rds.
# Files are selected by full name: `file.stem` has already dropped ".h5ad", so
# testing the stem for "_uni.h5ad" never matches and nothing gets converted.
for file in sorted(write_directory.glob("*_uni.h5ad")):
    sample_name = file.stem  # e.g. "<sample>_uni"
    sample_anndata = anndata.read_h5ad(file)
    sample_sce_file = sample_name + ".rds"

    with localconverter(anndata2ri.converter):
        sample_sce = anndata2ri.py2rpy(sample_anndata)

    # Save the sce object in .rds file
    robjects.globalenv["sample_sce"] = sample_sce
    sample_sce_path = write_directory / sample_sce_file
    # Pass the path as an argument instead of formatting it into R source,
    # so special characters in the path cannot break the R call.
    robjects.r["saveRDS"](sample_sce, file=str(sample_sce_path))