### 1. General info of dataset GSE235923

This is the Jupyter Notebook for dataset GSE235923. The dataset provides barcodes/features/matrix files for each sample.

Thus, we simply need to read these barcodes/features/matrix files and generate an AnnData object for each sample.

In total, there are 31 acute myeloid leukemia (AML) samples.

<span style="color:green">**[Dx/D]**</span> samples from diagnosis

<span style="color:green">**[EOI]**</span> samples from end of induction

<span style="color:green">**[R]**</span> samples from relapse 

In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

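<span style="color:red">**IMPORTANT:**</span> before running the next cell, rename the files inside each sample directory to remove their prefixes, so that every directory contains exactly the three file names listed below.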

1. `barcodes.tsv.gz`: cell barcodes, which go into `.obs`
2. `features.tsv.gz`: gene names, which go into `.var`
3. `matrix.mtx.gz`: the expression matrix, which goes into `.X`

In [2]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE235923')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE235923')

# Make sure the output directory exists before the first sample is written
write_directory.mkdir(parents=True, exist_ok=True)

# Every sub-directory of `data_directory` is read as one sample
# (barcodes/features/matrix files) and written out as `<sample>_uni.h5ad`.
for sample_directory in data_directory.iterdir():
    # Only directories can hold a barcodes/features/matrix triplet; stray
    # files next to them would make `read_10x_mtx` fail.
    if not sample_directory.is_dir():
        continue

    # `.name` (not `.stem`) so a dot inside the directory name is not
    # mistaken for a file extension and cut off.
    sample_name = sample_directory.name
    sample_h5ad = sample_name + '_uni.h5ad'

    sample = sc.read_10x_mtx(
        sample_directory,
        var_names='gene_symbols',
        cache=False
    )

    # Build the per-cell annotation table, indexed by the original barcodes
    obs_metrics = pd.DataFrame(index=sample.obs_names)
    obs_metrics['cancer_type'] = 'acute_myeloid_leukemia'
    obs_metrics['dataset'] = 'GSE235923'
    obs_metrics['tissue'] = 'bone_marrow'
    # NOTE(review): this is dataset + barcode only, so the same barcode
    # occurring in two samples yields the same `uni_barcode` - confirm that
    # this is acceptable before concatenating samples downstream.
    obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

    # Re-index by `uni_barcode` (kept as a column as well) and assign through
    # the AnnData setter instead of mutating `.obs` in place.
    sample.obs = obs_metrics.set_index("uni_barcode", drop=False)
    print(sample)

    # Save the AnnData object
    output_path = write_directory / sample_h5ad
    sample.write_h5ad(output_path, compression="gzip")

AnnData object with n_obs × n_vars = 3495 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 3758 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 2214 × 36601
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 5041 × 36601
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 2413 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 9588 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 3057 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_

In [9]:
# Add "sample_barcode" and "disease_progression" info in .obs for each sample
from pathlib import Path

# Specify directory paths
rdm_directory = Path('/QRISdata/Q6104/Xiaohan/2_AnnData_objs/GSE235923')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE235923')

# Make sure the output directory exists before the first sample is written
write_directory.mkdir(parents=True, exist_ok=True)

# File-name suffix that marks the per-sample AnnData files
h5ad_suffix = "_uni.h5ad"

# Last character of the sample name -> disease progression label.
# 'x' covers sample names ending in "Dx", which are diagnosis samples too.
progression_by_letter = {
    'D': 'diagnosis',
    'x': 'diagnosis',
    'E': 'end_of_induction',
    'R': 'relapse',
}

for file in rdm_directory.iterdir():
    # Match on the suffix only, so unrelated files are never picked up
    if not file.name.endswith(h5ad_suffix):
        continue

    sample = anndata.read_h5ad(file)

    # Sample name = file name without the "_uni.h5ad" suffix
    sample_barcode = file.name[:-len(h5ad_suffix)]
    letter = sample_barcode[-1:]

    # Fail loudly on an unexpected name instead of hitting a NameError or
    # silently reusing the label assigned to the previous sample.
    if letter not in progression_by_letter:
        raise ValueError(
            "Cannot infer disease progression from file name '{}': "
            "expected the sample name to end in D, Dx, E or R".format(file.name)
        )

    sample.obs['disease_progression'] = progression_by_letter[letter]
    sample.obs['sample_barcode'] = sample_barcode

    # Save the AnnData object under the same file name
    output_path = write_directory / file.name
    sample.write_h5ad(output_path, compression="gzip")


### 3. Confirmation of created AnnData objects

In [10]:
from pathlib import Path

# Directory holding the annotated per-sample .h5ad files
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE235923')

# Distinct labels collected over all samples
disease_progressions = set()
sample_barcodes = set()

for h5ad_path in write_directory.iterdir():
    # Anything that is not a "_uni.h5ad" file is ignored
    if "_uni.h5ad" not in h5ad_path.name:
        continue

    sample = anndata.read_h5ad(h5ad_path)
    disease_progressions |= set(sample.obs['disease_progression'])
    sample_barcodes |= set(sample.obs['sample_barcode'])

# Report the labels found and how many distinct samples they come from
for summary in (disease_progressions, sample_barcodes, len(sample_barcodes)):
    print(summary)

{'relapse', 'end_of_induction', 'diagnosis'}
{'GSM7512021_Sample17D', 'GSM7512003_Sample5D', 'GSM7511998_Sample1D', 'GSM7512010_Sample8D', 'GSM7512014_Sample12D', 'GSM7512002_Sample4D', 'GSM7512025_Sample19D', 'GSM7512000_Sample3D', 'GSM7512016_Sample14D', 'GSM7512022_Sample17E', 'GSM7512009_Sample7D', 'GSM7512027_Sample20D', 'GSM7512005_Sample5R', 'GSM7512013_Sample11D', 'GSM7512026_Sample19E', 'GSM7512004_Sample5E', 'GSM7512018_Sample15E', 'GSM7512015_Sample13D', 'GSM7512017_Sample14E', 'GSM7512007_Sample6E', 'GSM7512011_Sample9D', 'GSM7512028_Sample20E', 'GSM7511999_Sample2D', 'GSM7512023_Sample18Dx', 'GSM7512012_Sample10D', 'GSM7512019_Sample16D', 'GSM7512008_Sample6R', 'GSM7512006_Sample6D', 'GSM7512020_Sample16E', 'GSM7512001_Sample3E', 'GSM7512024_Sample18E'}
31


In [11]:
# Spot check: `sample` is the AnnData object left over from the last file
# visited by the confirmation loop above, so this shows one sample's .obs.
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,uni_barcode,disease_progression,sample_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GSE235923_AAACCTGAGAGCTATA-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_AAACCTGAGAGCTATA-1,diagnosis,GSM7512014_Sample12D
GSE235923_AAACCTGAGGGTTCCC-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_AAACCTGAGGGTTCCC-1,diagnosis,GSM7512014_Sample12D
GSE235923_AAACCTGGTTCTGTTT-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_AAACCTGGTTCTGTTT-1,diagnosis,GSM7512014_Sample12D
GSE235923_AAACGGGCACCCTATC-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_AAACGGGCACCCTATC-1,diagnosis,GSM7512014_Sample12D
GSE235923_AAACGGGCAGGACCCT-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_AAACGGGCAGGACCCT-1,diagnosis,GSM7512014_Sample12D
...,...,...,...,...,...,...
GSE235923_TTTGTCAAGAAGGTGA-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_TTTGTCAAGAAGGTGA-1,diagnosis,GSM7512014_Sample12D
GSE235923_TTTGTCAAGTGAACAT-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_TTTGTCAAGTGAACAT-1,diagnosis,GSM7512014_Sample12D
GSE235923_TTTGTCACAAGTTAAG-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_TTTGTCACAAGTTAAG-1,diagnosis,GSM7512014_Sample12D
GSE235923_TTTGTCAGTTATGCGT-1,acute_myeloid_leukemia,GSE235923,bone_marrow,GSE235923_TTTGTCAGTTATGCGT-1,diagnosis,GSM7512014_Sample12D


### 4. Convert AnnData objects to SingleCellExperiment objects

In [12]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Directory that holds the .h5ad inputs and receives the .rds outputs
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE235923')

for h5ad_path in write_directory.iterdir():
    sample_name = h5ad_path.stem
    # Only "_uni.h5ad" files are converted; everything else is skipped
    if "_uni.h5ad" not in h5ad_path.name:
        continue

    sample_anndata = anndata.read_h5ad(h5ad_path)

    # Convert the AnnData object to its R counterpart; the anndata2ri
    # converter is active only inside this `with` block.
    with localconverter(anndata2ri.converter):
        sample_sce = anndata2ri.py2rpy(sample_anndata)

    # Bind the object in R's global environment under the name that the
    # saveRDS command below refers to.
    robjects.globalenv["sample_sce"] = sample_sce

    # Write `<file stem>.rds` next to the .h5ad file
    sample_sce_file = sample_name + ".rds"
    sample_sce_path = write_directory / sample_sce_file
    save_command = "saveRDS(sample_sce, file='{}')".format(sample_sce_path)
    robjects.r(save_command)

In [14]:
# Sanity check: `sample_sce` is the object produced for the last file handled
# by the conversion loop above; printing it shows its R-side summary.
print(sample_sce)

class: SingleCellExperiment 
dim: 33538 2276 
metadata(0):
assays(1): X
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(2): gene_ids feature_types
colnames(2276): GSE235923_AAACCTGAGAGCTATA-1
  GSE235923_AAACCTGAGGGTTCCC-1 ... GSE235923_TTTGTCAGTTATGCGT-1
  GSE235923_TTTGTCATCTACTTAC-1
colData names(6): cancer_type dataset ... disease_progression
  sample_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

