### 1. General info of dataset PRJNA737188

This notebook processes dataset PRJNA737188. The raw sequencing data (two runs, SRR14800535 and SRR14800536) were downloaded and processed with Cell Ranger. The resulting `filtered_feature_bc_matrix.h5` files are the input for the steps below.

In [25]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. Inspection of processed AnnData objects

In [26]:
from pathlib import Path

# Folder that holds the Cell Ranger outputs for this dataset
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/PRJNA737188')

# One filtered feature-barcode matrix per sequencing run
SRR14800535_path = data_directory.joinpath('SRR14800535_filtered_feature_bc_matrix.h5')
SRR14800536_path = data_directory.joinpath('SRR14800536_filtered_feature_bc_matrix.h5')

# Read both matrices (in run order) into AnnData objects
SRR14800535, SRR14800536 = (
    sc.read_10x_h5(h5_path)
    for h5_path in (SRR14800535_path, SRR14800536_path)
)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [27]:
# Summary repr (dimensions and annotation columns) of run SRR14800535 as loaded above
SRR14800535

AnnData object with n_obs × n_vars = 5577 × 36601
    var: 'gene_ids', 'feature_types', 'genome'

In [28]:
# Per-gene annotation table (.var) of run SRR14800535
SRR14800535.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
AL627309.1,ENSG00000238009,Gene Expression,GRCh38
AL627309.3,ENSG00000239945,Gene Expression,GRCh38
...,...,...,...
AC141272.1,ENSG00000277836,Gene Expression,GRCh38
AC023491.2,ENSG00000278633,Gene Expression,GRCh38
AC007325.1,ENSG00000276017,Gene Expression,GRCh38
AC007325.4,ENSG00000278817,Gene Expression,GRCh38


In [29]:
# Per-cell table (.obs) of run SRR14800535; at this point it is indexed by cell barcode and has no columns
SRR14800535.obs

AAACCTGAGCCCAATT-1
AAACCTGAGTAGGCCA-1
AAACCTGAGTGCGATG-1
AAACCTGCAAAGTCAA-1
AAACCTGCACCTCGTT-1
...
TTTGTCATCCAACCAA-1
TTTGTCATCCGTACAA-1
TTTGTCATCGCGCCAA-1
TTTGTCATCGTACGGC-1
TTTGTCATCTTTCCTC-1


In [30]:
# Summary repr (dimensions and annotation columns) of run SRR14800536 as loaded above
SRR14800536

AnnData object with n_obs × n_vars = 12343 × 36601
    var: 'gene_ids', 'feature_types', 'genome'

In [31]:
# Per-gene annotation table (.var) of run SRR14800536
SRR14800536.var

Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
AL627309.1,ENSG00000238009,Gene Expression,GRCh38
AL627309.3,ENSG00000239945,Gene Expression,GRCh38
...,...,...,...
AC141272.1,ENSG00000277836,Gene Expression,GRCh38
AC023491.2,ENSG00000278633,Gene Expression,GRCh38
AC007325.1,ENSG00000276017,Gene Expression,GRCh38
AC007325.4,ENSG00000278817,Gene Expression,GRCh38


In [32]:
# Per-cell table (.obs) of run SRR14800536; at this point it is indexed by cell barcode and has no columns
SRR14800536.obs

AAACCTGAGAGTGAGA-1
AAACCTGAGCGAGAAA-1
AAACCTGAGGAGCGAG-1
AAACCTGAGGCAAAGA-1
AAACCTGAGGGTATCG-1
...
TTTGTCAGTCGAGATG-1
TTTGTCAGTTACGGAG-1
TTTGTCATCCAATGGT-1
TTTGTCATCTCCCTGA-1
TTTGTCATCTGGAGCC-1


### 3. Add sample metadata to .obs

In [33]:
# Donor metadata entered by hand, keyed by SRA run accession.
# Each value is [age, sex]; the age unit is not stated in this notebook
# (presumably years -- TODO confirm against the study metadata).
age_sex = dict(
    SRR14800535=[2, 'male'],
    SRR14800536=[2, 'female'],
)

In [34]:
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/PRJNA737188')
# Create the output folder if needed so the writes below cannot fail on a fresh checkout
write_directory.mkdir(parents=True, exist_ok=True)

adatas = (SRR14800535, SRR14800536)

SRR14800535_h5ad = write_directory / 'SRR14800535_uni.h5ad'
SRR14800536_h5ad = write_directory / 'SRR14800536_uni.h5ad'
paths = (SRR14800535_h5ad, SRR14800536_h5ad)

# Annotate each run in place with dataset-level and donor-level metadata,
# give every cell a dataset-wide unique barcode, and write the result to disk.
for adata, path in zip(adatas, paths):
    # 'SRR14800535_uni' -> 'SRR14800535'; this is also the key into `age_sex`
    patient_id = path.stem.split('_')[0]
    sample_barcode = 'PRJNA737188_' + patient_id
    barcode_prefix = sample_barcode + '_'

    # Name the gene index so it is labelled in the saved file
    adata.var.rename_axis('gene_symbols', inplace=True)
    # NOTE(review): loading these matrices emits a duplicate var-names warning.
    # Gene symbols are left untouched here; consider `adata.var_names_make_unique()`
    # if downstream tools require unique names -- confirm before changing.

    adata.obs['cancer_type'] = 'retinoblastoma'
    adata.obs['dataset'] = 'PRJNA737188'
    adata.obs['tissue'] = 'retina'
    adata.obs['age'] = age_sex[patient_id][0]
    adata.obs['sex'] = age_sex[patient_id][1]
    adata.obs['sample_barcode'] = sample_barcode

    # Prefix each cell barcode with '<dataset>_<run>_'. The guard keeps the cell
    # idempotent: after a first run the index already carries the prefix, and
    # re-running must not prepend it a second time.
    adata.obs['uni_barcode'] = [
        barcode if barcode.startswith(barcode_prefix) else barcode_prefix + barcode
        for barcode in adata.obs.index.astype(str)
    ]
    # Use the unique barcode as the index while also keeping it as a column
    adata.obs.set_index("uni_barcode", drop=False, inplace=True)

    print(adata)

    # Save the modified AnnData objects
    adata.write_h5ad(path, compression="gzip")
    

AnnData object with n_obs × n_vars = 5577 × 36601
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'
AnnData object with n_obs × n_vars = 12343 × 36601
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


In [35]:
# `adata` is the loop variable left over from the cell above, i.e. the last item of `adatas` (SRR14800536)
adata.var

Unnamed: 0_level_0,gene_ids,feature_types,genome
gene_symbols,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
AL627309.1,ENSG00000238009,Gene Expression,GRCh38
AL627309.3,ENSG00000239945,Gene Expression,GRCh38
...,...,...,...
AC141272.1,ENSG00000277836,Gene Expression,GRCh38
AC023491.2,ENSG00000278633,Gene Expression,GRCh38
AC007325.1,ENSG00000276017,Gene Expression,GRCh38
AC007325.4,ENSG00000278817,Gene Expression,GRCh38


In [36]:
# `adata` is the loop variable left over from the cell above, i.e. the last item of `adatas` (SRR14800536)
adata.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,age,sex,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PRJNA737188_SRR14800536_AAACCTGAGAGTGAGA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGAGTGAGA-1
PRJNA737188_SRR14800536_AAACCTGAGCGAGAAA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGCGAGAAA-1
PRJNA737188_SRR14800536_AAACCTGAGGAGCGAG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGGAGCGAG-1
PRJNA737188_SRR14800536_AAACCTGAGGCAAAGA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGGCAAAGA-1
PRJNA737188_SRR14800536_AAACCTGAGGGTATCG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGGGTATCG-1
...,...,...,...,...,...,...,...
PRJNA737188_SRR14800536_TTTGTCAGTCGAGATG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCAGTCGAGATG-1
PRJNA737188_SRR14800536_TTTGTCAGTTACGGAG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCAGTTACGGAG-1
PRJNA737188_SRR14800536_TTTGTCATCCAATGGT-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCATCCAATGGT-1
PRJNA737188_SRR14800536_TTTGTCATCTCCCTGA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCATCTCCCTGA-1


### 4. Confirmation of created AnnData objects

In [37]:
from pathlib import Path

write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/PRJNA737188')

# Read back every '*_uni.h5ad' file and print its summary as a sanity check.
# The glob matches on the exact suffix (a substring test would also pick up
# names such as 'x_uni.h5ad.bak'), and sorting fixes the iteration order so the
# `sample` variable left behind after the loop is the same on every run.
for file in sorted(write_directory.glob('*_uni.h5ad')):
    sample = anndata.read_h5ad(file)
    print(sample)

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 5577 × 36601
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'
AnnData object with n_obs × n_vars = 12343 × 36601
    obs: 'cancer_type', 'dataset', 'tissue', 'age', 'sex', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids', 'feature_types', 'genome'


  utils.warn_names_duplicates("var")


In [38]:
# `sample` is the loop variable left over from the read-back loop above, i.e. whichever '_uni.h5ad' file was read last
sample.var

Unnamed: 0_level_0,gene_ids,feature_types,genome
gene_symbols,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
AL627309.1,ENSG00000238009,Gene Expression,GRCh38
AL627309.3,ENSG00000239945,Gene Expression,GRCh38
...,...,...,...
AC141272.1,ENSG00000277836,Gene Expression,GRCh38
AC023491.2,ENSG00000278633,Gene Expression,GRCh38
AC007325.1,ENSG00000276017,Gene Expression,GRCh38
AC007325.4,ENSG00000278817,Gene Expression,GRCh38


In [39]:
# `sample` is the loop variable left over from the read-back loop above, i.e. whichever '_uni.h5ad' file was read last
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,age,sex,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PRJNA737188_SRR14800536_AAACCTGAGAGTGAGA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGAGTGAGA-1
PRJNA737188_SRR14800536_AAACCTGAGCGAGAAA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGCGAGAAA-1
PRJNA737188_SRR14800536_AAACCTGAGGAGCGAG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGGAGCGAG-1
PRJNA737188_SRR14800536_AAACCTGAGGCAAAGA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGGCAAAGA-1
PRJNA737188_SRR14800536_AAACCTGAGGGTATCG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_AAACCTGAGGGTATCG-1
...,...,...,...,...,...,...,...
PRJNA737188_SRR14800536_TTTGTCAGTCGAGATG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCAGTCGAGATG-1
PRJNA737188_SRR14800536_TTTGTCAGTTACGGAG-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCAGTTACGGAG-1
PRJNA737188_SRR14800536_TTTGTCATCCAATGGT-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCATCCAATGGT-1
PRJNA737188_SRR14800536_TTTGTCATCTCCCTGA-1,retinoblastoma,PRJNA737188,retina,2,female,PRJNA737188_SRR14800536,PRJNA737188_SRR14800536_TTTGTCATCTCCCTGA-1


### 5. Convert AnnData objects to SingleCellExperiment objects

In [40]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/PRJNA737188')

# Handle to R's saveRDS. Calling it as a function passes the output path as an
# argument instead of splicing it into R source text, so a path containing a
# quote or backslash cannot break the call.
save_rds = robjects.r["saveRDS"]

# Convert every '*_uni.h5ad' file to a SingleCellExperiment and save it next to
# the input as '<stem>.rds'. Exact-suffix glob plus sorting gives a fixed,
# reproducible set and order of files.
for file in sorted(write_directory.glob("*_uni.h5ad")):
    sample_name = file.stem  # e.g. 'SRR14800535_uni'
    sample_anndata = anndata.read_h5ad(file)
    sample_sce_file = sample_name + ".rds"

    # The anndata2ri converter is only active inside this context
    with localconverter(anndata2ri.converter):
        sample_sce = anndata2ri.py2rpy(sample_anndata)
        print(sample_sce)

    # Also expose the object in R's global environment under the name `sample_sce`
    robjects.globalenv["sample_sce"] = sample_sce

    # Save the sce object in .rds file
    sample_sce_path = write_directory / sample_sce_file
    save_rds(sample_sce, file=str(sample_sce_path))

  utils.warn_names_duplicates("var")
       'GGT1', 'GOLGA8M', 'GOLGA8M', 'HSPA14', 'HSPA14', 'LINC01238',
       'LINC01238', 'LINC01505', 'LINC01505', 'MATR3', 'MATR3', 'TBCE', 'TBCE',
       'TMSB15B', 'TMSB15B'],
      dtype='object', name='gene_symbols')
  return dispatch(args[0].__class__)(*args, **kw)


class: SingleCellExperiment 
dim: 36601 5577 
metadata(0):
assays(1): X
rownames(36601): MIR1302-2HG FAM138A ... AC007325.4 AC007325.2
rowData names(3): gene_ids feature_types genome
colnames(5577): PRJNA737188_SRR14800535_AAACCTGAGCCCAATT-1
  PRJNA737188_SRR14800535_AAACCTGAGTAGGCCA-1 ...
  PRJNA737188_SRR14800535_TTTGTCATCGTACGGC-1
  PRJNA737188_SRR14800535_TTTGTCATCTTTCCTC-1
colData names(7): cancer_type dataset ... sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):



  utils.warn_names_duplicates("var")
       'GGT1', 'GOLGA8M', 'GOLGA8M', 'HSPA14', 'HSPA14', 'LINC01238',
       'LINC01238', 'LINC01505', 'LINC01505', 'MATR3', 'MATR3', 'TBCE', 'TBCE',
       'TMSB15B', 'TMSB15B'],
      dtype='object', name='gene_symbols')
  return dispatch(args[0].__class__)(*args, **kw)


class: SingleCellExperiment 
dim: 36601 12343 
metadata(0):
assays(1): X
rownames(36601): MIR1302-2HG FAM138A ... AC007325.4 AC007325.2
rowData names(3): gene_ids feature_types genome
colnames(12343): PRJNA737188_SRR14800536_AAACCTGAGAGTGAGA-1
  PRJNA737188_SRR14800536_AAACCTGAGCGAGAAA-1 ...
  PRJNA737188_SRR14800536_TTTGTCATCTCCCTGA-1
  PRJNA737188_SRR14800536_TTTGTCATCTGGAGCC-1
colData names(7): cancer_type dataset ... sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

