### 1. General info of dataset GSE223373

This is the Jupyter Notebook for dataset GSE223373. Its dataset includes barcodes/genes/matrix files for each sample. There are 3 samples.
	
GSM5139852	RB01_mRNA_rep1

GSM5139853	RB01_mRNA_rep2

GSM5139854	RB02_mRNA_rep1

GSM5139855	RB02_mRNA_rep2

GSM5139856	RB03_mRNA_rep1

GSM5139857	RB03_mRNA_rep2

GSM5139858	RB04_mRNA

GSM5139859	RB05_mRNA

GSM5139860	RB06_mRNA

GSM5139861	RB07_mRNA

In [2]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

### 2. AnnData object of each sample

<span style="color:red">**IMPORTANT:**</span> rename files to get rid of prefixes

1. `barcodes.tsv`: cell barcodes, which go into `.obs`
2. `genes.tsv`: gene names, `.var`
3. `matrix.mtx`: the expression matrix, `.X`

In [2]:
from pathlib import Path

In [5]:
data_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE168434_RAW/')
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE168434/')

In [9]:
import os

In [16]:
# Original files were nexted in multiple subdirectories, so must create new directories and manually move files

dir_list = data_directory.iterdir()
for sample_directory in dir_list:
    os.mkdir('/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE168434_RAW/'+str.split(sample_directory.stem,'.')[0])

In [18]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE168434_RAW/')
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE168434/')

# Loop through all files in the directory
for sample_directory in data_directory.iterdir():
    sample_name = sample_directory.stem
    sample_h5ad = sample_name + '_uni.h5ad'

    sample = sc.read_10x_mtx(
    sample_directory,
    var_names='gene_symbols',  
    cache=False
    )

    # Create an observation metric info to store related features
    obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers
    obs_metrics['cancer_type'] = 'Retinoblastoma'
    obs_metrics['dataset'] = 'GSE168434'
    obs_metrics['tissue'] = 'Retina'
    obs_metrics['sample_barcode'] = sample_name
    obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)
    
    sample.obs = obs_metrics
    sample.obs.set_index("uni_barcode", drop=True, inplace=True)
    print(sample)

    # save the anndata object
    output_path = write_directory.joinpath(sample_h5ad)
    sample.write_h5ad(output_path, compression="gzip")

AnnData object with n_obs × n_vars = 14407 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6990 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 7596 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6826 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 7638 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 14681 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 2140 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_b

### 3. Confirmation of created AnnData objects

In [4]:
from pathlib import Path

# Specify directory paths
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE168434/')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample = anndata.read_h5ad(file)
    print(sample)

AnnData object with n_obs × n_vars = 4185 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 7638 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 14681 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 2140 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 6990 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 14407 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 14093 × 33538
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_

In [20]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GSE168434_AAACCTGAGAAGAAGC-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_AAACCTGAGACAAGCC-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_AAACCTGAGCGCCTCA-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_AAACCTGAGGCGACAT-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_AAACCTGAGGTAGCCA-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
...,...,...,...,...
GSE168434_TTTGTCATCCCATTTA-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_TTTGTCATCCGTACAA-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_TTTGTCATCGAATGCT-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts
GSE168434_TTTGTCATCGTCCGTT-1,Retinoblastoma,GSE168434,Retina,GSM5139859_RB05_counts


In [21]:
sample.var

Unnamed: 0,gene_ids,feature_types
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC233755.2,ENSG00000277856,Gene Expression
AC233755.1,ENSG00000275063,Gene Expression
AC240274.1,ENSG00000271254,Gene Expression
AC213203.1,ENSG00000277475,Gene Expression


In [5]:
sample.X[1:10, 1:10].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

### 4. Convert AnnData objects to SingleCellExperiment objects

In [3]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE168434/')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        # print(sample_sce)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

PackageNotInstalledError: The R package "S4Vectors" is not installed.