### 1. General info of dataset GSE192906

This is the Jupyter Notebook for dataset GSE192906. Its dataset includes an overall big txt file. As seen below, in the txt file, each row is a gene and each column is a cell.

Thus, we need to transform this txt file and generate an overall AnnData object for all samples.

NB: neuroblastoma

GNB: ganglioneuroblastoma

In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

In [5]:
import os
os.getcwd()

'/scratch/user/uqjsaxo1'

In [6]:
# inspect the first dataset
path = '/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE192906_RAW/GSM5768743_NB1_UMI_COUNTS_RAW.txt'
input = pd.read_csv(path, sep='\t', index_col=0) # the first column contains gene names and is the index

print(input.head()) 
print(input.shape) # (23686 rows, 4058 columns)

             A1_1_0316_AACGAGGT  A1_1_0316_AAGCACAT  A1_1_0316_ACACCGTG  \
MIR1302-2HG                   0                   0                   0   
FAM138A                       0                   0                   0   
OR4F5                         0                   0                   0   
AL627309.1                    0                   0                   0   
AL627309.3                    0                   0                   0   

             A1_1_0316_ACCTCAGC  A1_1_0316_ACTGTTTG  A1_1_0316_AGCTCCTT  \
MIR1302-2HG                   0                   0                   0   
FAM138A                       0                   0                   0   
OR4F5                         0                   0                   0   
AL627309.1                    0                   0                   0   
AL627309.3                    0                   0                   0   

             A1_1_0316_ATTTAGCG  A1_1_0316_CACACTGA  A1_1_0316_CACAGCAT  \
MIR1302-2HG            

As shown above, the dataset contains 4058 cells and 23686 genes.

### 2. Overall AnnData object of the dataset

<span style="color:red">**IMPORTANT:**</span> transpose the DataFrame.values to match the AnnData.X

1. `DataFrame.columns`: cell barcodes, which go into `.obs`
2. `DataFrame.index`: gene names, `.var`
3. `DataFrame.values`: the transpose of the expression matrix, `.X`

In [11]:
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/uqjsaxo1/xiaohan-john-project/data/GSE192906_RAW/')

# Loop through all files in the directory
for file in data_directory.iterdir():
    sample = pd.read_csv(file, sep='\t', index_col=0)
    matrix = scipy.sparse.csr_matrix(sample.values.T)
    obs_name = pd.DataFrame(index=sample.columns)
    var_name = pd.DataFrame(sample.index)
    var_name.rename(columns={0: 'gene_symbols'}, inplace=True)

    sample = anndata.AnnData(X=matrix, obs=obs_name, var=var_name)
    print(sample)

    # Create an observation metric info to store related features
    obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers
    if 'GNB' in file.stem:
        obs_metrics['cancer_type'] = 'Ganglioneuroblastoma'
    elif 'NB' in file.stem:
        obs_metrics['cancer_type'] = 'Neuroblastoma'
    else: 
        obs_metrics['cancer_type'] = 'Ganglioneuroma' 
    obs_metrics['dataset'] = 'GSE192906'
    obs_metrics['tissue'] = 'Neuronal'
    obs_metrics['sample_barcode'] = file.stem
    obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

    sample.obs = obs_metrics
    sample.obs.set_index("uni_barcode", drop=False, inplace=True)
    print(sample)

    # save the anndata object
    sample.write_h5ad(f'/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE192906/{file.stem}_uni.h5ad', compression="gzip")



AnnData object with n_obs × n_vars = 740 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 740 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 445 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 445 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 357 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 357 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 360 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 360 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 768 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 768 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 960 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 960 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 639 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 639 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 551 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 551 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 1052 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 1052 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'




AnnData object with n_obs × n_vars = 1053 × 33514
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 1053 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'


In [12]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE192906_A1_1_0314_AACGAGGT,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_AACGAGGT
GSE192906_A1_1_0314_AAGCACAT,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_AAGCACAT
GSE192906_A1_1_0314_ACACCGTG,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_ACACCGTG
GSE192906_A1_1_0314_ACCTCAGC,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_ACCTCAGC
GSE192906_A1_1_0314_ACTGTTTG,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_ACTGTTTG
...,...,...,...,...,...
GSE192906_K2_2_0314_TCTCACAC,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TCTCACAC
GSE192906_K2_2_0314_TGGAGCTC,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TGGAGCTC
GSE192906_K2_2_0314_TGTACCAA,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TGTACCAA
GSE192906_K2_2_0314_TTACGGGT,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TTACGGGT


### 3. Confirmation of created AnnData object

In [13]:
output = '/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE192906/GSM5768743_NB1_UMI_COUNTS_RAW_uni.h5ad'
sample = anndata.read_h5ad(output)
print(sample)

AnnData object with n_obs × n_vars = 960 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'


### 4. Convert AnnData objects to SingleCellExperiment objects

In [14]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE102130')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        print(sample_sce)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))