### 1. General info of dataset GSE137804

This is the Jupyter Notebook for dataset GSE137804. Its dataset includes a xls file for each sample. As seen below, in the txt file, each row is a gene and each column is a cell.

Thus, we need to transform this txt file and generate the AnnData object for each sample. In total, there are 22 samples.

In [2]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

In [24]:
# inspect one tumor dataset generated from 10x Genomics v2
tumor_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSM4088774_T10_gene_cell_exprs_table.xls.gz'
tumor_input = pd.read_csv(tumor_path, sep='\t', index_col=1) # the second column contains gene names and is the index

print(tumor_input.head()) 
print(tumor_input.shape) 

                  Gene_ID  AAACCTGAGCTCAACT.1  AAACCTGAGGAGCGTT.1  \
Symbol                                                              
TSPAN6    ENSG00000000003                   1                   0   
DPM1      ENSG00000000419                   0                   1   
SCYL3     ENSG00000000457                   0                   0   
C1orf112  ENSG00000000460                   0                   0   
FGR       ENSG00000000938                   0                   0   

          AAACCTGCACCAGATT.1  AAACCTGGTCAGAATA.1  AAACCTGTCTTTAGTC.1  \
Symbol                                                                 
TSPAN6                     0                   0                   0   
DPM1                       0                   1                   0   
SCYL3                      0                   0                   0   
C1orf112                   0                   0                   0   
FGR                        0                   0                   0   

          A

In [14]:
# inspect one tumor dataset generated from 10x Genomics v3
tumor_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSM4654672_T200_gene_cell_exprs_table.xls.gz'
tumor_input = pd.read_csv(tumor_path, sep='\t', index_col=0) # the second column contains gene names and is the index

print(tumor_input.head()) 
print(tumor_input.shape) 

             AAACCCAAGATACATG.1  AAACCCAAGATGGGCT.1  AAACCCAAGCGTTAGG.1  \
MIR1302-2HG                   0                   0                   0   
FAM138A                       0                   0                   0   
OR4F5                         0                   0                   0   
AL627309.1                    0                   0                   0   
AL627309.3                    0                   0                   0   

             AAACCCAAGGTAGCAC.1  AAACCCAAGGTCACAG.1  AAACCCAAGTTCATCG.1  \
MIR1302-2HG                   0                   0                   0   
FAM138A                       0                   0                   0   
OR4F5                         0                   0                   0   
AL627309.1                    0                   0                   0   
AL627309.3                    0                   0                   0   

             AAACCCACAACAAGAT.1  AAACCCACACCCAAGC.1  AAACCCACACCTATCC.1  \
MIR1302-2HG            

In [3]:
# inspect the tumor annotation file
tumor_annotation_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSE137804_tumor_dataset_annotation.csv.gz'
tumor_annotation = pd.read_csv(tumor_annotation_path, sep=',', index_col=0) 

tumor_annotation

Unnamed: 0_level_0,sample,celltype
cellname,Unnamed: 1_level_1,Unnamed: 2_level_1
T10_AAGACCTAGTACTTGC.1,T10,tumor
T10_TTTACTGCATGTAGTC.1,T10,tumor
T10_AGAGCGAAGTTGAGTA.1,T10,tumor
T10_CACCTTGAGTGTGGCA.1,T10,tumor
T10_GAAATGAGTGGTACAG.1,T10,tumor
...,...,...
T230_TGTAACGTCCTAAGTG-1,T230,Tcell
T230_TTCATGTGTGATAGTA-1,T230,Tcell
T230_TTCGGTCAGTCGAAAT-1,T230,Tcell
T230_TTCTTCCAGGGAGTTC-1,T230,Tcell


In [76]:
# inspect the embryo dataset
embryo_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSM4654667_F366_gene_cell_exprs_table.xls.gz'
embryo_input = pd.read_csv(embryo_path, sep='\t', index_col=0) # the second column contains gene names and is the index

print(embryo_input.head()) 

             AAACCCAAGACTAAGT.1  AAACCCAAGAGGCGGA.1  AAACCCAAGGATACCG.1  \
MIR1302-2HG                   0                   0                   0   
FAM138A                       0                   0                   0   
OR4F5                         0                   0                   0   
AL627309.1                    0                   0                   0   
AL627309.3                    0                   0                   0   

             AAACCCAAGGTTGGAC.1  AAACCCAAGTCAGGGT.1  AAACCCAAGTCTACCA.1  \
MIR1302-2HG                   0                   0                   0   
FAM138A                       0                   0                   0   
OR4F5                         0                   0                   0   
AL627309.1                    0                   0                   0   
AL627309.3                    0                   0                   0   

             AAACCCAAGTCTCCTC.1  AAACCCACAAGCACCC.1  AAACCCACAAGTGCAG.1  \
MIR1302-2HG            

In [4]:
# inspect the embryo annotation file
embryo_annotation_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSE137804_Embryo_annotation.csv.gz'
embryo_annotation = pd.read_csv(embryo_annotation_path, sep=',', index_col=0) 

embryo_annotation

Unnamed: 0_level_0,sample,annotation
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGATTGAGT.1,F379,Mesenchyme
AAACCCACAACCGTGC.1,F379,Motor neuron
AAACCCACAAGAAATC.1,F379,Mesenchyme
AAACCCACACGGTGCT.1,F379,Cadiomycytes
AAACCCACATGAATCC.1,F379,Neural crest
...,...,...
TTTGTTGTCCATGATG.1,F366,Mesenchyme
TTTGTTGTCCCATTTA.1,F366,Mesenchyme
TTTGTTGTCCGCATAA.1,F366,Mesenchyme
TTTGTTGTCCTATGGA.1,F366,Gut endoderm epithelium


The count matrix file for embryos is different from the one for tumor samples, so need to process separately

In [87]:
# inspect the adrenal dataset
adrenal_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSM4088785_F2_gene_cell_exprs_table.xls.gz'
adrenal_input = pd.read_csv(adrenal_path, sep='\t', index_col=1) # the second column contains gene names and is the index

print(adrenal_input.head()) 

                  Gene_ID  AAACCTGAGACCTTTG.1  AAACCTGAGCGTTGCC.1  \
Symbol                                                              
TSPAN6    ENSG00000000003                   1                   0   
TNMD      ENSG00000000005                   0                   0   
DPM1      ENSG00000000419                   0                   0   
SCYL3     ENSG00000000457                   0                   0   
C1orf112  ENSG00000000460                   0                   0   

          AAACCTGAGGAATTAC.1  AAACCTGAGGTGCAAC.1  AAACCTGAGTACTTGC.1  \
Symbol                                                                 
TSPAN6                     1                   0                   1   
TNMD                       0                   0                   0   
DPM1                       1                   0                   1   
SCYL3                      0                   0                   0   
C1orf112                   0                   0                   0   

          A

In [5]:
# inspect the adrenal annotation file
adrenal_annotation_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE137804/GSE137804_Adrenal_gland_annotation.csv.gz'
adrenal_annotation = pd.read_csv(adrenal_annotation_path, sep=',', index_col=0) 

adrenal_annotation

Unnamed: 0_level_0,sample,annotation
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AAATGCCAGCTGTCTA.1_1,F2,Steroidogenic cells
AAATGCCTCACAATGC.1_1,F2,Steroidogenic cells
AACACGTGTCATGCCG.1_1,F2,Steroidogenic cells
AACCGCGAGATCTGAA.1_1,F2,Steroidogenic cells
AACGTTGTCGCTTAGA.1_1,F2,Steroidogenic cells
...,...,...
TTGACTTGTGTTCGAT.1_4,F107,SCPs
TTGGCAAAGAGGTTAT.1_4,F107,SCPs
TTGTAGGAGGCGTACA.1_4,F107,SCPs
TTTGGTTAGATGCCTT.1_4,F107,SCPs


<span style="color:red">**PROBLEM:**</span> the cell names between the count matrix file and the annotation file are different

### 2. Overall AnnData object of the dataset

<span style="color:red">**IMPORTANT:**</span> transpose the DataFrame.values to match the AnnData.X

1. `DataFrame.columns`: cell barcodes, which go into `.obs`
2. `DataFrame.index`: gene names, `.var`
3. `DataFrame.values`: the transpose of the expression matrix, `.X`

In [16]:
# Load the metadata

embryo = ['F366', 'F379']
adrenal = ['F2', 'F7', 'F106', 'F107']
tumor = ['T10', 'T19', 'T27', 'T34', 'T40', 'T44', 'T69', 'T71', 'T75', 
         'T92', 'T162', 'T175', 'T188', 'T200', 'T214', 'T230'] # the downloaded sample file includes T175 instead of T174

v2 = adrenal + ['T10', 'T19', 'T27', 'T34', 'T40', 'T44', 'T69', 'T71', 'T75', 'T92']
v3 = embryo + ['T162', 'T175', 'T188', 'T200', 'T214', 'T230']

tissue_dict = { # can simplify this if have time later
    'GSM4088774': 'adrenal_gland', 
    'GSM4088775': 'adrenal_gland',
    'GSM4088776': 'adrenal_gland',
    'GSM4088777': 'adrenal_gland',
    'GSM4088778': 'adrenal_gland',
    'GSM4088779': 'adrenal_gland',
    'GSM4088780': 'adrenal_gland',
    'GSM4088781': 'adrenal_gland',
    'GSM4088782': 'adrenal_gland',
    'GSM4088783': 'adrenal_gland',
    'GSM4088785': 'adrenal_gland',
    'GSM4088786': 'adrenal_gland',
    'GSM4088787': 'adrenal_gland',
    'GSM4088788': 'adrenal_gland',
    'GSM4654667': 'embryo', 
    'GSM4654668': 'embryo',
    'GSM4654669': 'adrenal_gland',
    'GSM4654670': 'adrenal_gland',
    'GSM4654671': 'adrenal_gland',
    'GSM4654672': 'adrenal_gland',
    'GSM4654673': 'adrenal_gland',
    'GSM4654674': 'adrenal_gland',
}

sex_age_cancer_dict = {
    'T10': ['male', 3, 'ganglioneuroblastoma'],
    'T19': ['female', 2, 'ganglioneuroblastoma'],
    'T27': ['male', 3, 'neuroblastoma'],
    'T34': ['female', 3, 'neuroblastoma'],
    'T40': ['male', 0.17, 'neuroblastoma'],
    'T44': ['female', 0.17, 'neuroblastoma'],
    'T69': ['male', 2, 'neuroblastoma'],
    'T71': ['male', 0.01, 'neuroblastoma'],
    'T75': ['female', 0.75, 'neuroblastoma'],
    'T92': ['female', 3, 'neuroblastoma'],
    'T162': ['male', 2, 'neuroblastoma'],
    'T175': ['male', 0.06, 'neuroblastoma'],
    'T188': ['male', 0.25, 'neuroblastoma'],
    'T200': ['female', 2, 'neuroblastoma'],
    'T214': ['female', 0.25, 'neuroblastoma'],
    'T230': ['male', 4, 'neuroblastoma'],
}

adrenal_cell_suffix = {
    'F2': '_1',	
    'F7': '_2',
    'F106': '_3',	
    'F107': '_4',
}

In [22]:
import warnings
from pathlib import Path

warnings.filterwarnings("ignore", category=UserWarning)

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE137804')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE137804')

# Loop through all files in the directory
for file in data_directory.iterdir():
    if 'annotation' not in file.stem:
        gsm, patient_id = file.stem.split('_')[:2]
        
        if patient_id in v3: # it is embryo sample
            input = pd.read_csv(file, sep='\t', index_col=0)
            var_df = pd.DataFrame(index=input.index)
        else:
            input = pd.read_csv(file, sep='\t', index_col=1)
            var_df = input.loc[:, ['Gene_ID']]
            input.drop(columns=['Gene_ID'], inplace=True)

        matrix = scipy.sparse.csr_matrix(input.values.T)
        obs_name = pd.DataFrame(index=input.columns)
        var_name = var_df
        var_name.rename_axis('gene_symbols', inplace=True)

        sample = anndata.AnnData(X=matrix, obs=obs_name, var=var_name)

        # Create an observation metric info to store related features
        obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers

        obs_metrics['dataset'] = 'GSE137804'
        obs_metrics['tissue'] = tissue_dict[gsm]

        # Load the metadata
        if patient_id in tumor: # it is embryo sample
            obs_metrics['sex'] = sex_age_cancer_dict[patient_id][0]
            obs_metrics['age'] = sex_age_cancer_dict[patient_id][1]
            obs_metrics['cancer_type'] = sex_age_cancer_dict[patient_id][2]
        else: 
            obs_metrics['sex'] = 'unknown'
            obs_metrics['age'] = 'unknown'
            obs_metrics['cancer_type'] = 'neuroblastoma'
            obs_metrics['disease_progression'] = 'healthy_donor'

        # Load the cell annotation
        if patient_id in tumor:
            obs_metrics['matched_cell_id'] = [patient_id + '_' + x for x in obs_metrics.index.tolist()]
            obs_metrics.set_index('matched_cell_id', drop=True, inplace=True)
            obs_metrics = obs_metrics.merge(tumor_annotation[['celltype']], how='left', left_index=True, right_index=True)
            obs_metrics.rename(columns={'celltype': 'cell_type_from_paper'}, inplace=True)
        elif patient_id in embryo:
            obs_metrics = obs_metrics.merge(embryo_annotation[['annotation']], how='left', left_index=True, right_index=True)
            obs_metrics.rename(columns={'annotation': 'cell_type_from_paper'}, inplace=True)
        else:
            obs_metrics['matched_cell_id'] = [x + adrenal_cell_suffix[patient_id] for x in obs_metrics.index.tolist()]
            obs_metrics.set_index('matched_cell_id', drop=True, inplace=True)
            obs_metrics = obs_metrics.merge(adrenal_annotation[['annotation']], how='left', left_index=True, right_index=True)
            obs_metrics.rename(columns={'annotation': 'cell_type_from_paper'}, inplace=True)

        obs_metrics['cell_type_from_paper'] = obs_metrics['cell_type_from_paper'].astype(str)
        obs_metrics['sample_barcode'] = file.stem.split('_gene_cell_exprs_table.xls')[0]
        obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

        sample.obs = obs_metrics
        sample.obs.set_index("uni_barcode", drop=False, inplace=True)
        print(sample)

        # save the anndata object
        sample_h5ad = gsm + '_' + patient_id + '_uni.h5ad'
        output_path = write_directory / sample_h5ad
        sample.write_h5ad(output_path, compression="gzip")

AnnData object with n_obs × n_vars = 16053 × 17811
    obs: 'dataset', 'tissue', 'sex', 'age', 'cancer_type', 'cell_type_from_paper', 'sample_barcode', 'uni_barcode'
    var: 'Gene_ID'
AnnData object with n_obs × n_vars = 13569 × 33538
    obs: 'dataset', 'tissue', 'sex', 'age', 'cancer_type', 'cell_type_from_paper', 'sample_barcode', 'uni_barcode'


: 

: 

: 

In [18]:
sample.var

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
AC233755.2
AC233755.1
AC240274.1
AC213203.1
FAM231C


In [19]:
sample.obs

Unnamed: 0_level_0,dataset,tissue,sex,age,cancer_type,cell_type_from_paper,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GSE137804_T175_AAACCCAAGAGTGTTA.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_AAACCCAAGAGTGTTA.1
GSE137804_T175_AAACCCAAGCCTGGAA.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_AAACCCAAGCCTGGAA.1
GSE137804_T175_AAACCCAAGCGGACAT.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_AAACCCAAGCGGACAT.1
GSE137804_T175_AAACCCAAGTCTGTAC.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_AAACCCAAGTCTGTAC.1
GSE137804_T175_AAACCCACACAATGCT.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_AAACCCACACAATGCT.1
...,...,...,...,...,...,...,...,...
GSE137804_T175_TTTGTTGGTTGATCGT.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_TTTGTTGGTTGATCGT.1
GSE137804_T175_TTTGTTGGTTTCACTT.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_TTTGTTGGTTTCACTT.1
GSE137804_T175_TTTGTTGTCAATCGGT.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_TTTGTTGTCAATCGGT.1
GSE137804_T175_TTTGTTGTCAGAACCT.1,GSE137804,adrenal_gland,male,0.06,neuroblastoma,,GSM4654670_T175,GSE137804_T175_TTTGTTGTCAGAACCT.1


In [20]:
gsm, patient_id

('GSM4654670', 'T175')

In [92]:
sample.obs[sample.obs['cell_type_from_paper'].isna()]

Unnamed: 0_level_0,dataset,tissue,sex,age,cancer_type,cell_type_from_paper,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GSE137804_AAACCTGAGGTGCAAC.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_AAACCTGAGGTGCAAC.1_1
GSE137804_AAACCTGAGTGTTTGC.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_AAACCTGAGTGTTTGC.1_1
GSE137804_AAACCTGCATTCGACA.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_AAACCTGCATTCGACA.1_1
GSE137804_AAACCTGGTGCCTGCA.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_AAACCTGGTGCCTGCA.1_1
GSE137804_AAACGGGAGGGAAACA.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_AAACGGGAGGGAAACA.1_1
...,...,...,...,...,...,...,...,...
GSE137804_TTTGTCACAATGTAAG.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_TTTGTCACAATGTAAG.1_1
GSE137804_TTTGTCACACTTCGAA.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_TTTGTCACACTTCGAA.1_1
GSE137804_TTTGTCAGTGACAAAT.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_TTTGTCAGTGACAAAT.1_1
GSE137804_TTTGTCAGTGAGGGTT.1_1,GSE137804,adrenal_gland,unknown,unknown,neuroblastoma,,GSM4088780_T69_gene_cell_exprs_table.xls,GSE137804_TTTGTCAGTGAGGGTT.1_1


In [36]:
tumor_annotation

Unnamed: 0_level_0,sample,celltype
matched_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AAGACCTAGTACTTGC.1,T10,tumor
TTTACTGCATGTAGTC.1,T10,tumor
AGAGCGAAGTTGAGTA.1,T10,tumor
CACCTTGAGTGTGGCA.1,T10,tumor
GAAATGAGTGGTACAG.1,T10,tumor
...,...,...
TGTAACGTCCTAAGTG-1,T230,Tcell
TTCATGTGTGATAGTA-1,T230,Tcell
TTCGGTCAGTCGAAAT-1,T230,Tcell
TTCTTCCAGGGAGTTC-1,T230,Tcell


In [250]:
input

Unnamed: 0_level_0,AAACCTGAGACGCAAC.1,AAACCTGAGACGCACA.1,AAACCTGAGACTACAA.1,AAACCTGAGAGAGCTC.1,AAACCTGAGCAGACTG.1,AAACCTGAGCCCGAAA.1,AAACCTGAGTACGCGA.1,AAACCTGAGTATTGGA.1,AAACCTGCACAGGAGT.1,AAACCTGCACATCCGG.1,...,TTTGTCAGTAGAGTGC.1,TTTGTCAGTCTCGTTC.1,TTTGTCAGTGCATCTA.1,TTTGTCAGTGCTAGCC.1,TTTGTCATCAAGCCTA.1,TTTGTCATCACATACG.1,TTTGTCATCCAAACTG.1,TTTGTCATCCTGCAGG.1,TTTGTCATCGTTTATC.1,TTTGTCATCTTACCTA.1
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,1,1,0,1
DPM1,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,0,2,1,2,1
SCYL3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1orf112,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
FGR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RP1-34B20.21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-511P7.5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-736N17.11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-11M20.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE192906_A1_1_0314_AACGAGGT,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_AACGAGGT
GSE192906_A1_1_0314_AAGCACAT,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_AAGCACAT
GSE192906_A1_1_0314_ACACCGTG,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_ACACCGTG
GSE192906_A1_1_0314_ACCTCAGC,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_ACCTCAGC
GSE192906_A1_1_0314_ACTGTTTG,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_A1_1_0314_ACTGTTTG
...,...,...,...,...,...
GSE192906_K2_2_0314_TCTCACAC,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TCTCACAC
GSE192906_K2_2_0314_TGGAGCTC,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TGGAGCTC
GSE192906_K2_2_0314_TGTACCAA,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TGTACCAA
GSE192906_K2_2_0314_TTACGGGT,Ganglioneuroblastoma,GSE192906,Neuronal,GSM5768750_GNB3_UMI_COUNTS_RAW,GSE192906_K2_2_0314_TTACGGGT


### 3. Confirmation of created AnnData object

In [13]:
output = '/scratch/user/uqjsaxo1/xiaohan-john-project/write/GSE192906/GSM5768743_NB1_UMI_COUNTS_RAW_uni.h5ad'
sample = anndata.read_h5ad(output)
print(sample)

AnnData object with n_obs × n_vars = 960 × 33514
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_symbols'


### 4. Convert AnnData objects to SingleCellExperiment objects

In [14]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE102130')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        print(sample_sce)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))