### 1. General info of dataset GSE155446

This is the Jupyter Notebook for dataset GSE155446. The dataset consists of a single cell-metadata file and a single raw-count file that together cover all samples. As seen below, in the raw-count CSV file, each row is a gene and each column is a cell.

Thus, we need to transpose this count matrix (so that rows are cells and columns are genes) and generate one overall AnnData object for all samples.

In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

In [2]:
# Load the raw count table (genes x cells) and take a first look at it.
path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE155446/GSE155446_human_raw_counts.csv.gz'
# Gene names sit in the first CSV column, so that column becomes the row index.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the AnnData construction cell further down reads from `input`.
input = pd.read_csv(path, index_col=0)

# First rows, followed by the (genes, cells) shape: (26841, 39946)
print(input.head(), input.shape, sep='\n')

             753_AAACGAAGTCTTGGTA  753_AAACGCTTCCATGATG  753_AAACGCTTCGAGTGGA  \
gene                                                                            
MIR1302-2HG                     0                     0                     0   
OR4F5                           0                     0                     0   
AL627309.1                      0                     0                     0   
AL627309.3                      0                     0                     0   
AL732372.1                      0                     0                     0   

             753_AAAGAACCAAATGGTA  753_AAAGAACTCATTCATC  753_AAAGGGCCAGTAGTTC  \
gene                                                                            
MIR1302-2HG                     0                     0                     0   
OR4F5                           0                     0                     0   
AL627309.1                      0                     0                     0   
AL627309.3                 

As shown above, the dataset contains 39946 cells and 26841 genes.

In [5]:
# Load the per-cell metadata table and print it.
meta_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE155446/GSE155446_human_cell_metadata.csv.gz'
# The first CSV column (the cell barcode) is used as the row index.
meta = pd.read_csv(meta_path, index_col=0)

print(meta)

                       subgroup  UPN additional_info                fastq_id  \
cell                                                                           
753_AAACGAAGTCTTGGTA        GP4  753             NaN  Foreman_753_03142019_1   
753_AAACGCTTCCATGATG        GP4  753             NaN  Foreman_753_03142019_1   
753_AAACGCTTCGAGTGGA        GP4  753             NaN  Foreman_753_03142019_1   
753_AAAGAACCAAATGGTA        GP4  753             NaN  Foreman_753_03142019_1   
753_AAAGAACTCATTCATC        GP4  753             NaN  Foreman_753_03142019_1   
...                         ...  ...             ...                     ...   
966-2_TTTGGAGGTATGAGAT      GP4  966      recurrance                   966_2   
966-2_TTTGGTTCATACAGCT      GP4  966      recurrance                   966_2   
966-2_TTTGGTTGTCCGTACG      GP4  966      recurrance                   966_2   
966-2_TTTGTTGTCGCTGACG      GP4  966      recurrance                   966_2   
966-2_TTTGTTGTCGTCGATA      GP4  966    

In [4]:
# List the metadata columns available for each cell.
meta.columns

Index(['subgroup', 'UPN', 'additional_info', 'fastq_id', 'coarse_cell_type',
       'tumor_subpopulation', 'geo_sample_id'],
      dtype='object')

In [7]:
# Show the distinct values of each categorical metadata column
# ('fastq_id' is intentionally left out, as in the original inspection).
for column in ('subgroup', 'UPN', 'additional_info',
               'coarse_cell_type', 'tumor_subpopulation', 'geo_sample_id'):
    print(f"{column}: ", meta[column].unique())

subgroup:  ['GP4' 'SHH' 'GP3' 'WNT' 'GP3/4']
UPN:  [ 753  801  831  877  898  925  934  945  966  996 1028 1066 1070 1125
 1128 1130 1155 1167 1177 1195 1224 1235 1238 1325 1355 1397 1416 1433]
additional_info:  [nan 'repeated_sample' 'primary' 'recurrance']
coarse_cell_type:  ['malignant' 'lymphocytes' 'oligodendrocytes_astrocytes_other'
 'macrophage_monocytes']
tumor_subpopulation:  ['GP4-B1' 'GP4-C1' 'GP4-B2' 'GP4-A2' 'GP4-A1' 'GP4-X1' 'GP4-C2' nan
 'SHH-C1' 'SHH-A1' 'SHH-B1' 'SHH-B2' 'SHH-A2' 'SHH-C2' 'SHH-X1' 'GP3-B1'
 'GP3-A' 'GP3-C1' 'GP3-C2' 'GP3-B2' 'WNT' 'GP4-X2' 'GP4-X3']
geo_sample_id:  ['753' '801' '831' '877' '898' '925' '934' '934-repeat' '945' '966' '996'
 '1028' '1066' '1070' '1125' '1128' '1130' '1155' '1167' '1177' '1195'
 '1224' '1235' '1238' '1325' '1355' '1397' '1416' '1433' '966-recurrence']


In [9]:
# Number of distinct patients (UPN); missing values, if any, count as one value.
meta['UPN'].nunique(dropna=False)

28

In [10]:
# Number of distinct GEO samples; missing values, if any, count as one value.
meta['geo_sample_id'].nunique(dropna=False)

30

### 2. Overall AnnData object of the dataset

<span style="color:red">**IMPORTANT:**</span> transpose the DataFrame.values to match the AnnData.X

1. `DataFrame.columns`: cell barcodes, which go into `.obs`
2. `DataFrame.index`: gene names, which go into `.var`
3. `DataFrame.values`: the expression matrix (genes × cells), which must be transposed to cells × genes before it goes into `.X`

In [17]:
# Build one cells x genes AnnData object from the genes x cells count table.
# The values are transposed so that rows are cells (.obs) and columns are genes (.var).
matrix = scipy.sparse.csr_matrix(input.values.T)
obs_name = pd.DataFrame(index=input.columns)
# pd.DataFrame(index) keeps a positional row index and stores the gene symbols in a
# column named after the index ('gene'). The earlier rename of a 'Gene' column never
# matched anything (the printed object shows `var: 'gene'`), so it has been removed.
# The 'gene' column itself is kept: the confirmation section promotes it to the
# .var index.
var_name = pd.DataFrame(input.index)

sample = anndata.AnnData(X=matrix, obs=obs_name, var=var_name)
print(sample)

# Create an observation table, indexed by cell barcode, to hold the per-cell features
obs_metrics = pd.DataFrame(index=sample.obs_names)

# Clinical features from the paper, keyed by geo_sample_id: [age, sex]
# (age is presumably in years -- confirm against the paper)
age_sex_dict = {'753': [4, 'male'], '801': [4, 'male'], '831': [1, 'male'], '877': [3, 'female'], '898': [2, 'male'], '925': [3, 'male'], '934': [12, 'male'],
                '934-repeat': [12, 'male'], '945': [3, 'female'], '966': [6, 'female'], '996': [13, 'male'],'1028': [7, 'female'], '1066': [9, 'male'], '1070': [12, 'female'],
                '1125': [3, 'male'], '1128': [15, 'female'], '1130': [3, "male"], '1155': [6, 'male'], '1167': [12, 'female'], '1177': [10, 'male'], '1195': [10, 'male'],
                '1224': [1, 'male'], '1235': [4, 'female'], '1238': [7, 'female'], '1325': [0.4, 'female'], '1355': [6, 'male'], '1397': [11, 'male'], '1416': [4, 'male'],
                '1433': [3, 'male'], '966-recurrence': [6, 'female']}

# Load the cell metadata (left join: one row per cell, in the order of the count matrix)
obs_metrics = obs_metrics.merge(meta, how='left', left_index=True, right_index=True)

# Fail early with a readable message instead of an opaque KeyError from the lookup
# below when a cell has no metadata row or belongs to a sample without clinical data.
n_unmatched = int(obs_metrics['geo_sample_id'].isna().sum())
unknown_samples = sorted(set(obs_metrics['geo_sample_id'].dropna()) - set(age_sex_dict))
if n_unmatched or unknown_samples:
    raise ValueError(
        f"{n_unmatched} cell(s) have no metadata row; "
        f"sample id(s) missing from age_sex_dict: {unknown_samples}"
    )

# Per-sample clinical features, broadcast to every cell of that sample
obs_metrics['age'] = obs_metrics['geo_sample_id'].map({sid: feats[0] for sid, feats in age_sex_dict.items()})
obs_metrics['sex'] = obs_metrics['geo_sample_id'].map({sid: feats[1] for sid, feats in age_sex_dict.items()})

# Dataset-level annotations and harmonised identifiers
obs_metrics['cancer_type'] = obs_metrics['subgroup'] + '_' + 'medulloblastoma'
obs_metrics['dataset'] = 'GSE155446'
obs_metrics['tissue'] = 'brain'
obs_metrics['sample_barcode'] = 'GSE155446_' + obs_metrics['geo_sample_id']
obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

# Rename the columns that are kept and drop the source columns no longer needed
obs_metrics = (
    obs_metrics
    .rename(columns={'additional_info': 'disease_progression', 'coarse_cell_type': 'cell_type_from_paper'})
    .drop(columns=['subgroup', 'UPN', 'fastq_id', 'tumor_subpopulation', 'geo_sample_id'])
)

# Attach the table to the AnnData object and use the dataset-prefixed barcode as the
# cell index (the column is also kept, hence drop=False).
sample.obs = obs_metrics
sample.obs.set_index("uni_barcode", drop=False, inplace=True)
print(sample)

# save the anndata object
sample.write_h5ad('/scratch/user/s4543064/xiaohan-john-project/write/GSE155446/GSE155446_uni.h5ad', compression="gzip")



AnnData object with n_obs × n_vars = 39946 × 26841
    var: 'gene'
AnnData object with n_obs × n_vars = 39946 × 26841
    obs: 'disease_progression', 'cell_type_from_paper', 'age', 'sex', 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene'


In [18]:
# Inspect the gene annotation table of the freshly built object.
sample.var

Unnamed: 0,gene
0,MIR1302-2HG
1,OR4F5
2,AL627309.1
3,AL627309.3
4,AL732372.1
...,...
26836,AC007325.2
26837,AL354822.1
26838,AC023491.2
26839,AC004556.1


In [19]:
# Inspect the per-cell annotation table of the freshly built object.
sample.obs

Unnamed: 0_level_0,disease_progression,cell_type_from_paper,age,sex,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GSE155446_753_AAACGAAGTCTTGGTA,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAACGAAGTCTTGGTA
GSE155446_753_AAACGCTTCCATGATG,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAACGCTTCCATGATG
GSE155446_753_AAACGCTTCGAGTGGA,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAACGCTTCGAGTGGA
GSE155446_753_AAAGAACCAAATGGTA,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAAGAACCAAATGGTA
GSE155446_753_AAAGAACTCATTCATC,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAAGAACTCATTCATC
...,...,...,...,...,...,...,...,...,...
GSE155446_966-2_TTTGGAGGTATGAGAT,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGGAGGTATGAGAT
GSE155446_966-2_TTTGGTTCATACAGCT,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGGTTCATACAGCT
GSE155446_966-2_TTTGGTTGTCCGTACG,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGGTTGTCCGTACG
GSE155446_966-2_TTTGTTGTCGCTGACG,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGTTGTCGCTGACG


### 3. Confirmation of created AnnData object

In [2]:
# Read the saved .h5ad file back from disk and print its summary.
output = '/scratch/user/s4543064/xiaohan-john-project/write/GSE155446/GSE155446_uni.h5ad'
sample = anndata.read_h5ad(output)
print(sample)

AnnData object with n_obs × n_vars = 39946 × 26841
    obs: 'disease_progression', 'cell_type_from_paper', 'age', 'sex', 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene'


In [3]:
# Promote the gene symbols from the 'gene' column to the .var index and name that
# index 'gene_symbols'.
# The guard makes the cell safe to re-run: once the column has been moved into the
# index it no longer exists, and an unguarded set_index('gene') raises KeyError.
# The new frame is assigned through the AnnData setter instead of being mutated in place.
if 'gene' in sample.var.columns:
    sample.var = sample.var.set_index('gene').rename_axis('gene_symbols')

sample

AnnData object with n_obs × n_vars = 39946 × 26841
    obs: 'disease_progression', 'cell_type_from_paper', 'age', 'sex', 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'

In [6]:
# Write the object back to the same .h5ad path, gzip-compressed.
sample.write_h5ad(output, compression="gzip")

In [4]:
# Inspect the gene annotation table; the gene symbols are now the index.
sample.var

MIR1302-2HG
OR4F5
AL627309.1
AL627309.3
AL732372.1
...
AC007325.2
AL354822.1
AC023491.2
AC004556.1
AC240274.1


In [5]:
# Inspect the per-cell annotation table of the reloaded object.
sample.obs

Unnamed: 0_level_0,disease_progression,cell_type_from_paper,age,sex,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GSE155446_753_AAACGAAGTCTTGGTA,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAACGAAGTCTTGGTA
GSE155446_753_AAACGCTTCCATGATG,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAACGCTTCCATGATG
GSE155446_753_AAACGCTTCGAGTGGA,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAACGCTTCGAGTGGA
GSE155446_753_AAAGAACCAAATGGTA,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAAGAACCAAATGGTA
GSE155446_753_AAAGAACTCATTCATC,,malignant,4.0,male,GP4_medulloblastoma,GSE155446,brain,GSE155446_753,GSE155446_753_AAAGAACTCATTCATC
...,...,...,...,...,...,...,...,...,...
GSE155446_966-2_TTTGGAGGTATGAGAT,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGGAGGTATGAGAT
GSE155446_966-2_TTTGGTTCATACAGCT,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGGTTCATACAGCT
GSE155446_966-2_TTTGGTTGTCCGTACG,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGGTTGTCCGTACG
GSE155446_966-2_TTTGTTGTCGCTGACG,recurrance,malignant,6.0,female,GP4_medulloblastoma,GSE155446,brain,GSE155446_966-recurrence,GSE155446_966-2_TTTGTTGTCGCTGACG


### 4. Convert AnnData objects to SingleCellExperiment objects

In [7]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE155446')

# Collect the unified AnnData files up front. The .rds outputs are written into this
# same directory, so the listing is materialised (and sorted, for a stable order)
# before anything is written. Matching on the file-name suffix avoids picking up
# files that merely contain "_uni.h5ad" somewhere in their name.
h5ad_files = sorted(write_directory.glob('*_uni.h5ad'))

# Convert each AnnData file to a SingleCellExperiment and save it next to the source
for file in h5ad_files:
    sample_name = file.stem
    sample_anndata = anndata.read_h5ad(file)
    # Cast the matrix to float32 before handing it to R
    sample_anndata.X = sample_anndata.X.astype('float32')
    sample_sce_file = sample_name + ".rds"

    with localconverter(anndata2ri.converter):
        sample_sce = anndata2ri.py2rpy(sample_anndata)
    print(sample_sce)

    # Save the sce object in .rds file.
    # The object is still exposed in the R global environment as `sample_sce`, as
    # before. saveRDS is called as an R function with the path passed as an argument,
    # so the path is never spliced into R source code (quotes or backslashes in the
    # path would otherwise break the generated R statement).
    robjects.globalenv["sample_sce"] = sample_sce
    sample_sce_path = write_directory / sample_sce_file
    robjects.r["saveRDS"](sample_sce, file=str(sample_sce_path))

class: SingleCellExperiment 
dim: 26841 39946 
metadata(0):
assays(1): X
rownames(26841): MIR1302-2HG OR4F5 ... AC004556.1 AC240274.1
rowData names(0):
colnames(39946): GSE155446_753_AAACGAAGTCTTGGTA
  GSE155446_753_AAACGCTTCCATGATG ... GSE155446_966-2_TTTGTTGTCGCTGACG
  GSE155446_966-2_TTTGTTGTCGTCGATA
colData names(9): disease_progression cell_type_from_paper ...
  sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

