### 1. General info of dataset GSE70630

This is the Jupyter Notebook for dataset GSE70630. Its dataset includes an overall big txt file. As seen below, in the txt file, each row is a gene and each column is a cell.

Thus, we need to transform this txt file and generate an overall AnnData object for all samples. 



In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

In [2]:
# inspect the dataset
path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE70630/GSE70630_OG_processed_data_v2.txt'
input = pd.read_csv(path, sep='\t', index_col=0) # the first column contains gene names and is the index

print(input.head()) 
print(input.shape) # (23686 rows, 4347 columns)

            MGH36_P6_A12  MGH36_P6_H09  MGH53_P4_G04  MGH36_P10_G12  \
'A1BG'            0.0000         0.000        0.0000         0.0000   
'A1BG-AS1'        0.0000         0.000        0.0000         0.0000   
'A1CF'            0.0000         0.000        0.0000         0.0000   
'A2M'             5.7056         4.437        8.0276         5.6288   
'A2M-AS1'         0.0000         0.000        4.5347         0.0000   

            MGH53_P2_H12  MGH53_P4_D10  MGH53_P4_D01  MGH36_P6_B07  \
'A1BG'           0.00000        0.0000        0.0000        0.0000   
'A1BG-AS1'       0.00000        0.0000        0.0000        0.0000   
'A1CF'           0.02148        0.0000        0.0000        0.0000   
'A2M'            0.00000        3.3367        8.7811        8.3271   
'A2M-AS1'        0.00000        0.0000        0.0000        3.3361   

            MGH36_P10_B12  MGH53_P2_G11  ...  93_P10_H06  93_P8_B12  \
'A1BG'            0.00000       0.00000  ...         0.0        0.0   
'A1BG-AS1'

<span style="color:red">**PROBLEM:**</span> the gene names are stored as 'GENE_SYMBOL' (with a quotation mark)

In [3]:
# Get rid of the extra quotation marks for gene symbols
input.index = [gene[1:-1] for gene in input.index]
print(input.head()) 

          MGH36_P6_A12  MGH36_P6_H09  MGH53_P4_G04  MGH36_P10_G12  \
A1BG            0.0000         0.000        0.0000         0.0000   
A1BG-AS1        0.0000         0.000        0.0000         0.0000   
A1CF            0.0000         0.000        0.0000         0.0000   
A2M             5.7056         4.437        8.0276         5.6288   
A2M-AS1         0.0000         0.000        4.5347         0.0000   

          MGH53_P2_H12  MGH53_P4_D10  MGH53_P4_D01  MGH36_P6_B07  \
A1BG           0.00000        0.0000        0.0000        0.0000   
A1BG-AS1       0.00000        0.0000        0.0000        0.0000   
A1CF           0.02148        0.0000        0.0000        0.0000   
A2M            0.00000        3.3367        8.7811        8.3271   
A2M-AS1        0.00000        0.0000        0.0000        3.3361   

          MGH36_P10_B12  MGH53_P2_G11  ...  93_P10_H06  93_P8_B12  93_P8_D09  \
A1BG            0.00000       0.00000  ...         0.0        0.0        0.0   
A1BG-AS1        

### 2. Overall AnnData object of the dataset

<span style="color:red">**IMPORTANT:**</span> transpose the DataFrame.values to match the AnnData.X

1. `DataFrame.columns`: cell barcodes, which go into `.obs`
2. `DataFrame.index`: gene names, `.var`
3. `DataFrame.values`: the transpose of the expression matrix, `.X`

In [6]:
matrix = scipy.sparse.csr_matrix(input.values.T)
obs_name = pd.DataFrame(index=input.columns)
var_name = pd.DataFrame(input.index, columns=['gene_symbols'])

sample = anndata.AnnData(X=matrix, obs=obs_name, var=var_name)

# Create an observation metric info to store related features
obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers

obs_metrics['cancer_type'] = 'IDH-Mutation_glioma'
obs_metrics['dataset'] = 'GSE70630'
obs_metrics['tissue'] = 'brain'
obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics.index.astype(str)

# Add the obs_metrics to the AnnData object
sample.obs = obs_metrics
sample.obs.set_index("uni_barcode", drop=False, inplace=True)
print(sample.obs)

# save the anndata object
sample.write_h5ad('/scratch/user/s4543064/xiaohan-john-project/write/GSE70630/GSE70630_OG_processed_data_v2_uni.h5ad', compression="gzip")



                                cancer_type   dataset tissue  \
uni_barcode                                                    
GSE70630_MGH36_P6_A12   IDH-Mutation_glioma  GSE70630  brain   
GSE70630_MGH36_P6_H09   IDH-Mutation_glioma  GSE70630  brain   
GSE70630_MGH53_P4_G04   IDH-Mutation_glioma  GSE70630  brain   
GSE70630_MGH36_P10_G12  IDH-Mutation_glioma  GSE70630  brain   
GSE70630_MGH53_P2_H12   IDH-Mutation_glioma  GSE70630  brain   
...                                     ...       ...    ...   
GSE70630_93_P8_H06      IDH-Mutation_glioma  GSE70630  brain   
GSE70630_93_P9_C07      IDH-Mutation_glioma  GSE70630  brain   
GSE70630_93_P8_A12      IDH-Mutation_glioma  GSE70630  brain   
GSE70630_93_P8_C01      IDH-Mutation_glioma  GSE70630  brain   
GSE70630_93_P9_F06      IDH-Mutation_glioma  GSE70630  brain   

                                   uni_barcode  
uni_barcode                                     
GSE70630_MGH36_P6_A12    GSE70630_MGH36_P6_A12  
GSE70630_MGH36_P6_H0

### 3. Confirmation of created AnnData object

In [16]:
output = '/scratch/user/s4543064/xiaohan-john-project/write/GSE70630/GSE70630_OG_processed_data_v2_uni.h5ad'
sample = anndata.read_h5ad(output)
print(sample)

AnnData object with n_obs × n_vars = 4347 × 23686
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode', 'sample_barcode'
    var: 'gene_symbols'


In [3]:
sample.obs['sample_barcode'] = 'GSE70630'
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,uni_barcode,sample_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE70630_MGH36_P6_A12,IDH-Mutation_glioma,GSE70630,brain,GSE70630_MGH36_P6_A12,GSE70630
GSE70630_MGH36_P6_H09,IDH-Mutation_glioma,GSE70630,brain,GSE70630_MGH36_P6_H09,GSE70630
GSE70630_MGH53_P4_G04,IDH-Mutation_glioma,GSE70630,brain,GSE70630_MGH53_P4_G04,GSE70630
GSE70630_MGH36_P10_G12,IDH-Mutation_glioma,GSE70630,brain,GSE70630_MGH36_P10_G12,GSE70630
GSE70630_MGH53_P2_H12,IDH-Mutation_glioma,GSE70630,brain,GSE70630_MGH53_P2_H12,GSE70630
...,...,...,...,...,...
GSE70630_93_P8_H06,IDH-Mutation_glioma,GSE70630,brain,GSE70630_93_P8_H06,GSE70630
GSE70630_93_P9_C07,IDH-Mutation_glioma,GSE70630,brain,GSE70630_93_P9_C07,GSE70630
GSE70630_93_P8_A12,IDH-Mutation_glioma,GSE70630,brain,GSE70630_93_P8_A12,GSE70630
GSE70630_93_P8_C01,IDH-Mutation_glioma,GSE70630,brain,GSE70630_93_P8_C01,GSE70630


In [4]:
# save the anndata object
sample.write_h5ad('/scratch/user/s4543064/xiaohan-john-project/write/GSE70630/GSE70630_OG_processed_data_v2_uni.h5ad', compression="gzip")

### 4. Convert AnnData objects to SingleCellExperiment objects

In [5]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE70630')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        # print(sample_sce)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

In [6]:
print(sample_sce)

class: SingleCellExperiment 
dim: 23686 4347 
metadata(0):
assays(1): X
rownames(23686): 0 1 ... 23684 23685
rowData names(1): gene_symbols
colnames(4347): GSE70630_MGH36_P6_A12 GSE70630_MGH36_P6_H09 ...
  GSE70630_93_P8_C01 GSE70630_93_P9_F06
colData names(5): cancer_type dataset tissue uni_barcode sample_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

