### 1. General info of dataset GSE243347

This is the Jupyter Notebook for dataset GSE243347. Its dataset includes a txt file for each sample and an overall data annotation txt file. As seen below, in the txt file, each row is a gene and each column is a cell.

Thus, we need to transform this txt file and generate an AnnData object for each sample. Overall, there are 27 samples. 



In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

In [23]:
# inspect the dataset
path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE243347/GSM7783809_TM770.transcripts.txt.gz'
input = pd.read_csv(path, sep='\t', index_col=0) # the first column contains gene names and is the index

print(input) 
print(input.shape) # (23686 rows, 4058 columns)

                             UNK   A1    A2    A3    A4    A5    A6    A7  \
GENEID                                                                      
ENSG00000000003__TSPAN6      4.0  0.0   0.0   0.0   0.0   1.0   0.0   0.0   
ENSG00000000005__TNMD        0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   
ENSG00000000419__DPM1        6.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   
ENSG00000000457__SCYL3       3.0  0.0   4.0   0.0   0.0   0.0   1.0   0.0   
ENSG00000000460__C1orf112    1.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   
...                          ...  ...   ...   ...   ...   ...   ...   ...   
ERCC-00164                   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   
ERCC-00165                  10.0  0.0   0.0   1.0   0.0   0.0   0.0   0.0   
ERCC-00168                   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   
ERCC-00170                   3.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0   
ERCC-00171                 646.4  5.0  28.1  21.1  18.0  23.1  35.2  37.2   

In [4]:
input.values

array([[  4. ,   0. ,   0. , ...,   0. ,   1. ,   0. ],
       [  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [  6. ,   0. ,   0. , ...,   0. ,   3. ,   0. ],
       ...,
       [  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [  3. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [646.4,   5. ,  28.1, ...,  10. ,  28.1,  32.1]])

In [68]:
# inspect the data annotation file
annotation_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE243347/GSE243347_data_annotation.txt.gz'
annotation = pd.read_csv(annotation_path, sep='\t', index_col=1)

annotation

Unnamed: 0_level_0,plate_id,empty_wells,selected_wells,sample_id,patient_id,tumor_type,sample_type,sample_treatment_timepoint,sample_body_site,fusion_status,fusion_type,sort,sample_processing_sort,Note
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TM338.transcripts.txt,TM338,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-024,ES-024,Ewing,Resection,Post,Femur,Pos,EWS-FLI1,Live,Fresh,
TM339.transcripts.txt,TM339,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-024,ES-024,Ewing,Resection,Post,Femur,Pos,EWS-FLI1,Live,Fresh,
TM344.transcripts.txt,TM344,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-026,ES-016,Ewing,Resection,Post,Femur,Pos,EWS-FLI1,Live,Fresh,
TM348.transcripts.txt,TM348,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-026,ES-016,Ewing,Resection,Post,Femur,Pos,EWS-FLI1,Live,Fresh,
TM416.transcripts.txt,TM416,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-025,ES-025,Ewing,Biopsy,Pre,Fibula,Pos,EWS-FLI1,Live,Fresh,
TM417.transcripts.txt,TM417,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-025,ES-025,Ewing,Biopsy,Pre,Fibula,Pos,EWS-FLI1,Live,Fresh,
TM424.transcripts.txt,TM424,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-016-biopsy,ES-016,Ewing,Biopsy,Pre,Femur,Pos,EWS-FLI1,CD45-,Cryopreserved,
TM425.transcripts.txt,TM425,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-016-meta,ES-016,Ewing,Metastasis,Pre,Lymph node,Pos,EWS-FLI1,CD45-,Cryopreserved,
TM505.transcripts.txt,TM505,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-027,ES-027,Ewing,Biopsy,Pre,Fibula and femur,Pos,EWS-FLI1,CD45-,Fresh,
TM506.transcripts.txt,TM506,"A1,B1,C1,D1,E1,F1,G1,H1,I1,J1,K1,L1,M1,N1,O1,P1",,ES-027,ES-027,Ewing,Biopsy,Pre,Fibula and femur,Pos,EWS-FLI1,Live,Fresh,


In [69]:
annotation.shape # but we only have 27 samples

(28, 14)

In [75]:
duplicated_rows = annotation[annotation.index.duplicated(keep=False)]
duplicated_rows[['sample_id', 'sample_body_site']]

Unnamed: 0_level_0,sample_id,sample_body_site
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
TM736.transcripts.txt,ES-039,Rib
TM736.transcripts.txt,ES-030,Tibia


In [8]:
annotation.columns

Index(['plate_id', 'file_name', 'empty_wells', 'selected_wells', 'sample_id',
       'patient_id', 'tumor_type', 'sample_type', 'sample_treatment_timepoint',
       'sample_body_site', 'fusion_status', 'fusion_type', 'sort',
       'sample_processing_sort', 'Note'],
      dtype='object')

In [45]:
useful_columns = ['sample_id', 'tumor_type', 'sample_body_site']
annotation = annotation[useful_columns]

annotation

Unnamed: 0_level_0,sample_id,tumor_type,sample_body_site
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TM338.transcripts.txt,ES-024,Ewing,Femur
TM339.transcripts.txt,ES-024,Ewing,Femur
TM344.transcripts.txt,ES-026,Ewing,Femur
TM348.transcripts.txt,ES-026,Ewing,Femur
TM416.transcripts.txt,ES-025,Ewing,Fibula
TM417.transcripts.txt,ES-025,Ewing,Fibula
TM424.transcripts.txt,ES-016-biopsy,Ewing,Femur
TM425.transcripts.txt,ES-016-meta,Ewing,Lymph node
TM505.transcripts.txt,ES-027,Ewing,Fibula and femur
TM506.transcripts.txt,ES-027,Ewing,Fibula and femur


### 2. Overall AnnData object of the dataset

<span style="color:red">**IMPORTANT:**</span> transpose the DataFrame.values to match the AnnData.X

1. `DataFrame.columns`: cell barcodes, which go into `.obs`
2. `DataFrame.index`: gene names, `.var`
3. `DataFrame.values`: the transpose of the expression matrix, `.X`

In [96]:
import warnings
from pathlib import Path

# Specify directory paths
data_directory = Path('/scratch/user/s4543064/xiaohan-john-project/data/GSE243347')
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE243347')

# Loop through all files in the directory
for sample_path in data_directory.iterdir():
    if 'transcripts.txt.gz' in sample_path.name:
        gsm_code, file_name = sample_path.stem.split('_')[:2]
        sample_h5ad = sample_path.stem.split('.transcripts.txt')[0] + '_uni.h5ad'
        
        input = pd.read_csv(sample_path, sep='\t', index_col=0) # the first column contains gene names and is the index
        gene_ids = [x.split('__')[0] if '__' in x else np.nan for x in input.index.tolist()]
        gene_symbols = [x.split('__')[1] if '__' in x else x.split('__')[0] for x in input.index.tolist()]

        matrix = scipy.sparse.csr_matrix(input.values.T)
        obs_name = pd.DataFrame(index=input.columns)
        var_name = pd.DataFrame({'gene_ids': gene_ids}, index=gene_symbols)
        var_name.rename_axis('gene_symbols', inplace=True)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            sample = anndata.AnnData(X=matrix, obs=obs_name, var=var_name)

        # Create an observation metric info to store related features
        obs_metrics = pd.DataFrame(index=sample.obs_names) ## Get the identifiers
        obs_metrics['cancer_type'] = 'Ewing_sarcoma'
        obs_metrics['dataset'] = 'GSE243347'

        if file_name == 'TM736.transcripts.txt':
            tissue = '/'.join(annotation.loc[file_name, 'sample_body_site'])
            sample_barcode = '/'.join(annotation.loc[file_name, 'sample_id'])

            obs_metrics['tissue'] = tissue
            obs_metrics['sample_barcode'] = sample_barcode
        else:
            obs_metrics['tissue'] = annotation.loc[file_name, 'sample_body_site']
            obs_metrics['sample_barcode'] = gsm_code + '_' + annotation.loc[file_name, 'sample_id']

        obs_metrics['uni_barcode'] = obs_metrics['dataset'] + '_' + obs_metrics['sample_barcode'] + '_' + obs_metrics.index.astype(str)

        sample.obs = obs_metrics
        sample.obs.set_index("uni_barcode", drop=False, inplace=True)
        print(sample)

        # save the anndata object
        output_path = write_directory / sample_h5ad
        sample.write_h5ad(output_path, compression="gzip")

AnnData object with n_obs × n_vars = 385 × 22190
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 385 × 21691
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 385 × 25662
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 385 × 24028
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 385 × 21867
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 385 × 23379
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'
AnnData object with n_obs × n_vars = 385 × 26740
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode

In [97]:
sample.var

Unnamed: 0_level_0,gene_ids
gene_symbols,Unnamed: 1_level_1
TSPAN6,ENSG00000000003
TNMD,ENSG00000000005
DPM1,ENSG00000000419
SCYL3,ENSG00000000457
C1orf112,ENSG00000000460
...,...
ERCC-00164,
ERCC-00165,
ERCC-00168,
ERCC-00170,


In [98]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE243347_GSM7783799_ES-006-L2_UNK,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_UNK
GSE243347_GSM7783799_ES-006-L2_A1,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_A1
GSE243347_GSM7783799_ES-006-L2_A2,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_A2
GSE243347_GSM7783799_ES-006-L2_A3,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_A3
GSE243347_GSM7783799_ES-006-L2_A4,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_A4
...,...,...,...,...,...
GSE243347_GSM7783799_ES-006-L2_P20,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_P20
GSE243347_GSM7783799_ES-006-L2_P21,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_P21
GSE243347_GSM7783799_ES-006-L2_P22,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_P22
GSE243347_GSM7783799_ES-006-L2_P23,Ewing_sarcoma,GSE243347,Rib,GSM7783799_ES-006-L2,GSE243347_GSM7783799_ES-006-L2_P23


### 3. Confirmation of created AnnData object

In [99]:
output = '/scratch/user/s4543064/xiaohan-john-project/write/GSE243347/GSM7783807_TM736_uni.h5ad'
sample = anndata.read_h5ad(output)
print(sample)

AnnData object with n_obs × n_vars = 385 × 19504
    obs: 'cancer_type', 'dataset', 'tissue', 'sample_barcode', 'uni_barcode'
    var: 'gene_ids'


  utils.warn_names_duplicates("var")


In [100]:
sample.var

Unnamed: 0_level_0,gene_ids
gene_symbols,Unnamed: 1_level_1
DPM1,ENSG00000000419
SCYL3,ENSG00000000457
C1orf112,ENSG00000000460
FGR,ENSG00000000938
CFH,ENSG00000000971
...,...
ERCC-00164,
ERCC-00165,
ERCC-00168,
ERCC-00170,


In [101]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,sample_barcode,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE243347_ES-039/ES-030_UNK,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_UNK
GSE243347_ES-039/ES-030_A1,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_A1
GSE243347_ES-039/ES-030_A2,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_A2
GSE243347_ES-039/ES-030_A3,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_A3
GSE243347_ES-039/ES-030_A4,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_A4
...,...,...,...,...,...
GSE243347_ES-039/ES-030_P20,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_P20
GSE243347_ES-039/ES-030_P21,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_P21
GSE243347_ES-039/ES-030_P22,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_P22
GSE243347_ES-039/ES-030_P23,Ewing_sarcoma,GSE243347,Rib/Tibia,ES-039/ES-030,GSE243347_ES-039/ES-030_P23


### 4. Convert AnnData objects to SingleCellExperiment objects

In [102]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE243347')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))

  utils.warn_names_duplicates("var")
       'EMG1', 'LINC01238', 'LINC01238',
       ...
       'Y_RNA', 'Y_RNA', 'Y_RNA', 'Y_RNA', 'Y_RNA', 'Y_RNA', 'Y_RNA', 'snoU13',
       'snoU13', 'snoU13'],
      dtype='object', name='gene_symbols', length=194)
  return dispatch(args[0].__class__)(*args, **kw)
  utils.warn_names_duplicates("var")
       'DGCR5', 'DNAJC9-AS1', 'DNAJC9-AS1',
       ...
       'Y_RNA', 'Y_RNA', 'Y_RNA', 'Y_RNA', 'snoU13', 'snoU13', 'snoU13',
       'uc_338', 'uc_338', 'uc_338'],
      dtype='object', name='gene_symbols', length=232)
  return dispatch(args[0].__class__)(*args, **kw)
  utils.warn_names_duplicates("var")
       'LINC01238', 'LINC01238',
       ...
       'snoU13', 'snoU13', 'snoU13', 'snoU13', 'snoU13', 'snoU13', 'snoU13',
       'uc_338', 'uc_338', 'uc_338'],
      dtype='object', name='gene_symbols', length=365)
  return dispatch(args[0].__class__)(*args, **kw)
  utils.warn_names_duplicates("var")
       'CYB561D2', 'LINC01481', 'LINC01481',
       

In [103]:
print(sample_sce)

class: SingleCellExperiment 
dim: 23642 385 
metadata(0):
assays(1): X
rownames(23642): TSPAN6 TNMD ... ERCC-00170 ERCC-00171
rowData names(1): gene_ids
colnames(385): GSE243347_GSM7783795_ES-030_UNK
  GSE243347_GSM7783795_ES-030_A1 ... GSE243347_GSM7783795_ES-030_P23
  GSE243347_GSM7783795_ES-030_P24
colData names(5): cancer_type dataset tissue sample_barcode uni_barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

