This notebook takes gene expression data along with its metadata and converts it to an h5ad file that the Geneformer tokenizer uses to create the dataset for Geneformer inference. Gene names are converted to Ensembl IDs via Ensembl BioMart; the set of gene names and Ensembl IDs is saved in 'mart_export.txt'. 

In [1]:
import scipy.sparse as sparse
import anndata as ad
import scanpy as sc
from geneformerCopy import TranscriptomeTokenizer
import pandas as pd
import numpy as np

In [2]:
# Location of the source .h5ad file that the next cell loads.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
input_file='C:/Users/zainsamadi/Downloads/Endothelium.h5ad'

In [3]:
# Load the source AnnData object; the input is an .h5ad file, so use the h5ad reader directly.
adata = sc.read_h5ad(input_file)

In [4]:
# Show the AnnData summary (dimensions and annotation columns) of the loaded file.
adata

AnnData object with n_obs × n_vars = 73195 × 61759
    obs: 'donor_id', 'tissue_in_publication', 'anatomical_position', 'method', 'cdna_plate', 'library_plate', 'notes', 'cdna_well', 'assay_ontology_term_id', 'sample_id', 'replicate', '10X_run', 'ambient_removal', 'donor_method', 'donor_assay', 'donor_tissue', 'donor_tissue_assay', 'cell_type_ontology_term_id', 'compartment', 'broad_cell_class', 'free_annotation', 'manually_annotated', 'published_2022', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ercc', 'pct_counts_ercc', '_scvi_batch', '_scvi_labels', 'scvi_leiden_donorassay_full', 'ethnicity_original', 'scvi_leiden_res05_compartment', 'sample_number', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'disease_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 

In [5]:
# List the distinct tissue labels; this column is later copied into the 'organ' annotation.
adata.obs['tissue'].unique()

['lymph node', 'thymus', 'liver', 'heart', 'lung', ..., 'buccal mucosa', 'cornea', 'jejunum', 'endocrine pancreas', 'eyelid']
Length: 69
Categories (69, object): ['endocrine pancreas', 'exocrine pancreas', 'lymph node', 'large intestine', ..., 'anterior part of tongue', 'posterior part of tongue', 'bladder organ', 'chorioretinal region']

In [6]:
# The row labels of var hold the Ensembl gene IDs; look at them through the column's index.
adata.var['feature_reference'].index

Index(['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419',
       'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938',
       'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084',
       'ENSG00000001167',
       ...
       'ENSG00000290126', 'ENSG00000290127', 'ENSG00000290146',
       'ENSG00000290147', 'ENSG00000290149', 'ENSG00000290162',
       'ENSG00000290163', 'ENSG00000290164', 'ENSG00000290165',
       'ENSG00000290166'],
      dtype='object', name='ensg', length=61759)

In [7]:
# Drop cells expressing fewer than 5 genes. This modifies `adata` in place.
# In the recorded run n_obs is 73195 both before and after, so no cell was removed.
sc.pp.filter_cells(adata, min_genes=5, inplace=True)

In [8]:
# Build the AnnData object that will be handed to the Geneformer tokenizer.
#
# The tokenizer is meant to be fed raw (un-normalised) counts, but adata.X in this file
# holds transformed values: its maximum is a non-integer (~8.86) while
# obs['total_counts'] reaches ~2.9e7. Prefer the matrix stored in adata.raw when it
# exists, re-ordered to adata's genes so that var annotations taken from adata.var
# still line up column by column. Fall back to adata.X for files that carry no .raw.
# NOTE(review): confirm that adata.raw really holds raw counts for this dataset.
if adata.raw is not None:
    counts_matrix = adata.raw[:, adata.var_names].X
else:
    counts_matrix = adata.X
adatan = ad.AnnData(counts_matrix)

In [9]:

# Per-cell 0/1 flag: 0 where the tissue label is 'unknown', 1 otherwise.
# Cells flagged 0 are the ones to be discarded in Geneformer.
# NOTE(review): an earlier, commented-out line read cell types from `expr_df`; confirm
# whether 'unknown' is meant to be matched against the tissue or the cell-type annotation.
filter_pass = adata.obs['tissue'].ne('unknown').astype(int).tolist()

In [10]:
# Largest value in the expression matrix; a small non-integer value here means
# X holds transformed values rather than raw counts.
adata.X.max()

np.float32(8.859213)

In [11]:

# Attach the per-cell 0/1 flag computed earlier as an obs column of the new object.
adatan.obs['filter_pass']=filter_pass



In [12]:
# Copy the per-cell annotations needed downstream from the source object.
# Keys are the new column names in adatan.obs; values are the source columns in adata.obs.
obs_column_map = {
    "cell_type": "free_annotation",
    "organ": "tissue",
    "n_counts": "total_counts",
}
for target_col, source_col in obs_column_map.items():
    # .values drops the source index so the assignment is positional, not label-aligned.
    adatan.obs[target_col] = adata.obs[source_col].values

# The index of adata.var carries the Ensembl gene IDs; store them as a regular var column.
adatan.var["ensembl_id"] = adata.var['feature_reference'].index



In [13]:
# Largest per-cell total count that was copied into the new object.
adatan.obs['n_counts'].max()

np.float64(28981910.0)

In [14]:
# Number of cells (observations) in the new object.
adatan.n_obs

73195

In [15]:
# Show the summary of the new object to confirm the obs/var columns that were added.
adatan

AnnData object with n_obs × n_vars = 73195 × 61759
    obs: 'filter_pass', 'cell_type', 'organ', 'n_counts'
    var: 'ensembl_id'

In [16]:
# Inspect the Ensembl ID column that was attached to var.
adatan.var['ensembl_id']

0        ENSG00000000003
1        ENSG00000000005
2        ENSG00000000419
3        ENSG00000000457
4        ENSG00000000460
              ...       
61754    ENSG00000290162
61755    ENSG00000290163
61756    ENSG00000290164
61757    ENSG00000290165
61758    ENSG00000290166
Name: ensembl_id, Length: 61759, dtype: object

In [17]:
# Save the prepared object as an .h5ad file for the tokenizer to read.
# NOTE(review): the file name is relative, so where it lands depends on the working
# directory — presumably that must be the directory later passed to the tokenizer; confirm.
sc.write("endo_gex.h5ad", adatan)

In [18]:
# Re-display the summary of the object that was just written to disk.
adatan

AnnData object with n_obs × n_vars = 73195 × 61759
    obs: 'filter_pass', 'cell_type', 'organ', 'n_counts'
    var: 'ensembl_id'

In [21]:
# Tokenize the prepared .h5ad file with the Geneformer tokenizer.
#
# No gene_median_file / token_dictionary_file / gene_mapping_file arguments are passed,
# so the tokenizer's own defaults apply (the original note describes them as the
# default 95M files).
# An earlier variant of this cell used nproc=16, pointed at the parent "Geneformer"
# directory and passed the in-memory object through an `adatain=adatan` argument.
#
# Presumably the dict maps obs column names to the attribute names kept in the
# tokenized dataset — TODO confirm against the tokenizer in geneformerCopy.
custom_attr_names = {"cell_type": "cell_type", "organ": "organ"}
tk = TranscriptomeTokenizer(custom_attr_names, nproc=1)

# Positional arguments: input directory, output directory name, output prefix.
tokenized_datasets = tk.tokenize_data(
    "C:/Users/zainsamadi/pyprojects/Geneformer/examples",
    "endo_gex_tokens",
    "endo",
    file_format="h5ad",
)

Tokenizing C:\Users\zainsamadi\pyprojects\Geneformer\examples\endo_gex.h5ad


100%|████████████████████████████████████████████████████████████████████████████████| 143/143 [01:05<00:00,  2.17it/s]
  getattr(self, attr).index = value
  utils.warn_names_duplicates("var")
  for i in adata.var["ensembl_id_collapsed"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id_collapsed"][coding_miRNA_loc]


Creating dataset.
