In [1]:

!pip install helical

!pip install datasets --upgrade

Collecting helical
  Downloading helical-0.0.1a16-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.29.3 (from helical)
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Collecting anndata==0.10.7 (from helical)
  Downloading anndata-0.10.7-py3-none-any.whl.metadata (6.6 kB)
Collecting azure-core==1.30.1 (from helical)
  Downloading azure_core-1.30.1-py3-none-any.whl.metadata (37 kB)
Collecting azure-identity==1.16.0 (from helical)
  Downloading azure_identity-1.16.0-py3-none-any.whl.metadata (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-storage-blob==12.19.1 (from helical)
  Downloading azure_storage_blob-12.19.1-py3-none

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hTraceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3061, in _parsed_pkg_info
    return self._pkg_info
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise 

In [1]:


from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import anndata as ad
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from scipy.sparse import lil_matrix
import torch.optim as optim
from helical.models.scgpt.model import scGPT, scGPTConfig
from helical.models.geneformer.model import Geneformer, GeneformerConfig
from copy import deepcopy
from torch.nn.functional import one_hot

INFO:datasets:PyTorch version 2.5.1+cu121 available.
INFO:datasets:Polars version 1.9.0 available.
INFO:datasets:TensorFlow version 2.17.1 available.
INFO:datasets:JAX version 0.4.33 available.
INFO:helical:Caduceus not available: If you want to use this model, ensure you have a CUDA GPU and have installed the optional helical[mamba-ssm] dependencies.


In [2]:
from datasets import load_dataset
ds = load_dataset("helical-ai/yolksac_human",trust_remote_code=True, split="train[:65%]",download_mode="reuse_cache_if_exists")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/553M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
observation_columns = [obs for obs in list(ds.features.keys()) if not obs == 'raw_counts']
obs_data = pd.DataFrame(ds.select_columns(observation_columns).data.to_pandas(),columns=observation_columns)
lil = lil_matrix((len(ds),ds[0]['size']))
lil.data = np.array(ds['raw_counts'],dtype="object")
lil.rows = np.array(ds['rows'],dtype="object")
adata = ad.AnnData(lil.tocsr(),obs=obs_data)
adata.var_names = ds.features['raw_counts'].id.split(",")
adata.var['gene_name'] = adata.var_names.str.upper()

In [4]:
# get labels: the celltype
num_types = adata.obs["LVL1"].unique().shape[0]
id2type = dict(enumerate(adata.obs["LVL1"].astype("category").cat.categories))

celltypes_labels = np.array(adata.obs["LVL1"].tolist())

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
scgpt_config = scGPTConfig(batch_size=50, device=device)
scgpt = scGPT(configurer = scgpt_config)
print(adata.var.index)


def knockout_genes(data,model,genes):
  """ Generate embeddings for the original dataset and modified dataset using
  the selected model. The modified dataset has gene expressions set to 0 for
  all genes present in the genes list.

  Inputs:
      data (anndata): dataset that is being considered
      model ()
  """
  # knockout genes using batching to reduce explosion in RAM usage
  batch_size = 100
  original_data = data.copy()
  print(len(genes))
  gene_indices = [data.var.index.get_loc(gene) for gene in genes if gene in data.var.index]
  # update data in a vectorised way all at once to reduce memory spike
  data.var['gene_name'][gene_indices] = 0
  print(data.var[0:20])
  perturbed_data = data.copy()
  new_data = model.process_data(adata=perturbed_data, gene_names="gene_name")
  x_model_dropped = model.get_embeddings(new_data)
  # generate embeddings on original data
  new_data = model.process_data(adata = original_data, gene_names = "gene_name")
  x_model = model.get_embeddings(new_data)
  return x_model, x_model_dropped
#genes_to_knockout = ['A1BG', 'A1BG-AS1', 'A1CF', 'A2M']
genes_to_knockout = list(adata.var_names[0:5])

x_modelt, x_model_droppedt = knockout_genes(adata,scgpt,genes_to_knockout)

INFO:helical.utils.downloader:Creating Folder /root/.cache/helical/models/scgpt/scGPT_CP
INFO:helical.utils.downloader:Starting to download: 'https://helicalpackage.blob.core.windows.net/helicalpackage/data/scgpt/scGPT_CP/vocab.json'
Downloading: 100%|██████████| 1.32M/1.32M [00:00<00:00, 3.40MB/s]
INFO:helical.utils.downloader:File saved to: '/root/.cache/helical/models/scgpt/scGPT_CP/vocab.json'
INFO:helical.utils.downloader:Starting to download: 'https://helicalpackage.blob.core.windows.net/helicalpackage/data/scgpt/scGPT_CP/best_model.pt'
Downloading: 100%|██████████| 208M/208M [00:03<00:00, 53.0MB/s]
INFO:helical.utils.downloader:File saved to: '/root/.cache/helical/models/scgpt/scGPT_CP/best_model.pt'
INFO:helical.models.scgpt.model:Model finished initializing.


Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2ML1-AS1',
       'A3GALT2', 'A4GALT', 'A4GNT',
       ...
       'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'ZZEF1', 'ZZZ3',
       'bP-21264C1.2', 'bP-2189O9.3', 'hsa-mir-423'],
      dtype='object', length=37318)
5
               gene_name
A1BG                   0
A1BG-AS1               0
A1CF                   0
A2M                    0
A2M-AS1                0
A2ML1              A2ML1
A2ML1-AS1      A2ML1-AS1
A3GALT2          A3GALT2
A4GALT            A4GALT
A4GNT              A4GNT
AAAS                AAAS
AACS                AACS
AADAC              AADAC
AADACL2-AS1  AADACL2-AS1
AADAT              AADAT
AAED1              AAED1
AAGAB              AAGAB
AAK1                AAK1
AAMDC              AAMDC
AAMP                AAMP


INFO:helical.models.scgpt.model:Filtering out 11168 genes to a total of 26150 genes with an id in the scGPT vocabulary.
INFO:helical.models.scgpt.model:Inference started:
Embedding cells:   0%|          | 0/330 [00:00<?, ?it/s]

In [None]:
import matplotlib.pyplot as plt
plt.figure()
plt.plot(x_model)
plt.plot(x_model_dropped)
plt.show()

In [None]:
# mse
mse = 0
for i,j in zip(x_model,x_model_dropped):
  mse += (i,j)**2
  mse /= len(x_model)
print(mse)