In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import torch

Util functions

In [2]:
# Morgan fingerprints
def generate_morgan_fingerprint(smiles, radius=2, n_bits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # Convert the fingerprint to a sequence of integers
        fingerprint_seq = np.array([int(bit) for bit in fingerprint.ToBitString()])
        return fingerprint_seq
    else:
        return None

In [3]:
path_to_gene2vec = "/lustre/groups/ml01/workspace/alessandro.palma/imCPA_official/data/gene2vec/gene2vec_dim_200_iter_9.txt"
gene2vec_embs = []
genes = []

with open(path_to_gene2vec, "r") as gene2vec_file:
    for line in tqdm(gene2vec_file):
        gene, emb = line.strip("\n").strip().split("\t")
        num_emb = [eval(val) for val in emb.split(" ")]
        genes.append(gene)
        gene2vec_embs.append(num_emb)

24447it [00:15, 1560.61it/s]


In [4]:
gene2vec_df = pd.DataFrame(np.stack(gene2vec_embs, axis=0))
gene2vec_df.index = genes

Code

In [5]:
metadata_df = pd.read_csv("/lustre/groups/ml01/datasets/projects/cpg0000_alessandro/metadata/metadata_large.csv", index_col=0)

In [6]:
metadata_df_unique = metadata_df.drop_duplicates(subset="BROAD_SAMPLE").drop(columns="SAMPLE_KEY").set_index("BROAD_SAMPLE")
metadata_df_unique = metadata_df_unique.loc[np.logical_or(metadata_df_unique.GENE.isin(gene2vec_df.index),
                                                          metadata_df_unique.CPD_NAME=="DMSO")]

In [7]:
metadata_df_unique_drug = metadata_df_unique.loc[metadata_df_unique.PERT_TYPE=="Compound"]

In [8]:
metadata_df_unique_ORF = metadata_df_unique.loc[metadata_df_unique.PERT_TYPE=="ORF"]

In [9]:
metadata_df_unique_CRISPR = metadata_df_unique.loc[metadata_df_unique.PERT_TYPE=="CRISPR"]

Start with RDKIT Morgan FP + Gene2VEC

In [10]:
compound_embeddings = []

for cpd_id in metadata_df_unique_drug.index:
    fp_cpd = generate_morgan_fingerprint(metadata_df_unique_drug.loc[cpd_id, "SEQUENCE"])
    gene_cpd = metadata_df_unique_drug.loc[cpd_id, "GENE"]
    if gene_cpd == "none":
        gene2vec_cpd = np.zeros(200)
    else:
        gene2vec_cpd = gene2vec_df.loc[gene_cpd]
    features_cpd = np.concatenate([fp_cpd, gene2vec_cpd])
    compound_embeddings.append(features_cpd)

compound_embeddings = pd.DataFrame(np.stack(compound_embeddings, axis=0))
compound_embeddings.index = metadata_df_unique_drug.index

In [11]:
compound_embeddings.to_csv("/home/icb/alessandro.palma/environment/IMPA/IMPA/embeddings/cpg0000/cpd_embeddings.csv")

Hyena DNA + Gene2Vec

In [11]:
import sys
sys.path.insert(0, "/home/icb/alessandro.palma/environment/hyena-dna")
from huggingface import HyenaDNAPreTrainedModel, CharacterTokenizer
import torch

In [41]:
def initialize_hyena(type="32k"):
    if type=="1k":
        backbone = {
          "d_model": 128,
          "n_layer": 2,
          "d_inner": 512,
          "vocab_size": 12,
          "resid_dropout": 0.0,
          "embed_dropout": 0.1,
          "fused_mlp": False,
          "fused_dropout_add_ln": True,
          "residual_in_fp32": True,
          "pad_vocab_size_multiple": 8,
          "return_hidden_state": True,
          "layer": {
            "_name_": "hyena",
            "emb_dim": 5,
            "filter_order": 64,
            "local_order": 3,
            "l_max": 1026,
            "modulate": True,
            "w": 10,
            "lr": 6e-4,
            "wd": 0.0,
            "lr_pos_emb": 0.0}}
        pretrained_model_name = 'hyenadna-tiny-1k-seqlen'

    elif type=="32k":
        backbone = {"d_model": 256,
          "n_layer": 4,
          "d_inner": 1024,
          "vocab_size": 12,
          "resid_dropout": 0.0,
          "embed_dropout": 0.1,
          "fused_mlp": False,
          "fused_dropout_add_ln": True,
          "residual_in_fp32": True,
          "checkpoint_mixer": True,
          "checkpoint_mlp": True,
          "pad_vocab_size_multiple": 8,
          "return_hidden_state": True,
          "layer": {
            "_name_": "hyena",
            "emb_dim": 5,
            "filter_order": 64,
            "local_order": 3,
            "l_max": 32770,
            "modulate": True,
            "w": 10,
            "lr": 6e-4,
            "wd": 0.0,
            "lr_pos_emb": 0.0}}
        pretrained_model_name = 'hyenadna-small-32k-seqlen'    

    
    max_lengths = {
        'hyenadna-tiny-1k-seqlen': 1024,
        'hyenadna-small-32k-seqlen': 32768,
        'hyenadna-medium-160k-seqlen': 160000,
        'hyenadna-medium-450k-seqlen': 450000,  # T4 up to here
        'hyenadna-large-1m-seqlen': 1_000_000,  # only A100 (paid tier)
    }
    
    max_length = max_lengths[pretrained_model_name]  # auto selects
    
    # data settings:
    use_padding = True
    rc_aug = False  # reverse complement augmentation
    add_eos = False  # add end of sentence token
    
    # we need these for the decoder head, if using
    use_head = False
    n_classes = 2  # not used for embeddings only
    
    # you can override with your own backbone config here if you want,
    # otherwise we'll load the HF one in None
    backbone_cfg = None
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("Using device:", device)
    
    # instantiate the model (pretrained here)
    if pretrained_model_name in ['hyenadna-tiny-1k-seqlen',
                                 'hyenadna-small-32k-seqlen',
                                 'hyenadna-medium-160k-seqlen',
                                 'hyenadna-medium-450k-seqlen',
                                 'hyenadna-large-1m-seqlen']:
        # use the pretrained Huggingface wrapper instead
        model = HyenaDNAPreTrainedModel.from_pretrained(
            '/home/icb/alessandro.palma/environment/hyena-dna',
            pretrained_model_name,
            download=True,
            config=backbone,
            device=device,
            use_head=use_head,
            n_classes=n_classes,
            type=type
        )
    
    # from scratch
    elif pretrained_model_name is None:
        model = HyenaDNAModel(**backbone_cfg, use_head=use_head, n_classes=n_classes)
    
    # create tokenizer
    tokenizer = CharacterTokenizer(
        characters=['A', 'C', 'G', 'T', 'N'],  # add DNA characters, N is uncertain
        model_max_length=max_length + 2,  # to account for special tokens, like EOS
        add_special_tokens=False,  # we handle special tokens elsewhere
        padding_side='left', # since HyenaDNA is causal, we pad on the left
    )
    return model, tokenizer

def encode_list(model, tokenizer, sequence, device="cuda"):
    tok_seq = tokenizer(sequence)
    tok_seq = tok_seq["input_ids"]  # grab ids
    
    # place on device, convert to tensor
    tok_seq = torch.LongTensor(tok_seq).unsqueeze(0)  # unsqueeze for batch dim
    tok_seq = tok_seq.to(device)
    
    # prep model and forward
    model.to(device)
    model.eval()
    with torch.inference_mode():
        embeddings = model(tok_seq)
    return embeddings.squeeze()

**ORF embedding**

In [42]:
metadata_df_unique_ORF.loc[:, "SEQUENCE"] = metadata_df_unique_ORF.SEQUENCE.str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_df_unique_ORF.loc[:, "SEQUENCE"] = metadata_df_unique_ORF.SEQUENCE.str.upper()


In [43]:
model, tokenizer = initialize_hyena(type="32k")

Using device: cuda
Loaded pretrained weights ok!


In [53]:
ORF_embeddings = []

for ORF_id in metadata_df_unique_ORF.index:
    hyena_orf = encode_list(model, 
                      tokenizer,
                      metadata_df_unique_ORF.loc[ORF_id, "SEQUENCE"]).detach().cpu().mean(0)
    
    gene_ORF = metadata_df_unique_ORF.loc[ORF_id, "GENE"]
    if gene_ORF == "none":
        gene2vec_ORF = np.zeros(200)
    else:
        gene2vec_ORF = gene2vec_df.loc[gene_ORF]
    features_ORF = np.concatenate([hyena_orf, gene2vec_ORF])
    ORF_embeddings.append(features_ORF)

ORF_embeddings = pd.DataFrame(np.stack(ORF_embeddings, axis=0))
ORF_embeddings.index = metadata_df_unique_ORF.index

In [54]:
ORF_embeddings

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,446,447,448,449,450,451,452,453,454,455
BROAD_SAMPLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ccsbBroad304_14741,-0.409847,0.626718,-0.757473,0.566443,0.145741,0.470141,0.549721,0.826112,-0.773768,-0.282782,...,0.002295,-0.226867,-0.268226,0.214683,-0.397653,0.197793,-0.176646,0.037244,-0.238404,-0.444907
ccsbBroad304_01144,-0.394258,0.652808,-0.720505,0.590082,0.129458,0.449604,0.504656,0.692502,-0.790557,-0.327518,...,0.111785,-0.202702,-0.019143,0.030797,-0.058750,0.075269,-0.370653,-0.143320,0.146371,-0.051335
ccsbBroad304_06701,-0.418534,0.621285,-0.786578,0.591756,0.183682,0.507071,0.529287,0.827070,-0.772418,-0.250578,...,0.212313,-0.133364,-0.107397,0.177778,-0.084069,0.171335,-0.145187,0.101690,-0.089437,-0.043603
ccsbBroad304_14770,-0.477444,0.605434,-0.838878,0.536808,0.158124,0.463039,0.537290,0.853152,-0.739782,-0.208564,...,0.038190,-0.304845,0.435419,-0.234603,-0.113952,0.274763,-0.167308,-0.287036,0.215947,0.237109
ccsbBroad304_14679,-0.423706,0.663740,-0.743601,0.607097,0.177317,0.515377,0.567037,0.746573,-0.772847,-0.325948,...,-0.015705,-0.231644,0.310512,-0.055141,0.055737,0.242800,-0.018448,-0.077266,0.111835,-0.059999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ccsbBroad304_14931,-0.529682,0.599548,-0.886362,0.484845,0.116349,0.420778,0.522038,0.829576,-0.723932,-0.138740,...,0.126494,0.073889,0.287814,0.488328,0.115950,0.114169,-0.020160,-0.483957,0.008925,0.291167
ccsbBroad304_01690,-0.466014,0.576979,-0.773365,0.615413,0.144066,0.503766,0.457032,0.878075,-0.839191,-0.274336,...,0.066687,0.400102,-0.123906,-0.128009,-0.203811,-0.000890,-0.084563,-0.258661,0.122599,-0.226970
ccsbBroad304_14525,-0.491326,0.608165,-0.905336,0.542307,0.193194,0.479573,0.579705,0.833302,-0.736084,-0.280498,...,0.009485,-0.441477,0.194611,0.110539,-0.146684,-0.112536,-0.147158,0.097409,0.117752,-0.077280
ccsbBroad304_06451,-0.503765,0.576465,-0.893108,0.541449,0.149166,0.474666,0.535809,0.882125,-0.728006,-0.211793,...,-0.004658,-0.099958,-0.046416,0.279565,0.015143,-0.081547,-0.055542,-0.185902,0.109918,-0.061697


In [70]:
ORF_embeddings.to_csv("/home/icb/alessandro.palma/environment/IMPA/IMPA/embeddings/cpg0000/orf_embeddings.csv")

**CRISPR embedding**

In [57]:
model, tokenizer = initialize_hyena(type="1k")

Using device: cuda
Loaded pretrained weights ok!


In [62]:
CRISPR_embeddings = []

for CRISPR_id in metadata_df_unique_CRISPR.index:
    hyena_crispr = encode_list(model, 
                      tokenizer,
                      metadata_df_unique_CRISPR.loc[CRISPR_id, "SEQUENCE"]).detach().cpu().mean(0)
    
    gene_CRISPR = metadata_df_unique_CRISPR.loc[CRISPR_id, "GENE"]
    if gene_CRISPR == "none":
        gene2vec_CRISPR = np.zeros(200)
    else:
        gene2vec_CRISPR = gene2vec_df.loc[gene_CRISPR]
    features_CRISPR = np.concatenate([hyena_crispr, gene2vec_CRISPR])
    CRISPR_embeddings.append(features_CRISPR)

CRISPR_embeddings = pd.DataFrame(np.stack(CRISPR_embeddings, axis=0))
CRISPR_embeddings.index = metadata_df_unique_CRISPR.index

In [67]:
CRISPR_embeddings

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,318,319,320,321,322,323,324,325,326,327
BROAD_SAMPLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRDN0001487609,-0.487427,0.347149,-0.524059,4.814445,0.191758,-1.563273,0.485494,-0.215841,-0.267696,0.192934,...,0.137273,0.401761,0.371019,-0.106169,0.240476,-0.261226,0.143756,0.068434,-0.329078,-0.022664
BRDN0001479298,-0.638147,0.300631,-0.440361,5.088506,0.294199,-1.714094,0.379803,-0.261289,-0.272920,0.135240,...,0.111693,0.220345,-0.006290,-0.209172,-0.175556,-0.389284,0.053190,-0.142584,-0.035096,0.024887
BRDN0001479710,-0.581903,0.385855,-0.524179,5.186078,0.230934,-1.820913,0.168544,-0.289232,-0.270877,0.367435,...,0.171553,0.237880,0.016805,-0.024685,-0.095572,0.060733,-0.135605,0.007787,-0.115866,0.000298
BRDN0000733096,-0.631087,0.315380,-0.628779,4.864506,0.188502,-1.641159,0.366181,-0.282806,-0.385409,0.150085,...,0.145678,-0.351808,-0.059113,0.155793,-0.158494,-0.001987,-0.131213,-0.205807,-0.002758,-0.122367
BRDN0001482390,-0.426546,0.398020,-0.238600,4.672925,0.099462,-1.537717,0.358292,-0.326800,-0.317919,0.321970,...,-0.027862,0.471959,-0.182183,0.369106,0.460475,0.263095,-0.050838,-0.370975,-0.120371,0.145749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRDN0001488073,-0.585653,0.367644,-0.540410,5.130541,0.179825,-1.676436,0.514168,-0.286172,-0.275858,0.168944,...,-0.196868,-0.116677,-0.244362,0.101826,-0.484365,-0.097732,-0.084960,0.192849,0.118961,-0.288315
BRDN0001480034,-0.519869,0.406055,-0.409904,4.977608,0.236424,-1.811684,0.362824,-0.281558,-0.135067,0.235303,...,-0.225452,0.122836,-0.109937,0.103374,-0.086600,-0.025172,-0.020502,-0.029701,0.087194,-0.023438
BRDN0001144995,-0.572469,0.332737,-0.469059,5.157083,0.251972,-1.739081,0.235031,-0.226824,-0.267061,0.180597,...,0.038190,-0.304845,0.435419,-0.234603,-0.113952,0.274763,-0.167308,-0.287036,0.215947,0.237109
BRDN0001485421,-0.488424,0.368291,-0.338103,4.933823,0.244212,-1.630669,0.462963,-0.279276,-0.115639,0.368485,...,0.227687,-0.061494,0.213924,0.353399,0.210409,-0.045654,0.082137,-0.126077,0.027432,0.308345


In [71]:
CRISPR_embeddings.to_csv("/home/icb/alessandro.palma/environment/IMPA/IMPA/embeddings/cpg0000/crispr_embeddings.csv")


In [73]:
pd.read_csv("/home/icb/alessandro.palma/environment/IMPA/IMPA/embeddings/cpg0000/crispr_embeddings.csv", index_col=0)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,318,319,320,321,322,323,324,325,326,327
BROAD_SAMPLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRDN0001487609,-0.487427,0.347149,-0.524059,4.814445,0.191758,-1.563273,0.485494,-0.215841,-0.267696,0.192934,...,0.137273,0.401761,0.371019,-0.106169,0.240476,-0.261226,0.143756,0.068434,-0.329078,-0.022664
BRDN0001479298,-0.638147,0.300631,-0.440361,5.088506,0.294199,-1.714094,0.379803,-0.261289,-0.272920,0.135240,...,0.111693,0.220345,-0.006290,-0.209172,-0.175556,-0.389284,0.053190,-0.142584,-0.035096,0.024887
BRDN0001479710,-0.581903,0.385855,-0.524179,5.186078,0.230934,-1.820913,0.168544,-0.289232,-0.270877,0.367435,...,0.171553,0.237880,0.016805,-0.024685,-0.095572,0.060733,-0.135605,0.007787,-0.115866,0.000298
BRDN0000733096,-0.631087,0.315380,-0.628779,4.864506,0.188502,-1.641159,0.366181,-0.282806,-0.385409,0.150085,...,0.145678,-0.351808,-0.059113,0.155793,-0.158494,-0.001987,-0.131213,-0.205807,-0.002758,-0.122367
BRDN0001482390,-0.426546,0.398020,-0.238600,4.672925,0.099462,-1.537717,0.358292,-0.326800,-0.317919,0.321970,...,-0.027862,0.471959,-0.182183,0.369106,0.460475,0.263095,-0.050838,-0.370975,-0.120371,0.145749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRDN0001488073,-0.585653,0.367644,-0.540410,5.130541,0.179825,-1.676436,0.514168,-0.286172,-0.275858,0.168944,...,-0.196868,-0.116677,-0.244362,0.101826,-0.484365,-0.097732,-0.084960,0.192849,0.118961,-0.288315
BRDN0001480034,-0.519869,0.406055,-0.409904,4.977608,0.236424,-1.811684,0.362824,-0.281558,-0.135067,0.235303,...,-0.225452,0.122836,-0.109937,0.103374,-0.086600,-0.025172,-0.020502,-0.029701,0.087194,-0.023438
BRDN0001144995,-0.572469,0.332737,-0.469059,5.157083,0.251972,-1.739081,0.235031,-0.226824,-0.267061,0.180597,...,0.038190,-0.304845,0.435419,-0.234603,-0.113952,0.274763,-0.167308,-0.287036,0.215947,0.237109
BRDN0001485421,-0.488424,0.368291,-0.338103,4.933823,0.244212,-1.630669,0.462963,-0.279276,-0.115639,0.368485,...,0.227687,-0.061494,0.213924,0.353399,0.210409,-0.045654,0.082137,-0.126077,0.027432,0.308345
