In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import torch
from transformers import AutoModel #, AutoModelForMaskedLM
from tqdm import tqdm

# import pickle
# import re
import os
import csv
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta

# Execute hyena_utility.py through the IPython %run magic.
# NOTE(review): get_model_tokenizer_maxlen, called in a later cell, is not
# defined in any visible cell -- presumably it comes from this script; confirm.
%run hyena_utility.py

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:

# def Cur_Inference(model, tokenizer, max_length, device, sequence):
def Cur_Inference(model, sequence):
    '''
    Embed one sequence with `model` and return its mean-pooled embedding.

    The sequence is tokenized with the notebook-level `tokenizer`, wrapped
    into a batch of one, moved to the notebook-level `device`, and passed
    through `model` under `torch.inference_mode()`.

    Parameters
    ----------
    model
        Callable applied to the batch of token ids. It is used as-is: this
        function neither moves it to a device nor switches it to eval mode.
    sequence
        Input handed straight to `tokenizer`.

    Returns
    -------
    The model output averaged over dim 1, with the leading batch dim
    squeezed out. Assumes the model output is laid out as
    (batch, sequence length, hidden size) -- TODO confirm.
    '''
    # Only the token ids are needed from the tokenizer output.
    input_ids = tokenizer(sequence)["input_ids"]

    # unsqueeze(0) adds the batch dimension in front of the token axis.
    batch = torch.LongTensor(input_ids).unsqueeze(0).to(device)

    # Forward pass without autograd bookkeeping.
    with torch.inference_mode():
        hidden_states = model(batch)

    # Pool over dim 1, then drop the batch dimension of size one.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze(0)


In [None]:
# def Subsequence2Embedding(model, tokenizer, max_length, device, subsequence):
def Subsequence2Embedding(subsequence):
    '''
    Return the embedding of `subsequence` as computed by `Cur_Inference`.

    The notebook-level `model` is used, and the value returned by
    `Cur_Inference` is passed back unchanged.
    '''
    return Cur_Inference(model, subsequence)

# max_length

In [None]:
def append_rows_to_csv(csv_Filename, rows):
    '''
    Append every item of `rows` to `csv_Filename` as one CSV record.

    The file is opened in append mode, so existing content is kept and the
    file is created when it does not exist yet. No header line is written.

    Parameters
    ----------
    csv_Filename
        Path of the CSV file to append to.
    rows
        Iterable of records; each record is an iterable of field values.
    '''
    # newline='' leaves line-ending handling to the csv writer.
    with open(csv_Filename, mode='a', newline='') as handle:
        csv.writer(handle).writerows(rows)

### Main Process

In [None]:
# Checkpoint selection: keep exactly one assignment active. The small-32k
# line used to be live as well but was overwritten on the very next line.
# pretrained_model_name = 'hyenadna-small-32k-seqlen'
pretrained_model_name = 'hyenadna-medium-160k-seqlen'
# pretrained_model_name = 'hyenadna-medium-450k-seqlen'
# pretrained_model_name = 'hyenadna-large-1m-seqlen'

# NOTE(review): get_model_tokenizer_maxlen is not defined in the visible
# cells -- presumably provided by hyena_utility.py; confirm.
model, tokenizer, max_length =  get_model_tokenizer_maxlen(pretrained_model_name)

# Move the model to the selected device and switch it to eval mode once here;
# the inference helper does neither.
model.to(device)
model.eval()

## Load DNA segment data file

In [None]:
import pandas as pd

# Selects which dna_segment_<type>.csv file is read; the same value is also
# embedded in the output file name later on.
# pathogenecity_type = 'noncoding'
pathogenecity_type = 'missense'

# The input file is addressed by a relative path.
df = pd.read_csv(f'dna_segment_{pathogenecity_type}.csv')
df

In [None]:
%%time

# Put a timestamp into the output name so each run writes its own file.
now = datetime.now()
formatted_time = now.strftime("%y-%m-%d-%H-%M-%S")
csv_filename = (
    '/home/sunhuaikuan/ondemand/blue_papers/DNA_LLM_REVIEW/preprocess/pathogenecity/'
    f'pathogenecity_hyena_{pathogenecity_type}_{formatted_time}.csv'
)

# Buffer of finished records that have not been written to disk yet.
rows = []
for index, row in df.iterrows():
    y = row['y']
    subsequence = row['sequence']

    # An 'N' is only reported; the sequence is still embedded below.
    if 'N' in subsequence:
        print("The character 'N' is present in the string.")

    embedding = Subsequence2Embedding(subsequence)

    # One record = the embedding values followed by the label.
    rows.append(np.append(embedding.cpu().numpy(), [y]))

    # Flush the buffer whenever the row label is a positive multiple of 2000.
    if index > 0 and (index % 2000) == 0:
        append_rows_to_csv(csv_filename, rows)
        rows = []
        print(f"index = {index} completed")

# Write whatever is still buffered after the last row.
append_rows_to_csv(csv_filename, rows)

print(f"Create File: {csv_filename}")

### Load CSV File

In [None]:
import pandas as pd

def load_embedding_file(csv_filename, header=None):
    '''
    Load an embedding CSV and name its columns '1', '2', ..., 'y'.

    The files produced by `append_rows_to_csv` contain data records only,
    with no header line. Reading them with pandas' default header inference
    would turn the first record into column names and drop one sample, so
    the file is read with `header=None` unless the caller says otherwise.

    Parameters
    ----------
    csv_filename
        Path of the CSV file to read.
    header
        Forwarded to `pd.read_csv`. Leave at `None` for header-less files;
        pass `0` for a file whose first line is a header to be replaced.

    Returns
    -------
    pandas.DataFrame with every column but the last named by its 1-based
    position (as a string) and the last column named 'y'.
    '''
    df = pd.read_csv(csv_filename, header=header)

    # All columns except the last get their 1-based position as name.
    column_names = [f'{i}' for i in range(1, df.shape[1])]
    column_names.append('y')

    df.columns = column_names
    return df
    

# Read back the file written by the embedding loop to inspect the result.
# NOTE: this rebinds `df`, which until here held the input segment table.
df = load_embedding_file(csv_filename)
df