In [8]:
# !pip install genomic_benchmarks
# !pip install omegaconf

In [1]:
# using Kernal PyTorch-2.0.1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import torch
from transformers import AutoModel #, AutoModelForMaskedLM

# import pickle
from tqdm import tqdm

# import pickle
# import re
import os
import csv
import warnings
warnings.filterwarnings('ignore')


%run hyena_utility.py

%run preprocess_utility.py

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

### Load Human Chrom Sequences from .fa File

In [2]:
fasta_file = "../genome.hg38rg.fa"
chrom_sequences = read_fasta(fasta_file)

In [3]:
def Subsequence2Embedding(subsequence):
    tok_seq = tokenizer(subsequence)
    tok_seq = tok_seq["input_ids"]  

    # place on device, convert to tensor
    tok_seq = torch.LongTensor(tok_seq).unsqueeze(0)  # unsqueeze for batch dim
    tok_seq = tok_seq.to(device)

    with torch.inference_mode():
        embeddings = model(tok_seq)

    mean_embeddings = embeddings.mean(dim=1) # Mean across the sequence length dimension
    mean_embeddings = mean_embeddings.squeeze(0)  # This will change the shape to [256]
    
    return mean_embeddings

In [4]:
def append_rows_to_csv(csv_Filename, rows):
    with open(csv_Filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for row in rows:
            writer.writerow(row)

### Main Process

In [5]:
pretrained_model_name = 'hyenadna-medium-160k-seqlen'
model, tokenizer, max_length =  get_model_tokenizer_maxlen(pretrained_model_name)
model.to(device)
model.eval()

Using device: cuda
Git LFS initialized.


Cloning into 'hyenadna-medium-160k-seqlen'...


Loaded pretrained weights ok!


HyenaDNAModel(
  (backbone): LMBackbone(
    (embeddings): GPT2Embeddings(
      (word_embeddings): Embedding(16, 256)
    )
    (layers): ModuleList(
      (0): Block(
        (mixer): HyenaOperator(
          (dropout): Dropout(p=0.0, inplace=False)
          (in_proj): Linear(in_features=256, out_features=768, bias=True)
          (out_proj): Linear(in_features=256, out_features=256, bias=True)
          (short_filter): Conv1d(768, 768, kernel_size=(3,), stride=(1,), padding=(2,), groups=768)
          (filter_fn): HyenaFilter(
            (dropout): Dropout(p=0.0, inplace=False)
            (pos_emb): PositionalEmbedding()
            (implicit_filter): Sequential(
              (0): Linear(in_features=5, out_features=64, bias=True)
              (1): Sin()
              (2): Linear(in_features=64, out_features=64, bias=True)
              (3): Sin()
              (4): Linear(in_features=64, out_features=64, bias=True)
              (5): Sin()
              (6): Linear(in_features=

In [6]:
def get_subsequence(chrom_name, start_pos, length):
    
    if chrom_name in chrom_sequences:
        sequence = chrom_sequences[chrom_name]
        subsequence = sequence[start_pos:start_pos + length]
        return subsequence
    else:
        raise ValueError(f"Chromosome '{chrom_name}' not found in the FASTA file.")

In [7]:
%%time

comp = {'A':1, 'C':2, 'G':3, 'T':4}

max_length = 128

csv_Filename = './homo_sapiens_hyena_embedding.csv'
if os.path.exists(csv_Filename):
    os.remove(csv_Filename)

datafile_path = '../../datasets/task03-genomic-regions/Homo_sapiens.GRCh38.109.txt.gz'  
df = preprocess_home_sapiens_datafile(datafile_path)


df.loc[df['SIZE'] > max_length, 'END'] = df['START'] + max_length
df.loc[df['SIZE'] > max_length, 'SIZE'] = max_length
df.drop(columns=['END','TYPE','CLUSTER'], inplace=True)
# df


rows=[]
for index, row in df.iterrows():      
    chrom=row['CHROM']
    rowid=row['ROWID']
    pos_start=row['START']

    if pos_start<=1:
        pos_start=1
    y=row['y']
    length = row['SIZE'] # max_length
    
    subsequence = get_subsequence(chrom, pos_start, length)
    if 'N' in subsequence:
        print("The character 'N' is present in the string.")
        
    embedding = Subsequence2Embedding(subsequence)
    # print (embedding)

    # feature=np.array(embedding_df.iloc[64])
    rows.append(np.append(embedding.cpu().numpy(),  [rowid, y]))  #chrom, length,  


    if index > 0 and (index % 2000) == 0:
        append_rows_to_csv(csv_Filename, rows)
        rows=[]
        print (f"index = {index} completed")
        
append_rows_to_csv(csv_Filename, rows)

print(f"Create File: "+csv_Filename)

index = 2000 completed
index = 4000 completed
index = 6000 completed
index = 8000 completed
index = 10000 completed
index = 12000 completed
index = 14000 completed
index = 16000 completed
index = 18000 completed
index = 20000 completed
index = 22000 completed
index = 24000 completed
index = 26000 completed
index = 28000 completed
index = 30000 completed
index = 32000 completed
Create File: ./homo_sapiens_hyena_embedding.csv
CPU times: user 4min 23s, sys: 742 ms, total: 4min 23s
Wall time: 4min 37s


### Load CSV File

In [8]:
import pandas as pd

def load_embedding_file(csv_filename):

    df=pd.read_csv(csv_filename)
    
    column_names = [f'{i}' for i in range(1, df.shape[1]-1)]
    column_names.extend(['ROWID', 'y'])
    
    df.columns = column_names
    return df
df = load_embedding_file('./homo_sapiens_hyena_embedding.csv')
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,249,250,251,252,253,254,255,256,ROWID,y
0,-0.098259,-0.778371,-0.567632,-0.864359,-0.912805,-0.907104,1.634788,-0.941197,0.169670,0.114994,...,0.455128,-0.650662,-1.605736,-1.492585,1.182727,0.519747,0.673476,-1.080819,1.0,0.0
1,-0.225349,-0.772242,-0.580955,-1.056253,-1.319412,-0.937095,1.204180,-0.703993,-0.227651,0.182004,...,0.496497,-1.000214,-1.584138,-1.652526,1.257367,0.468274,0.278431,-0.962163,2.0,0.0
2,0.000942,-0.598621,-0.672390,-0.641442,-1.040335,-0.755668,1.274980,-0.978171,-0.042830,0.124320,...,0.494047,-0.849385,-1.569252,-1.409305,1.183825,0.358066,0.373570,-1.182730,3.0,0.0
3,-0.120756,-0.799536,-0.620240,-0.805628,-1.006669,-0.907013,1.548830,-0.936683,0.091012,0.218409,...,0.447258,-0.846772,-1.625529,-1.477122,1.251898,0.502077,0.407137,-1.082190,4.0,0.0
4,-0.087019,-0.803882,-0.531570,-0.888757,-1.351001,-0.758447,1.058772,-0.702686,-0.500163,0.162552,...,0.480105,-1.057028,-1.528152,-1.568092,1.297123,0.465277,0.306736,-0.968242,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33502,-0.144893,-0.688863,-0.502946,-0.884095,-0.949489,-0.980342,1.229003,-0.794743,-0.065076,0.398823,...,0.456091,-0.921927,-1.539195,-1.564770,1.214185,0.504790,0.358191,-0.886767,33713.0,6.0
33503,0.148355,-0.644058,-0.473307,-0.542517,-0.696149,-0.696782,1.290697,-0.928129,-0.102882,0.279975,...,0.486481,-0.966628,-1.423398,-1.251273,1.157585,0.344522,0.594526,-1.020150,33714.0,6.0
33504,-0.148756,-0.699471,-0.448784,-0.855049,-0.818939,-0.974509,1.198471,-0.822832,0.027991,0.465258,...,0.452030,-0.963613,-1.493534,-1.490965,1.182805,0.537592,0.380066,-0.932183,33715.0,6.0
33505,-0.018792,-0.577633,-0.492049,-0.767255,-0.861196,-0.903652,1.238136,-0.859306,0.040800,0.341311,...,0.494729,-0.926624,-1.494075,-1.484290,1.192897,0.479127,0.373609,-0.984641,33716.0,6.0
