In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, BertConfig
from transformers.models.bert.configuration_bert import BertConfig

import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')


%run preprocess_utility.py

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [33]:
datafile_path = '../../datasets/task03-genomic-regions/Homo_sapiens.GRCh38.109.txt.gz'  
df = preprocess_home_sapiens_datafile(datafile_path)
df

Unnamed: 0,CHROM,START,END,TYPE,CLUSTER,SIZE,ROWID,y
0,7,116953541,116953691,first_exon,first_exon,150,0,0
1,12,54241755,54241905,first_exon,first_exon,150,1,0
2,20,38033948,38034098,first_exon,first_exon,150,2,0
3,15,82659560,82659710,first_exon,first_exon,150,3,0
4,9,131579497,131579647,first_exon,first_exon,150,4,0
...,...,...,...,...,...,...,...,...
33713,9,124692442,124692551,miRNA,smallRNA,109,33713,6
33714,9,128244721,128244830,miRNA,smallRNA,109,33714,6
33715,X,45746157,45746266,miRNA,smallRNA,109,33715,6
33716,X,45747015,45747124,miRNA,smallRNA,109,33716,6


In [34]:
df['CLUSTER'].value_counts()

pseudogene               4998
first_intron             4985
ncRNA_gene               4975
first_three_prime_UTR    4971
first_five_prime_UTR     4952
first_exon               4926
smallRNA                 3701
Name: CLUSTER, dtype: int64

In [38]:
df.loc[df['CLUSTER'] == 'pseudogene', 'SIZE'].mean()

333.17406962785117

In [39]:
df.loc[df['CLUSTER'] == 'first_intron', 'SIZE'].mean()

247.46258776328986

In [40]:
df.loc[df['CLUSTER'] == 'ncRNA_gene', 'SIZE'].mean()

388.17608040201003

In [41]:
df.loc[df['CLUSTER'] == 'first_three_prime_UTR', 'SIZE'].mean()

324.2804264735466

In [42]:
df.loc[df['CLUSTER'] == 'first_five_prime_UTR', 'SIZE'].mean()

235.85238287560583

In [43]:
df.loc[df['CLUSTER'] == 'first_exon', 'SIZE'].mean()

227.30024360535933

In [44]:
df.loc[df['CLUSTER'] == 'smallRNA', 'SIZE'].mean()

86.66414482572277

In [3]:

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config).to(device)

Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should prob

### Load Human Chrom Sequences from .fa File

In [4]:
fasta_file = "../genome.hg38rg.fa"
chrom_sequences = read_fasta(fasta_file)

In [5]:
def get_embedding(dna):

    inputs = tokenizer(dna, return_tensors = 'pt')["input_ids"].to(device)

    with torch.no_grad():
        hidden_states = model(inputs)[0]  # Get model outputs
        last_hidden_state = hidden_states[0]  # Access the last hidden state directly

    # Extract the [CLS] embedding
    cls_embedding = last_hidden_state[0, :]  # Extract the [CLS] embedding

    return cls_embedding

In [6]:
def get_subsequence(chrom_name, start_pos, length):
    
    # print(chrom_name, start_pos, length)
    if chrom_name in chrom_sequences:
        sequence = chrom_sequences[chrom_name]
        subsequence = sequence[start_pos:start_pos + length]
        return subsequence
    else:
        raise ValueError(f"Chromosome '{chrom_name}' not found in the FASTA file.")

In [8]:
def append_rows_to_csv(csv_Filename, rows):
    with open(csv_Filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for row in rows:
            writer.writerow(row)

### Main Process

In [9]:
%%time

comp = {'A':1, 'C':2, 'G':3, 'T':4}

max_length=512

csv_Filename = './homo_sapiens_dnabert2_embedding.csv'
if os.path.exists(csv_Filename):
    os.remove(csv_Filename)


datafile_path = '../../datasets/task03-homo-sapiens/Homo_sapiens.GRCh38.109.txt.gz'  
df = preprocess_home_sapiens_datafile(datafile_path)

df.loc[df['SIZE'] > max_length, 'END'] = df['START'] + max_length
df.loc[df['SIZE'] > max_length, 'SIZE'] = max_length
df.drop(columns=['END','TYPE','CLUSTER'], inplace=True)
# df

rows=[]
for index, row in df.iterrows():      
    chrom=row['CHROM']
    pos_start=row['START']
    if pos_start<=1:
        pos_start=1
    rowid=row['ROWID']
    y=row['y']
    length = row['SIZE']
    
    subsequence = get_subsequence(chrom, pos_start, length)
    if 'N' in subsequence:
        print("The character 'N' is present in the string.")
        
    embedding = get_embedding(subsequence)
    # print(embedding.shape)

    # feature=np.array(embedding_df.iloc[64])
    rows.append(np.append(embedding.cpu().detach().numpy(),  [ rowid, y]))

    if ((index % 2000) ==0):
        append_rows_to_csv(csv_Filename, rows)
        rows=[]
        print (f"index = {index} completed")
        
append_rows_to_csv(csv_Filename, rows)

print(f"Create File: "+csv_Filename)
    

index = 0 completed
index = 1000 completed
index = 2000 completed
index = 3000 completed
index = 4000 completed
index = 5000 completed
index = 6000 completed
index = 7000 completed
index = 8000 completed
index = 9000 completed
index = 10000 completed
index = 11000 completed
index = 12000 completed
index = 13000 completed
index = 14000 completed
index = 15000 completed
index = 16000 completed
index = 17000 completed
index = 18000 completed
index = 19000 completed
index = 20000 completed
index = 21000 completed
index = 22000 completed
index = 23000 completed
index = 24000 completed
index = 25000 completed
index = 26000 completed
index = 27000 completed
index = 28000 completed
index = 29000 completed
index = 30000 completed
index = 31000 completed
index = 32000 completed
index = 33000 completed
Create File: ./homo_sapiens_dnabert2_embedding.csv
CPU times: user 5min 29s, sys: 1.19 s, total: 5min 30s
Wall time: 6min 7s


### Load CSV File

In [10]:
import pandas as pd

def load_embedding_file(csv_filename):

    df=pd.read_csv(csv_filename)
        
    column_names = [f'{i}' for i in range(1, df.shape[1]-1)]
    column_names.extend(['ROWID',  'y']) 
    
    df.columns = column_names
    return df

df = load_embedding_file('./homo_sapiens_dnabert2_embedding.csv')
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,761,762,763,764,765,766,767,768,ROWID,y
0,-0.052749,0.097021,-0.040993,-0.346654,-0.228739,-0.028974,0.314350,-0.232598,-0.286229,-0.153878,...,-0.122888,0.098309,-0.241941,0.123506,0.112383,0.345726,-0.009172,-0.232611,1.0,0.0
1,0.041690,-0.149883,0.112252,-0.104266,0.043270,-0.174862,-0.019622,0.083466,0.092115,0.129391,...,-0.084495,-0.035526,-0.103098,-0.096645,0.282093,0.071169,0.225908,-0.232176,2.0,0.0
2,-0.223088,-0.252698,0.238166,-0.163469,-0.071502,0.305217,0.130825,-0.125792,0.002430,-0.027796,...,-0.222114,0.230668,-0.297031,-0.200689,-0.019629,0.158987,0.125832,0.101306,3.0,0.0
3,-0.246262,-0.045285,-0.143605,-0.057505,-0.053923,0.136253,-0.029290,-0.291272,-0.286636,-0.224322,...,0.013754,-0.042559,-0.304382,-0.128556,-0.014183,-0.054658,-0.004198,-0.140027,4.0,0.0
4,-0.239890,0.128498,0.239283,0.033655,-0.266111,-0.010146,-0.093944,0.070097,-0.157701,0.294032,...,-0.085098,0.080050,-0.031489,-0.122065,0.021016,0.050609,0.141866,0.200009,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33502,0.017066,-0.111009,0.022180,-0.261551,-0.271820,-0.162913,0.190877,-0.169575,0.006689,0.179100,...,-0.030896,-0.036130,-0.062570,-0.123316,0.205648,0.112838,0.123704,0.236961,33713.0,6.0
33503,-0.121790,-0.037089,0.006528,-0.118168,-0.280798,-0.099464,0.028415,0.005333,-0.223928,-0.348742,...,-0.287694,0.021488,-0.167536,0.144424,0.048377,0.103088,0.146799,0.023316,33714.0,6.0
33504,-0.048092,-0.305211,-0.069179,-0.200773,-0.300104,-0.013643,0.447924,-0.122317,-0.286923,-0.055943,...,0.035547,-0.012060,-0.154126,0.243896,0.189965,0.304383,0.006802,-0.142035,33715.0,6.0
33505,-0.184854,-0.242769,0.011729,-0.293413,-0.274668,-0.121231,0.203303,-0.101680,0.109956,0.164296,...,-0.041231,-0.229076,-0.249888,-0.023293,0.185503,0.099200,-0.165828,-0.165640,33716.0,6.0
