# DNABERT-2 Fine-Tuning (Transformer Baseline)

## Environment Setup

In [1]:
!pip install -q transformers datasets accelerate

## Load HG38 Dataset

In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/bioproj01"
DATA_DIR = os.path.join(PROJECT_DIR, "data")
print(f"Data directory found: {PROJECT_DIR}\nContents: {os.listdir(PROJECT_DIR)}")

Mounted at /content/drive
Data directory found: /content/drive/MyDrive/bioproj01
Contents: ['data']


In [3]:
import pandas as pd

In [4]:
hg38_df = pd.read_csv(
    f"{DATA_DIR}/hg38/human_promoter_vs_nonpromoter_10k_400bp.csv"
)

hg38_df = hg38_df[["sequence", "label"]]
hg38_df.head()

Unnamed: 0,sequence,label
0,TGCATATTATTTTATATGCATCTATTTTGAATCTTCATAAATGTAA...,0
1,GGCCCAGCTCTGACGCCAGGCTGTCTTGCCTCTGCTCACCTGCAGC...,1
2,TCATGCCTGGCCAGCAAAATTGTTTTTTAAAAGTTTATGCTACTAA...,1
3,TGCCTGGTTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCAT...,0
4,AATAATTGAAATAAGCTTAATAAATGGGCTCAAAAGAATGAAAGAG...,0


## Train / Test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train_df, test_df = train_test_split(
    hg38_df,
    test_size=0.2,
    stratify=hg38_df["label"],
    random_state=42
)

len(train_df), len(test_df)

(15995, 3999)

## Load DNABERT-2 Tokenizer & Model

In [8]:
!nvidia-smi

Tue Jan 13 02:02:24 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             45W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [9]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [10]:
MODEL_NAME = "quietflamingo/dnabert2-no-flashattention"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

bert_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    num_labels=2,
    attn_implementation="eager"
)

tokenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

configuration_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/quietflamingo/dnabert2-no-flashattention:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


bert_layers.py: 0.00B [00:00, ?B/s]

bert_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/quietflamingo/dnabert2-no-flashattention:
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/quietflamingo/dnabert2-no-flashattention:
- bert_layers.py
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at quietflamingo/dnabert2-no-flashattention and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Convert to HuggingFace Datasets

In [11]:
from datasets import Dataset

In [12]:
# Tokenization Strategy
def tokenize(batch):
    return tokenizer(
        batch["sequence"],
        truncation=True,
        padding="max_length",
        max_length=400
    )

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

train_ds = train_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["sequence"])
test_ds  = test_ds.remove_columns(["sequence"])

train_ds.set_format("torch")
test_ds.set_format("torch")

Map:   0%|          | 0/15995 [00:00<?, ? examples/s]

Map:   0%|          | 0/3999 [00:00<?, ? examples/s]

## Training Configuration

In [13]:
from transformers import TrainingArguments

In [14]:
training_args = TrainingArguments(
    output_dir="./dnabert2_promoter",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    fp16=True,
    dataloader_drop_last=True,
    optim="adamw_torch_fused",
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"
)

## Train DNABERT-2

In [17]:
import numpy as np
from sklearn.metrics import (
    accuracy_score, f1_score,
    roc_auc_score, matthews_corrcoef
)
from transformers import EarlyStoppingCallback, Trainer

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.array(logits)
    labels = np.array(labels)
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
    preds = np.argmax(probs, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
        "roc_auc": roc_auc_score(labels, probs[:, 1]),
        "mcc": matthews_corrcoef(labels, preds),
    }

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 3}.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc,Mcc
1,0.4392,0.408527,0.815882,0.798796,0.895686,0.640935
2,0.3302,0.498136,0.822645,0.804095,0.898158,0.657022
3,0.3132,0.67296,0.827906,0.820204,0.892483,0.658161


TrainOutput(global_step=5997, training_loss=0.35759634190009637, metrics={'train_runtime': 422.8258, 'train_samples_per_second': 189.144, 'train_steps_per_second': 23.639, 'total_flos': 1.31173466792832e+16, 'train_loss': 0.35759634190009637, 'epoch': 3.0})

## Final Evaluation

In [19]:
results = trainer.evaluate()
results

{'eval_loss': 0.40852653980255127,
 'eval_accuracy': 0.8158817635270541,
 'eval_f1': 0.7987955105392828,
 'eval_roc_auc': 0.8956857592152639,
 'eval_mcc': 0.6409352052533037,
 'eval_runtime': 17.5133,
 'eval_samples_per_second': 228.34,
 'eval_steps_per_second': 28.55,
 'epoch': 3.0}