# DNABERT-2 Fine-Tuning (Transformer Baseline)

## Environment Setup

In [2]:
!pip install -q transformers datasets accelerate

## Load HG38 Dataset

In [3]:
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/bioproj01"
DATA_DIR = os.path.join(PROJECT_DIR, "data")
print(f"Data directory found: {PROJECT_DIR}\nContents: {os.listdir(PROJECT_DIR)}")

Mounted at /content/drive
Data directory found: /content/drive/MyDrive/bioproj01
Contents: ['data', 'results']


In [4]:
import pandas as pd

In [5]:
hg38_df = pd.read_csv(
    f"{DATA_DIR}/hg38/human_promoter_vs_nonpromoter_10k_400bp.csv"
)

hg38_df = hg38_df[["sequence", "label"]]
hg38_df.head()

Unnamed: 0,sequence,label
0,TGAACCCCGGGAGGCAAGGGCTGCCATGGCAGGGGTGGGGTTTCAT...,0
1,GGCCCAGCTCTGACGCCAGGCTGTCTTGCCTCTGCTCACCTGCAGC...,1
2,TCATGCCTGGCCAGCAAAATTGTTTTTTAAAAGTTTATGCTACTAA...,1
3,AAGTTAAATAAATCAGGGTTTTCACCTGGTTCTTTAAGATCTGTTG...,0
4,AATGGAAGAAGCCAAAATTTTGCAGAACAAGAGAATATGCAAGAGA...,0


## Train / Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
train_df, test_df = train_test_split(
    hg38_df,
    test_size=0.2,
    stratify=hg38_df["label"],
    random_state=42
)

len(train_df), len(test_df)

(15995, 3999)

## Load DNABERT-2 Tokenizer & Model

In [8]:
!nvidia-smi

Tue Jan 13 13:46:55 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             47W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [9]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [11]:
MODEL_NAME = "quietflamingo/dnabert2-no-flashattention"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

bert_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    num_labels=2,
    attn_implementation="eager"
)

tokenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

configuration_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/quietflamingo/dnabert2-no-flashattention:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


bert_layers.py: 0.00B [00:00, ?B/s]

bert_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/quietflamingo/dnabert2-no-flashattention:
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/quietflamingo/dnabert2-no-flashattention:
- bert_layers.py
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at quietflamingo/dnabert2-no-flashattention and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Convert to HuggingFace Datasets

In [12]:
from datasets import Dataset

In [13]:
# Tokenization Strategy
def tokenize(batch):
    return tokenizer(
        batch["sequence"],
        truncation=True,
        padding="max_length",
        max_length=400
    )

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

train_ds = train_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["sequence"])
test_ds  = test_ds.remove_columns(["sequence"])

train_ds.set_format("torch")
test_ds.set_format("torch")

Map:   0%|          | 0/15995 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/468M [00:00<?, ?B/s]

Map:   0%|          | 0/3999 [00:00<?, ? examples/s]

## Training Configuration

In [14]:
from transformers import TrainingArguments

In [15]:
training_args = TrainingArguments(
    output_dir="./dnabert2_promoter",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    fp16=True,
    dataloader_drop_last=True,
    optim="adamw_torch_fused",
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"
)

## Train DNABERT-2

In [16]:
import numpy as np
from sklearn.metrics import (
    accuracy_score, f1_score,
    roc_auc_score, matthews_corrcoef
)
from transformers import EarlyStoppingCallback, Trainer

In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.array(logits)
    labels = np.array(labels)
    exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
    preds = np.argmax(probs, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
        "roc_auc": roc_auc_score(labels, probs[:, 1]),
        "mcc": matthews_corrcoef(labels, preds),
    }

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 3}.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc,Mcc
1,0.4577,0.407977,0.816633,0.806144,0.895754,0.636918
2,0.3302,0.521833,0.820641,0.813055,0.894751,0.643339
3,0.343,0.642532,0.817635,0.807814,0.885165,0.638531


TrainOutput(global_step=5997, training_loss=0.37573514247866935, metrics={'train_runtime': 416.4728, 'train_samples_per_second': 192.029, 'train_steps_per_second': 23.999, 'total_flos': 1.31173466792832e+16, 'train_loss': 0.37573514247866935, 'epoch': 3.0})

## Final Evaluation

In [18]:
results = trainer.evaluate()
results

{'eval_loss': 0.4079771041870117,
 'eval_accuracy': 0.8166332665330661,
 'eval_f1': 0.8061440677966102,
 'eval_roc_auc': 0.8957537810475112,
 'eval_mcc': 0.6369183864246266,
 'eval_runtime': 17.2695,
 'eval_samples_per_second': 231.565,
 'eval_steps_per_second': 28.953,
 'epoch': 3.0}

## Save Reports to Drive

### Create Output Directory

In [19]:
OUT_DIR = os.path.join(PROJECT_DIR, "results/dnabert2")
os.makedirs(OUT_DIR, exist_ok=True)

### Save Performance Metrics

In [24]:
bert_results = pd.DataFrame([{
    "model": "DNABERT2",
    "accuracy": results["eval_accuracy"],
    "f1": results["eval_f1"],
    "auroc": results["eval_roc_auc"],
    "mcc": results["eval_mcc"],
    "epochs": int(results["epoch"]),
    "pretrained_model": "dnabert2-no-flashattention",
    "sequence_length_bp": 400,
    "evaluation": "80/20 holdout"
}])

bert_results.to_csv(
    os.path.join(OUT_DIR, "dnabert2_performance.csv"),
    index=False
)

bert_results

Unnamed: 0,model,accuracy,f1,auroc,mcc,epochs,pretrained_model,sequence_length_bp,evaluation
0,DNABERT2,0.816633,0.806144,0.895754,0.636918,3,dnabert2-no-flashattention,400,80/20 holdout


### Save Trainer Logs

In [21]:
trainer_log = pd.DataFrame(trainer.state.log_history)
trainer_log.to_csv(
    os.path.join(OUT_DIR, "dnabert2_training_log.csv"),
    index=False
)

## Save ROC Probabilities

In [23]:
pred = trainer.predict(test_ds)

if isinstance(pred.predictions, tuple):
    logits = pred.predictions[0]
else:
    logits = pred.predictions

labels = pred.label_ids

exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

y_score = probs[:, 1]

roc_df = pd.DataFrame({
    "y_true": labels,
    "y_score": y_score
})

roc_df.to_csv(
    os.path.join(OUT_DIR, "dnabert2_roc_data.csv"),
    index=False
)

roc_df.head()

Unnamed: 0,y_true,y_score
0,1,0.977349
1,1,0.779825
2,1,0.102439
3,0,0.168378
4,0,0.580694
