### Preparation

In [1]:
# Download the Arabidopsis thaliana TAIR10 CDS FASTA (Ensembl Plants release 62).
# `-c` continues a partial download, so re-running the cell does not fetch a complete file again.
!wget -c https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-62/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz

--2025-11-16 03:13:19--  https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-62/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz
正在解析主机 ftp.ensemblgenomes.ebi.ac.uk (ftp.ensemblgenomes.ebi.ac.uk)... 193.62.193.161
正在连接 ftp.ensemblgenomes.ebi.ac.uk (ftp.ensemblgenomes.ebi.ac.uk)|193.62.193.161|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 416 Requested Range Not Satisfiable

    文件已下载完成；不会进行任何操作。



In [2]:
from pyfastx import Fasta

# Flatten the CDS FASTA into a two-column CSV: a header row, then one
# "<record name>,<record sequence>" line per FASTA record.
genome = Fasta("Arabidopsis_thaliana.TAIR10.cds.all.fa.gz")
with open("ath_cds.csv", "w") as csv_out:
    csv_out.write("seq_id,sequence\n")
    csv_out.writelines(f"{record.name},{record.seq}\n" for record in genome)

In [3]:
import copy
from dnallm import load_config, load_model_and_tokenizer, DNADataset, DNATrainer

In [4]:
# Read the CSV of CDS records; the nucleotide strings are in the "sequence" column
data_path = "ath_cds.csv"
datasets = DNADataset.load_local_data(
    data_path,
    seq_col="sequence",
    sep=",",
)

# Down-sample in place (0.1 is presumably the kept fraction), then split;
# both steps use a fixed seed so the result is repeatable
datasets.sampling(0.1, overwrite=True, seed=42)
datasets.split_data(seed=42)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Take one sequence from the test split as the reference and use its
# first 10 characters as the generation prompt for both models below.
seq = datasets.dataset["test"][10]["sequence"]
prompt = seq[:10]
print(f"Length: {len(seq)}")
print(f"Prompt sequence: {prompt}")
print(f"Full sequence:   {seq}")

Length: 207
Prompt sequence: ATGACTTGCA
Full sequence:   ATGACTTGCACGACAGAGATAGATATTTTGAAGTGGACAGTGAGGTATTGTTCGAGTTTAGCTGCACACCTTCTAACTCCTACGAGATTGTTCAAATATGAAATTCAACAACAGAGCGATTTGAGAAATGCAACTGAAAACAAAACTGAAAAATATATTTCTGACGACGTCGGTCATTGTAGACATACATACATGCAAATCAGATAA


### DNAGPT

In [6]:
# Read the fine-tuning configuration and send this run's outputs to a DNAGPT-specific folder
configs = load_config("./finetune_config.yaml")
finetune_cfg = configs["finetune"]
finetune_cfg.output_dir = "./outputs_dnagpt"

In [7]:
# Fetch the plant DNAGPT (single-base) checkpoint together with its tokenizer.
# ModelScope is the source used here; to pull from Hugging Face instead, call
# load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
model_name = "zhangtaolab/plant-dnagpt-singlebase"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    task_config=configs["task"],
    source="modelscope",
)

# Override the tokenizer's maximum length
tokenizer.model_max_length = 2048

Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-singlebase
03:13:28 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-singlebase


In [8]:
# Encode the datasets
# Work on a deep copy so the un-tokenized `datasets` object stays available
# for the MegaDNA section further down, which encodes it again with its own tokenizer.
data = copy.deepcopy(datasets)
data.encode_sequences(tokenizer=tokenizer)

Encoding inputs:   0%|          | 0/3382 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/966 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/484 [00:00<?, ? examples/s]

In [9]:
# Build the trainer from the loaded model, the configuration and the encoded data
trainer = DNATrainer(model=model, config=configs, datasets=data)

In [10]:
# Start training
# `train()` hands back a summary of the run (runtime, throughput, final training loss,
# epochs), which is printed so it is kept in the notebook output.
metrics = trainer.train()
print(metrics)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,1.2891,1.278866
400,1.2723,1.276379


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 175.6958, 'train_samples_per_second': 38.498, 'train_steps_per_second': 2.413, 'total_flos': 1767379304448000.0, 'train_loss': 1.280015787988339, 'epoch': 2.0}


In [11]:
# Put the model in inference mode before sampling
model.eval()

# Reuse EOS as the padding token. Assigning it on the tokenizer alone does not reach
# `generate()` (it would fall back to EOS on its own and emit a warning), so the id is
# also passed explicitly below.
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Draw 5 sampled continuations of the prompt. `max_length` caps the total length
# (prompt included) slightly above the reference sequence length.
outputs = model.generate(
    **inputs,
    max_length=len(seq) + 5,
    num_return_sequences=5,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
    pad_token_id=tokenizer.pad_token_id,
)

Setting `pad_token_id` to `eos_token_id`:9 for open-end generation.


In [12]:
# Show the prompt, each decoded sample (spaces between tokens stripped),
# and the original sequence the prompt was cut from.
print("Prompt:               ", prompt)
decoded_samples = (tokenizer.decode(token_ids, skip_special_tokens=True) for token_ids in outputs)
for idx, text in enumerate(decoded_samples):
    print(f"Generated sequence {idx}: ", "".join(text.split(" ")))
print("Raw sequence:         ", seq)

Prompt:                ATGACTTGCA
Generated sequence 0:  ATGACTTGCAGGAGCAAAATCTTCAAATAGAAGTAAGCCATTGCTGGATCCTTGAAACCGGAAGCTATACTTCTATACCTTTTTTGCAGAACAGAGGCCACACCGTTGATCATGTGTACTCCAGGCCAAGTTTTATAGGCATTGGGCCTCAAAACAAAGACTTGGGTTCAGCCACGGTTTGGGGCTTTGCTAGAGCAAAAGAGCTATTCT
Generated sequence 1:  ATGACTTGCACAGAAGCTTTCAACAAAGAAGAAAACGCTTCTCTTCGTACAAGGATGCTTCAGCTGCAATTGCTACTAATCCTCCTGTTCTTGCTGCTGTCTACTCCATTGGAGACTCGGAATTCTCCAGTAGCGATCAGGATCTTCGTTGCCTTGAAAGCTTTTGTTGCTCAGAGACTAAAGCTTCGGAGTTTTCCTATAGGTCTAAGT
Generated sequence 2:  ATGACTTGCAGAACATCAAGATCGTGATGCAGATTGCATTTTGTAGCAACAGAACAGCAAATGTGGAAACAGCTATCATGTCGGTAATCCACGAATGGCATGACGAAATCCAATTGATAACATCAAAAGTCGCTGCTAACGCCGAAAATATCCGTAACGTTACACACCGGATGTATCTCTTCATAGAAATCTTTCAGTTGTCCACCAAGC
Generated sequence 3:  ATGACTTGCAGGCGACGCCGTTGGATACTTTATGGGGGGTGTTTTCAGAGTTGGATAATATGGAGAGACACGTGACCACCATATATACAAGACTGAGCACTTCCGATGGCGGTATCAGGAAACAGATGTCGGCGCAATTATACTTGAGAATTGTCGAAGCAGGAATCGTGTGGGCTCCTAACCAACCTCTACACCACCTAACCGAAACAG
Generated sequence 4:  ATGACTT

### MegaDNA

In [13]:
# Reload the configuration from disk, switch the task type to "embedding" for this
# model, and send this run's outputs to a MegaDNA-specific folder
configs = load_config("./finetune_config.yaml")
task_cfg, finetune_cfg = configs["task"], configs["finetune"]
task_cfg.task_type = "embedding"
finetune_cfg.output_dir = "./outputs_megadna"

In [None]:
# Fetch the MegaDNA checkpoint and its tokenizer from Hugging Face.
# ModelScope alternative:
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
# NOTE(review): the log recorded under this cell shows a ModelScope download of
# "lgq12697/megaDNA_updated", so the ModelScope route presumably needs that repo id
# instead of the Hugging Face one below -- confirm before switching `source`.
model_name = "lingxusb/megaDNA_updated"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    task_config=configs["task"],
    source="huggingface",
)

# Override the tokenizer's maximum length
tokenizer.model_max_length = 2048

Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/megaDNA_updated
03:16:39 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/megaDNA_updated


In [15]:
# Encode the datasets
# Start again from a fresh deep copy of the un-tokenized `datasets`; this rebinds `data`,
# replacing the copy that was encoded earlier for DNAGPT, and uses the tokenizer loaded
# alongside the MegaDNA model.
data = copy.deepcopy(datasets)
data.encode_sequences(tokenizer=tokenizer)

Encoding inputs:   0%|          | 0/3382 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/966 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/484 [00:00<?, ? examples/s]

In [16]:
# Specific processing for MEGA-DNA: keep only the token ids and expose them under the
# name "ids" (the custom trainer below forwards each batch to the model as **inputs).
#
# Only columns that actually exist are touched, so the cell can be re-run safely and
# also works with a tokenizer that does not emit e.g. `token_type_ids`.
columns_by_split = data.dataset.column_names  # {split name: [column, ...]}


def _in_every_split(column):
    """Return True when `column` is present in all splits of the dataset."""
    return all(column in columns for columns in columns_by_split.values())


droppable = [
    column
    for column in ("seq_id", "sequence", "token_type_ids", "attention_mask")
    if _in_every_split(column)
]
if droppable:
    data.dataset = data.dataset.remove_columns(droppable)
if _in_every_split("input_ids"):
    data.dataset = data.dataset.rename_column("input_ids", "ids")
data.dataset

DatasetDict({
    train: Dataset({
        features: ['ids'],
        num_rows: 3382
    })
    test: Dataset({
        features: ['ids'],
        num_rows: 966
    })
    val: Dataset({
        features: ['ids'],
        num_rows: 484
    })
})

In [17]:
# Build the trainer from the loaded model, the configuration and the prepared data
trainer = DNATrainer(model=model, config=configs, datasets=data)

In [18]:
# Define a custom trainer for MEGA-DNA
class MegaDNATrainer(type(trainer.trainer)):
    """Trainer subclass that takes the loss straight from the model's forward call.

    The base class is whatever trainer class `DNATrainer` created, so its existing
    behaviour is inherited and only the loss computation is replaced.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss by calling the model with ``return_value="loss"``.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping, forwarded to the model as keyword arguments.
            return_outputs: When true, also return the model's logits.
            **kwargs: Extra arguments supplied by the trainer; accepted and ignored.

        Returns:
            The loss, or ``(loss, {"logits": logits})`` when ``return_outputs`` is true.
        """
        loss = model(**inputs, return_value="loss")
        if not return_outputs:
            return loss

        # The logits are obtained with a separate forward call.
        logits = model(**inputs, return_value="logits")
        # Return the outputs as a dict: the Hugging Face Trainer's prediction step treats
        # any non-dict `outputs` as a (loss, ...) sequence and keeps `outputs[1:]`, which
        # for a bare logits tensor would silently drop the first sample of the batch.
        # (Assumes the base class follows the Hugging Face Trainer contract.)
        return loss, {"logits": logits}


trainer.customize_trainer(MegaDNATrainer)
# The prepared dataset only has an "ids" column (no labels), so flag that the model
# can still return a loss; presumably needed for the validation loss -- TODO confirm.
trainer.trainer.can_return_loss = True

In [19]:
# Start training
# `train()` hands back a summary of the run (runtime, throughput, final training loss,
# epochs), which is printed so it is kept in the notebook output.
metrics = trainer.train()
print(metrics)

  self.gen = func(*args, **kwds)


Step,Training Loss,Validation Loss
200,1.3221,1.306477
400,1.305,1.30198


  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)


{'train_runtime': 51.8054, 'train_samples_per_second': 130.565, 'train_steps_per_second': 8.184, 'total_flos': 0.0, 'train_loss': 1.3128084911490387, 'epoch': 2.0}


In [20]:
# Put the model in inference mode before sampling
model.eval()

# Move the prompt to the device the model's weights are on rather than hard-coding
# "cuda", so the cell also runs on CPU-only machines or a non-default GPU.
model_device = next(model.parameters()).device
inputs = tokenizer(prompt, return_tensors="pt").to(model_device)

# Each `generate` call is made separately, so call it 5 times to collect 5 samples.
# `seq_len` is set slightly above the reference sequence length.
outputs = [
    model.generate(
        inputs["input_ids"],
        seq_len=len(seq) + 5,
        temperature=0.95,
        filter_thres=0.0,
    )
    for _ in range(5)
]

  0%|          | 0/202 [00:00<?, ?it/s]

  self.gen = func(*args, **kwds)


  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

In [21]:
# Show the prompt, each decoded sample (first row of every generate() result, with
# spaces between tokens stripped), and the original sequence the prompt was cut from.
print("Prompt:               ", prompt)
decoded_samples = (tokenizer.decode(batch[0], skip_special_tokens=True) for batch in outputs)
for idx, text in enumerate(decoded_samples):
    print(f"Generated sequence {idx}: ", "".join(text.split(" ")))
print("Raw sequence:         ", seq)

Prompt:                ATGACTTGCA
Generated sequence 0:  ATGACTTGCATGGCATCGAGCAATCACGAGTGCTCGAGTAGTTGGTGGCAGTCAGCCCATAGTGGATGCTCCACTAGTCTTGGGTTGACCTCCTCTGATTGGAAGTCTATGATTGTTGGACCATCCCCGTTTGGATCCCCATCTCTGGCTGGCTTTAGTACTAACTGGATCACTAGGACTCCTAATCATTCATCAGGTCTCGGGACCTGTGC
Generated sequence 1:  ATGACTTGCAAAAGGAGAGTATTTCTTGGCTGCCTCTCTGCCGAACCAAACATTCAAGAACCTCCCGAAATTGCTCGTGAAACTGTAACGCTCGGTATCAAAAACCCGAAATCAAGAAGGGAATATCTTACTCTCTACAAAAAACGAAGGGGAAAGATCTTTGTTCATCCGAGCGCTGATGTGCACATTATGGAACTCGAGATGGGTTTTCA
Generated sequence 2:  ATGACTTGCATGTTCTTCCATTCTTTCTCCTCACCTTGTCTTATCCGTAGCCCCCTGCTGCTTCAGGACTTTCGGTCTCTCCTGCTCTTTCTCCTGCTCCTGCTCTCTCTCACCGGGGATCTTCCCACATTTCTGACGCTGCCAGAAGTGGTGAAGCTGCTGGGCTTCCTCCCCTTCGTGGAGTTTCCTTTCTCCGCAGCCCGCCCATGTTG
Generated sequence 3:  ATGACTTGCATTTCCAGAGAAGACGAAATGCAAGCAATCCTCCACGAAGAGCGGGAAGAGATCAACGAGCTTCGCATTGAAGATGAAGAAGATGAAGGTGAACATGTTACCTCTTACAAGAAGAATGAATCGCTCACCACTCATGATGATCTGCTGGATATCGTTCTTGATGAGCTCAAGAAAGAGCGGATTGGTAATGAAGAAGCTGAGAT
Generated sequence 4: 