In [1]:
from dnallm import load_config, load_model_and_tokenizer, DNADataset, DNATrainer

### Finetune with a custom classification head (for binary classification)

In [2]:
# Read the fine-tuning configuration from a YAML file; the path is relative
# to the directory the notebook kernel is running in.
config_path = "./finetune_config.yaml"
configs = load_config(config_path)

In [3]:
# Fetch the pretrained checkpoint and its tokenizer from ModelScope.
# The 'task' section of the loaded config is handed to the loader.
model_name = "zhangtaolab/plant-dnagpt-BPE"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    task_config=configs['task'],
    source="modelscope",
)

Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE
03:31:19 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE
03:31:19 - dnallm.models.model - INFO - Using mean pooling strategy.


Some weights of DNALLMforSequenceClassification were not initialized from the model checkpoint at /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE and are newly initialized: ['backbone.h.0.attn.c_attn.bias', 'backbone.h.0.attn.c_attn.weight', 'backbone.h.0.attn.c_proj.bias', 'backbone.h.0.attn.c_proj.weight', 'backbone.h.0.ln_1.bias', 'backbone.h.0.ln_1.weight', 'backbone.h.0.ln_2.bias', 'backbone.h.0.ln_2.weight', 'backbone.h.0.mlp.c_fc.bias', 'backbone.h.0.mlp.c_fc.weight', 'backbone.h.0.mlp.c_proj.bias', 'backbone.h.0.mlp.c_proj.weight', 'backbone.h.1.attn.c_attn.bias', 'backbone.h.1.attn.c_attn.weight', 'backbone.h.1.attn.c_proj.bias', 'backbone.h.1.attn.c_proj.weight', 'backbone.h.1.ln_1.bias', 'backbone.h.1.ln_1.weight', 'backbone.h.1.ln_2.bias', 'backbone.h.1.ln_2.weight', 'backbone.h.1.mlp.c_fc.bias', 'backbone.h.1.mlp.c_fc.weight', 'backbone.h.1.mlp.c_proj.bias', 'backbone.h.1.mlp.c_proj.weight', 'backbone.h.10.attn.c_attn.bias', 'backbone.h.10.attn.

In [4]:
# Identifier of the dataset used in this notebook (reused by later cells).
data_name = "zhangtaolab/plant-multi-species-core-promoters"

# Pull the dataset from ModelScope, reading sequences from the "sequence"
# column and labels from the "label" column.
# Hugging Face alternative:
# datasets = DNADataset.from_huggingface(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
datasets = DNADataset.from_modelscope(
    data_name,
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512,
)

# Work on a subset to keep the demo fast (0.1 is presumably the fraction of
# examples to keep — TODO confirm against the sampling() documentation).
# NOTE(review): no seed is passed here, so the sampled subset may differ
# between runs — confirm whether sampling() supports a seed argument.
sampled_datasets = datasets.sampling(0.1, overwrite=True)

# Tokenize the sampled sequences so the trainer can consume them.
sampled_datasets.encode_sequences()

Encoding inputs:   0%|          | 0/6656 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/832 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/832 [00:00<?, ? examples/s]

In [5]:
# Build the trainer from the loaded model, the full config object and the
# encoded, sampled datasets.
trainer = DNATrainer(model=model, config=configs, datasets=sampled_datasets)

In [6]:
# Start training
# The value returned by trainer.train() is kept in `metrics` and printed; in
# the recorded run it is a dict of run statistics (runtime, throughput,
# total FLOs, mean training loss, epoch count).
metrics = trainer.train()
print(metrics)

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auroc,Auprc,Tpr,Tnr,Fpr,Fnr
100,0.7169,0.691902,0.528846,0.528846,1.0,0.691824,0.0,0.561856,0.58317,1.0,0.0,1.0,0.0
200,0.6954,0.694628,0.512019,0.60119,0.229545,0.332237,0.0729,0.582972,0.602086,0.229545,0.829082,0.170918,0.770455
300,0.6597,0.688271,0.627404,0.620818,0.759091,0.683027,0.249245,0.653606,0.653855,0.759091,0.479592,0.520408,0.240909
400,0.5961,0.669269,0.629808,0.644105,0.670455,0.657016,0.25552,0.660569,0.664381,0.670455,0.584184,0.415816,0.329545


{'train_runtime': 481.2868, 'train_samples_per_second': 41.489, 'train_steps_per_second': 0.866, 'total_flos': 5298226681872384.0, 'train_loss': 0.6638245388186521, 'epoch': 3.0}


In [7]:
# Do prediction on the test set
# `results.metrics` is left as the last expression so the notebook displays
# it; in the recorded run it is a dict of `test_*` metrics plus timing info.
results = trainer.infer()
results.metrics

{'test_loss': 0.7007516622543335,
 'test_accuracy': 0.5949519230769231,
 'test_precision': 0.5698924731182796,
 'test_recall': 0.6592039800995025,
 'test_f1': 0.6113033448673587,
 'test_mcc': 0.19533756136885358,
 'test_AUROC': 0.6343283582089552,
 'test_AUPRC': 0.6134710137186506,
 'test_TPR': 0.6592039800995025,
 'test_TNR': 0.5348837209302325,
 'test_FPR': 0.46511627906976744,
 'test_FNR': 0.3407960199004975,
 'test_runtime': 6.5196,
 'test_samples_per_second': 127.615,
 'test_steps_per_second': 2.761}

### Model that is not compatible with the Transformers library (megaDNA)

In [None]:
# Change head config in the config file
# NOTE: both assignments below modify the `configs` object loaded earlier in
# place (nothing is written back to the YAML file here), so any earlier cell
# re-run after this point will see the megaDNA settings.
configs['task'].head_config.head = "megadna"
# Change saved model path
# A separate output directory is set for the megaDNA run.
configs['finetune'].output_dir = "./outputs_megadna"

In [None]:
# Swap in the megaDNA checkpoint; `model` and `tokenizer` from the first
# section are replaced by this cell.
# NOTE(review): the output recorded under this cell shows a ModelScope
# download of "lgq12697/megaDNA_updated", which does not match the Hugging
# Face source / repository id requested here — the cell appears to have been
# edited after it was last run. Confirm the repository id per source and
# re-run the cell.
model_name = "lingxusb/megaDNA_updated"
# Hugging Face source:
model, tokenizer = load_model_and_tokenizer(
    model_name,
    task_config=configs['task'],
    source="huggingface",
)
# ModelScope alternative:
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")

Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/megaDNA_updated
03:39:40 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/megaDNA_updated
03:39:41 - dnallm.models.model - INFO - Using mean pooling strategy.


In [10]:
# Reload the dataset with the megaDNA tokenizer and a 1024 max length.
# `data_name` is the identifier defined in the earlier dataset-loading cell,
# so that cell must have been run first.
datasets = DNADataset.from_modelscope(
    data_name,
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=1024,
)

# Subsample as in the first section.
# NOTE(review): no seed is passed, so this subset may differ from the one
# used for the first model — confirm whether sampling() supports a seed.
sampled_datasets = datasets.sampling(0.1, overwrite=True)

# Tokenize the sampled sequences with the new tokenizer.
sampled_datasets.encode_sequences()

Encoding inputs:   0%|          | 0/6656 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/832 [00:00<?, ? examples/s]

Encoding inputs:   0%|          | 0/832 [00:00<?, ? examples/s]

In [11]:
# Rebuild the trainer around the megaDNA model, the updated config and the
# re-encoded sample; this replaces the `trainer` from the first section.
trainer = DNATrainer(model=model, config=configs, datasets=sampled_datasets)

In [12]:
# Start training
# Same pattern as the first section: the value returned by trainer.train()
# is stored in `metrics` (overwriting the earlier value) and printed.
metrics = trainer.train()
print(metrics)

  self.gen = func(*args, **kwds)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auroc,Auprc,Tpr,Tnr,Fpr,Fnr
100,0.6994,0.67493,0.576923,0.584821,0.61215,0.598174,0.152141,0.60647,0.613311,0.61215,0.539604,0.460396,0.38785
200,0.6757,0.667276,0.58774,0.592191,0.63785,0.614173,0.17345,0.626203,0.625642,0.63785,0.534653,0.465347,0.36215
300,0.6468,0.674051,0.597356,0.662021,0.443925,0.531469,0.214306,0.664928,0.658847,0.443925,0.759901,0.240099,0.556075
400,0.6189,0.667258,0.610577,0.668831,0.481308,0.559783,0.236859,0.673285,0.666627,0.481308,0.747525,0.252475,0.518692


  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)
  self.gen = func(*args, **kwds)


{'train_runtime': 114.7143, 'train_samples_per_second': 174.067, 'train_steps_per_second': 3.635, 'total_flos': 1.8043379178799104e+16, 'train_loss': 0.6572908154494471, 'epoch': 3.0}


In [13]:
# Do prediction on the test set
# `results.metrics` is the last expression so the notebook displays it; in
# the recorded run it is a dict of `test_*` metrics plus timing info.
results = trainer.infer()
results.metrics

  self.gen = func(*args, **kwds)


{'test_loss': 0.6516980528831482,
 'test_accuracy': 0.6213942307692307,
 'test_precision': 0.6920289855072463,
 'test_recall': 0.45368171021377673,
 'test_f1': 0.5480631276901005,
 'test_mcc': 0.26214204444019135,
 'test_AUROC': 0.6950950985661528,
 'test_AUPRC': 0.6844861256476427,
 'test_TPR': 0.45368171021377673,
 'test_TNR': 0.7931873479318735,
 'test_FPR': 0.20681265206812652,
 'test_FNR': 0.5463182897862233,
 'test_runtime': 1.2112,
 'test_samples_per_second': 686.915,
 'test_steps_per_second': 14.861}