In [1]:
# Import only the names this notebook uses (instead of a wildcard import)
# so that it is clear where each of them comes from.
from dnallm.datahandling import (
    DNADataset,
    load_preset_dataset,
    show_preset_dataset,
)

### Use a preset dataset to quickly try the fine-tuning pipeline

In [2]:
# Display the preset datasets and their metadata
# (name, description, reference, available tasks, file format and column names)
show_preset_dataset()

{'nucleotide_transformer_downstream_tasks': {'name': 'lgq12697/nucleotide_transformer_downstream_tasks',
  'description': 'A collection of nucleotide transformer downstream tasks datasets from NT.',
  'reference': 'https://doi.org/10.1038/s41592-024-02523-z',
  'tasks': ['enhancers',
   'enhancers_types',
   'H2AFZ',
   'H3K27ac',
   'H3K27me3',
   'H3K36me3',
   'H3K4me1',
   'H3K4me2',
   'H3K4me3',
   'H3K9ac',
   'H3K9me3',
   'H4K20me1',
   'promoter_all',
   'promoter_no_tata',
   'promoter_tata',
   'splice_sites_acceptors',
   'splice_sites_all',
   'splice_sites_donors'],
  'default_task': 'enhancers',
  'format': 'csv',
  'sequence': 'sequence',
  'label': 'label',
  'separator': ',',
  'multi_separator': ';'},
 'GUE': {'name': 'lgq12697/GUE',
  'description': 'A dataset for Genome Understanding Evaluation (GUE) tasks from DNABERT-2.',
  'reference': 'https://doi.org/10.48550/arXiv.2306.15006',
  'tasks': ['emp_H3',
   'emp_H3K14ac',
   'emp_H3K36me3',
   'emp_H3K4me1',
   'e

In [3]:
# Pick one task of a preset dataset and load it by name
dataset = load_preset_dataset(
    dataset_name='plant-genomic-benchmark',
    task='promoter_strength.leaf',
)

Preset dataset 'plant-genomic-benchmark' with 'promoter_strength.leaf' tasks loaded.


In [4]:
# Display the dataset samples (head=1 shows the first record of each split)
dataset.show(head=1)

Dataset: train
{0: {'name': 'ENSRNA049481990_Sb',
     'sequence': 'CGTTTGGGTATGGACATTTAGACTTGTCGTGTTCCTGATGCCTCCCATTCCTATGGTTCTTAGGTGCTCCTTCCTCTTCCTTTCGCTAGCGCAATTGATTTAGTGATGAACACAATATACATTCCAAAGCACATAGTTAGATGAGAGCCTGATGGCAATTGGCAAGTCAG',
     'labels': -0.2166846610255508}}
Dataset: test
{0: {'name': 'AT5G03425_At',
     'sequence': 'TGAGTGAAGGCAGAATTGACCCATGCAGCTTCCTTTCTTTCACCACTCACTTGCTAGGAAACTACAAAAATAGAAAAAGAAAACTCACGGCAACCAAAAACGCGAACTCCTAGAGGGTTTCGAACACTTTGAAATTTGTATCAGACATCAAATGAAATCTTTAACTTCTT',
     'labels': -0.5374512291279694}}


In [5]:
# Display dataset statistics: per-split data type, sample count
# and sequence-length summary (min / max / mean / median)
dataset.statistics()

{'train': {'data_type': 'regression',
  'n_samples': 58179,
  'min_len': 170,
  'max_len': 170,
  'mean_len': 170.0,
  'median_len': 170.0},
 'test': {'data_type': 'regression',
  'n_samples': 7154,
  'min_len': 170,
  'max_len': 170,
  'mean_len': 170.0,
  'median_len': 170.0}}

In [6]:
# Visualize dataset statistics (a confirmation message is printed on success)
dataset.plot_statistics()

Successfully plotted dataset statistics.


In [7]:
## Pre-processing of dataset
from dnallm import load_config, load_model_and_tokenizer

# Read the fine-tuning settings from the YAML configuration file
configs = load_config("finetune_config.yaml")

# Fetch the pre-trained model together with its tokenizer from ModelScope,
# configured with the "task" section of the settings
model, tokenizer = load_model_and_tokenizer(
    "zhangtaolab/plant-dnabert-BPE",
    source="modelscope",
    task_config=configs["task"],
)

# Tokenize the sequences of the dataset loaded above with that tokenizer
dataset.encode_sequences(tokenizer=tokenizer)

Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
## Start fine-tuning
# Uncomment the lines below to fine-tune the model on the encoded dataset,
# using the configuration, model and dataset prepared in the previous cells.
# from dnallm.finetune import DNATrainer

# trainer = DNATrainer(
#     config=configs,
#     model=model,
#     datasets=dataset
# )
# trainer.train()

### Load datasets from Hugging Face or ModelScope

In [None]:
# Load tokenizer for demonstration
# (this rebinds `tokenizer`, which the dataset loaders below receive)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("zhangtaolab/plant-dnabert-BPE")

In [None]:
# Load dataset from Hugging Face
# seq_col / label_col give the names of the sequence and label columns;
# the tokenizer and max_length are handed to the loader as well.
dataset = DNADataset.from_huggingface(
    "zhangtaolab/plant-multi-species-core-promoters", 
    seq_col="sequence", 
    label_col="label", 
    tokenizer=tokenizer, 
    max_length=512
)

In [9]:
# Load dataset from ModelScope
# Same dataset ID and arguments as the Hugging Face example; only the hub differs.
dataset = DNADataset.from_modelscope(
    "zhangtaolab/plant-multi-species-core-promoters", 
    seq_col="sequence", 
    label_col="label", 
    tokenizer=tokenizer, 
    max_length=512
)

### Load datasets from local paths

In [10]:
# Load a dataset from a single local CSV file
dataset = DNADataset.load_local_data(
    "../../../../tests/test_data/regression/train.csv",
    tokenizer=tokenizer,
    max_length=512,
    seq_col="sequence",
    label_col="label",
)

Format labels:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
# Load several files at once (e.g., pre-split datasets): one CSV per split name
dataset = DNADataset.load_local_data(
    dict(
        train="../../../../tests/test_data/regression/train.csv",
        test="../../../../tests/test_data/regression/test.csv",
        validation="../../../../tests/test_data/regression/dev.csv",
    ),
    tokenizer=tokenizer,
    max_length=512,
    seq_col="sequence",
    label_col="label",
)

Format labels:   0%|          | 0/100 [00:00<?, ? examples/s]

Format labels:   0%|          | 0/100 [00:00<?, ? examples/s]

### Manually prepare dataset for fine-tuning

In [None]:
## Binary classification example
# For binary classification, each label is either 0 or 1.
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,1
GCTAGCTAGCTA,0
TATATATATATA,1
'''

In [None]:
## Multi-class classification example
# For multi-class classification, each label is a single integer class index starting from 0.
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,0
GCTAGCTAGCTA,1
TATATATATATA,2
CGCGCGCGCGCG,3
'''

In [None]:
## Multi-label classification example
# For multi-label classification, each label is a list of integers joined by a
# delimiter (e.g., "1;0;1;0;0" with ";" as the delimiter).
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,1;0;1;0;0
GCTAGCTAGCTA,0;1;0;1;0
TATATATATATA,1;1;0;0;1
'''

In [None]:
## Regression example
# For regression, each label is a floating-point number.
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,0.85
GCTAGCTAGCTA,0.23
TATATATATATA,0.67
'''

In [None]:
## Token classification example / Named Entity Recognition (NER)
# For token classification, please refer to the dedicated notebook:
# example/notebooks/finetune_NER_task/data_generation_and_inference.ipynb

In [None]:
## Masked language modeling (MLM) example
# For MLM, only one column, "sequence", is needed; no labels are required since this is a self-supervised task.
# Sequences can be extracted from FASTA files of genomic data.
'''
sequence
ATCGATCGATCG
GCTAGCTAGCTA
TATATATATATA
'''

In [None]:
## Generation example / Causal Language Modeling (CLM)
# For CLM, it needs two columns "input_sequence" and "output_sequence"
# NOTE(review): the sample below shows a single "sequence" column, which does not
# match the two columns named above — confirm the expected format and align them.
# Sequences can be extracted from FASTA files of genomic data.
'''
sequence
ATCGATCGATCG
GCTAGCTAGCTA
TATATATATATA
'''

In [None]:
## These sequences and labels can be saved as a CSV file for loading.
# Uncomment the lines below to load such a file ("data.csv" is a placeholder path).
# dataset = DNADataset.load_local_data(
#     "data.csv", 
#     seq_col="sequence", 
#     label_col="label",
#     tokenizer=tokenizer, 
#     max_length=512
# )