In [None]:
from dnallm import load_config
from dnallm import load_model_and_tokenizer, DNAPredictor
from dnallm import Benchmark

In [None]:
# Load the inference configuration from the YAML file
configs = load_config("./inference_config.yaml")

### 模型推理

In [None]:
# Load the model and its tokenizer (downloaded from ModelScope).
# The task settings come from the "task" section of the loaded configuration.
model_name = "zhangtaolab/plant-dnagpt-BPE"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    task_config=configs["task"],
    source="modelscope",
)

In [None]:
# Build the predictor from the loaded model, tokenizer and configuration
predictor = DNAPredictor(model=model, tokenizer=tokenizer, config=configs)

In [None]:
# Run prediction directly on sequences supplied in memory
seqs = [
    "ACCGTAGATTGAACGCGGGAGGCAACAGGCTAAATCGTCCGTTCAGCCAAAACGGAATCATGGGCTGTTTTTCCAGAAGGCT",
    "TATATGTTCGCGGGTATAAAATCTTACCCATATTCGTACGCGCGCGGGTATTTTTACCCGTCGGGTAACCCGTACCTGCTAGGAAAGTTAAAAATTCCAATATACTAATAA",
]
results = predictor.predict_seqs(seqs)
print(results)

In [None]:
# Run prediction on sequences read from a file.
# With evaluate=True the call returns a (results, metrics) pair; only the
# metrics are printed here.
seq_file = "./test.csv"
results, metrics = predictor.predict_file(seq_file, evaluate=True)
print(metrics)

### 模型基准测试

In [None]:
# Initialize the benchmark with the loaded configuration
benchmark = Benchmark(config=configs)

In [None]:
# Load the benchmark dataset, naming the columns that hold the sequences
# and the labels
dataset = benchmark.get_dataset(
    "./test.csv",
    seq_col="sequence",
    label_col="label",
)

In [None]:
# Models to benchmark: each key is a short label for the model and each value
# is the model identifier handed to benchmark.run (which is called with
# source="modelscope").
# NOTE(review): the labels are presumably used as display names in the
# benchmark output/plots — confirm against the Benchmark API.
model_names = {
    "Plant DNABERT": "zhangtaolab/plant-dnabert-BPE-promoter",
    "Plant DNAGPT": "zhangtaolab/plant-dnagpt-BPE-promoter",
    "Plant NT": "zhangtaolab/plant-nucleotide-transformer-BPE-promoter",
    "Nucleotide Transformer": "zhangtaolab/nucleotide-transformer-v2-100m-promoter",
    "DNABERT-2": "zhangtaolab/dnabert2-promoter",
}

In [None]:
# Run the benchmark for every model listed in model_names (source: ModelScope)
metrics = benchmark.run(model_names, source="modelscope")

In [None]:
# Plot the benchmark metrics.
#   pbar  - bar chart of the various scores
#   pline - ROC curve
# NOTE(review): save_path presumably writes the figure(s) to plot.pdf — confirm.
pbar, pline = benchmark.plot(metrics, save_path="plot.pdf")

In [None]:
# Display the score bar chart in the notebook
pbar

In [None]:
# Display the ROC curve figure in the notebook
pline