# TransGenic: Single Sequence Prediction

To test TransGenic with an example DNA sequence, first load a pretrained model and the input and output tokenizers from HuggingFace.

In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

# Hugging Face Hub repositories used by this notebook
model_name = "jlomas/HyenaTransgenic-512L9A4-160M"
dna_tokenizer_name = "LongSafari/hyenadna-large-1m-seqlen-hf"

# Both repositories ship custom Python code, so loading them requires
# trust_remote_code=True. This executes code downloaded from the Hub:
# only use it with repositories you trust.
hub_kwargs = {"trust_remote_code": True}

# Pretrained TransGenic model
model = AutoModel.from_pretrained(model_name, **hub_kwargs)

# Output tokenizer, stored in the same repository as the model
gffTokenizer = AutoTokenizer.from_pretrained(model_name, **hub_kwargs)

# Input (DNA) tokenizer
dnaTokenizer = AutoTokenizer.from_pretrained(dna_tokenizer_name, **hub_kwargs)

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-17 13:19:26,643] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/pgl/scratch1/jlomas/miniforge3/envs/transgenic/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/pgl/scratch1/jlomas/miniforge3/envs/transgenic/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


Tokenize an input DNA sequence.

In [2]:
# AT1G58150.TAIR10 gene sequence
seq = 'GCTTATGTTTATCTTTTGATCTGATCTATAAATATATATACAGGTTATCAAAAGGCCTCCACCAAAACCAACTCAACATCTCCGCCTCCATCTCCGCCTCCATCTCCGCCGCGAGTTCCAGACGCTCAAGAATTGGAGTACCTTAAATCCGACTCTTTTCCCGAACACGATGCGTAGAGTTGTCATTCGGACGGAGGTGTGCGTTCCGATAAAATTAGGCTACCGCCGCGGCTTTCAGACCTTCTAGAATTGGAGAAATTGTTTCCCGAACGCGAGGCGCTGAGTTGTCCTTTGGACGGAGATGAGGATTCCAATGAACTTAGGCTACGGCCGCTGGTTCCAGACGCTCAAGAATGGAAGTACCCTAAATCCAAGTTATTTCCCAGACACGCGGCGTGGAGTTGTCATTCGGGCGGAGGTGGAGGTGGAGGCGGTGGCCGTGTATTTACAAATAAAGTAAATGCGGTAGAAGAATTCAACTTAGGAGGACTGAAGGACAGCGAATCCGATTCCGATTCCGAGTAGGGAACTTTTAAAACAACTTTGATTATGGATTTCGATATCCAGAATAATTTTAATTCACTGCTGTTGGACTTGATTAATTTCCTATCACATAACGTTTTGGTTTAACTTTGTACGACCACCA'

# Tokenize the sequence as a batch of one. Calling the tokenizer directly is the
# supported replacement for the deprecated `batch_encode_plus` method.
encoded = dnaTokenizer([seq], return_tensors="pt")

# Drop the last token of each row to remove the [SEP] token
seqs = encoded["input_ids"][:, :-1]

Generate the annotation with the model and decode the output token IDs into a GSF string.

In [3]:
# Put the model in evaluation mode and, when a GPU is available, move both
# the model and the input IDs onto it.
model.eval()
if torch.cuda.is_available():
    seqs = seqs.to("cuda")
    model.to("cuda")

# Generation below samples (do_sample=True), so its result depends on the
# random state. Seed PyTorch first so that re-running the notebook on the same
# setup reproduces the same prediction.
torch.manual_seed(0)

# Prediction (no gradients are needed for generation)
with torch.no_grad():
    outputs = model.generate(
        inputs=seqs,
        num_return_sequences=1,
        max_length=2048,
        num_beams=2,
        do_sample=True,
        decoder_input_ids=None,
    )

# Convert to GSF: decode the generated token IDs back into text
prediction = gffTokenizer.batch_decode(
    outputs.detach().cpu().numpy(),
    skip_special_tokens=True
)

print(prediction)

['<s>2937|CDS1|3207|+|A>CDS1']


Convert the GSF prediction to GFF3 records for downstream analysis.

In [4]:
import os
import sys

# Put the `src` directory (resolved relative to the current working directory)
# at the front of the import path so the local `utils` package can be imported.
src_dir = os.path.abspath(os.path.join(os.getcwd(), '../src'))
sys.path.insert(0, src_dir)
from utils.gsf import gffString2GFF3

# Example values passed to the converter along with the first prediction.
# NOTE(review): these look like placeholders. The gene ID is AT1G..., while the
# sequence ID is "Chr5" -- confirm the intended chromosome and start offset.
seqid = "Chr5"
start = 1234
attributes = "gene_model=AT1G58150.TAIR10"

gff = gffString2GFF3(prediction[0], seqid, start, attributes)

# Print the returned GFF3 records one per line
for gff_line in gff:
    print(gff_line)

Chr5	transgenic	gene	4172	4441	.	+	.	ID=fefa3b58-7231-4b1a-884c-880be647d7af;gene_model=AT1G58150.TAIR10
Chr5	transgenic	mRNA	4172	4441	.	+	.	ID=fefa3b58-7231-4b1a-884c-880be647d7af.t1;Parent=fefa3b58-7231-4b1a-884c-880be647d7af;gene_model=AT1G58150.TAIR10
Chr5	transgenic	CDS	4172	4441	.	+	0	ID=fefa3b58-7231-4b1a-884c-880be647d7af.t1.CDS1;Parent=fefa3b58-7231-4b1a-884c-880be647d7af.t1;gene_model=AT1G58150.TAIR10
