# TransGenic: Single Sequence Prediction

To test TransGenic with an example DNA sequence, first load a pretrained model and the input and output tokenizers from the Hugging Face Hub.

In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

# Hub repository that provides both the pretrained model and the output tokenizer.
model_name = "jlomas/HyenaTransgenic-512L9A4-160M"

# trust_remote_code=True allows transformers to execute the custom Python code
# that ships with each repository.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Output tokenizer: used later to decode the generated token IDs.
gffTokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Input tokenizer: used later to encode the raw DNA sequence.
dnaTokenizer = AutoTokenizer.from_pretrained(
    "LongSafari/hyenadna-large-1m-seqlen-hf",
    trust_remote_code=True,
)

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-18 11:13:43,988] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/pgl/scratch1/jlomas/miniforge3/envs/transgenic/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/pgl/scratch1/jlomas/miniforge3/envs/transgenic/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


Tokenize an input DNA sequence.

In [2]:
# AT1G58150.TAIR10 gene sequence
seq = 'GCTTATGTTTATCTTTTGATCTGATCTATAAATATATATACAGGTTATCAAAAGGCCTCCACCAAAACCAACTCAACATCTCCGCCTCCATCTCCGCCTCCATCTCCGCCGCGAGTTCCAGACGCTCAAGAATTGGAGTACCTTAAATCCGACTCTTTTCCCGAACACGATGCGTAGAGTTGTCATTCGGACGGAGGTGTGCGTTCCGATAAAATTAGGCTACCGCCGCGGCTTTCAGACCTTCTAGAATTGGAGAAATTGTTTCCCGAACGCGAGGCGCTGAGTTGTCCTTTGGACGGAGATGAGGATTCCAATGAACTTAGGCTACGGCCGCTGGTTCCAGACGCTCAAGAATGGAAGTACCCTAAATCCAAGTTATTTCCCAGACACGCGGCGTGGAGTTGTCATTCGGGCGGAGGTGGAGGTGGAGGCGGTGGCCGTGTATTTACAAATAAAGTAAATGCGGTAGAAGAATTCAACTTAGGAGGACTGAAGGACAGCGAATCCGATTCCGATTCCGAGTAGGGAACTTTTAAAACAACTTTGATTATGGATTTCGATATCCAGAATAATTTTAATTCACTGCTGTTGGACTTGATTAATTTCCTATCACATAACGTTTTGGTTTAACTTTGTACGACCACCA'

# Tokenize the input sequence and keep only the input_ids tensor, dropping the
# last token of each row (the trailing [SEP] token).
# The tokenizer is called directly: batch_encode_plus() is deprecated in
# transformers in favour of __call__, which returns the same encoding.
seqs = dnaTokenizer(
    [seq],
    return_tensors="pt",
)["input_ids"][:, :-1]

Predict the annotation and convert it to GSF format.

In [3]:
# Generation below samples (do_sample=True), so it is stochastic. Seed the
# torch RNG so that repeated runs on the same setup draw the same samples and
# the printed prediction is reproducible.
torch.manual_seed(0)

# Run on the GPU when one is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)
seqs = seqs.to(device)

# Prediction: beam search (2 beams) with sampling, returning one sequence of
# at most 2048 tokens per input.
outputs = model.generate(
    inputs=seqs,
    num_return_sequences=1,
    max_length=2048,
    num_beams=2,
    do_sample=True,
    decoder_input_ids=None,
)

# Convert to GSF: decode the generated token IDs into strings.
prediction = gffTokenizer.batch_decode(
    outputs.detach().cpu().numpy(),
    skip_special_tokens=True,
)

print(prediction)

['<s>2633|CDS1|3512|+|A>CDS1']


Convert the GSF prediction to GFF3 for downstream analysis.

In [4]:
from transgenic.utils.gsf import gffString2GFF3

# Convert the first predicted GSF string into GFF3 lines.
# In the output below, "Chr5" appears as the sequence ID (column 1) and the
# final string is appended to the attributes column; 1234 appears to be a
# coordinate offset added to the predicted positions.
# NOTE(review): AT1G locus IDs conventionally denote chromosome 1, so "Chr5"
# and 1234 look like placeholder values -- confirm before reusing this call.
gff = gffString2GFF3(
    prediction[0],
    "Chr5",
    1234,
    "gene_model=AT1G58150.TAIR10",
)

# Print one GFF line per row.
for record in gff:
    print(record)

Chr5	transgenic	gene	3868	4746	.	+	.	ID=08ca7c0a-7e8f-43f6-9f87-9cdbd5f2a957;gene_model=AT1G58150.TAIR10
Chr5	transgenic	mRNA	3868	4746	.	+	.	ID=08ca7c0a-7e8f-43f6-9f87-9cdbd5f2a957.t1;Parent=08ca7c0a-7e8f-43f6-9f87-9cdbd5f2a957;gene_model=AT1G58150.TAIR10
Chr5	transgenic	CDS	3868	4746	.	+	0	ID=08ca7c0a-7e8f-43f6-9f87-9cdbd5f2a957.t1.CDS1;Parent=08ca7c0a-7e8f-43f6-9f87-9cdbd5f2a957.t1;gene_model=AT1G58150.TAIR10
