# TransGenic: Single Sequence Prediction

To test TransGenic with an example DNA sequence, first load a pretrained model and the input and output tokenizers from the Hugging Face Hub.

In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

# Hub repository of the pretrained TransGenic checkpoint. The same repository
# is used below for the output tokenizer.
# NOTE: trust_remote_code=True lets transformers run code shipped in the
# repository, so only use repositories you trust.
model_name = "jlomas/HyenaTransgenic-768L12A6-400M"
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Output tokenizer: used later to decode the generated annotation tokens
gffTokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Input tokenizer: used later to encode the DNA sequence fed to the model
dnaTokenizer = AutoTokenizer.from_pretrained(
    "LongSafari/hyenadna-large-1m-seqlen-hf",
    trust_remote_code=True,
)

Tokenize an input DNA sequence.

In [5]:
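# Alternative example (disabled): AT1G58150.TAIR10 gene sequence. The active input is the S. lycopersicum Solyc01G000065 sequence read from file below.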
# seq = 'GCTTATGTTTATCTTTTGATCTGATCTATAAATATATATACAGGTTATCAAAAGGCCTCCACCAAAACCAACTCAACATCTCCGCCTCCATCTCCGCCTCCATCTCCGCCGCGAGTTCCAGACGCTCAAGAATTGGAGTACCTTAAATCCGACTCTTTTCCCGAACACGATGCGTAGAGTTGTCATTCGGACGGAGGTGTGCGTTCCGATAAAATTAGGCTACCGCCGCGGCTTTCAGACCTTCTAGAATTGGAGAAATTGTTTCCCGAACGCGAGGCGCTGAGTTGTCCTTTGGACGGAGATGAGGATTCCAATGAACTTAGGCTACGGCCGCTGGTTCCAGACGCTCAAGAATGGAAGTACCCTAAATCCAAGTTATTTCCCAGACACGCGGCGTGGAGTTGTCATTCGGGCGGAGGTGGAGGTGGAGGCGGTGGCCGTGTATTTACAAATAAAGTAAATGCGGTAGAAGAATTCAACTTAGGAGGACTGAAGGACAGCGAATCCGATTCCGATTCCGAGTAGGGAACTTTTAAAACAACTTTGATTATGGATTTCGATATCCAGAATAATTTTAATTCACTGCTGTTGGACTTGATTAATTTCCTATCACATAACGTTTTGGTTTAACTTTGTACGACCACCA'
# Path of the example input: S. lycopersicum ITAG5.0 gene Solyc01G000065
# (region 1:565133..584148). Only the first line of the file is used.
seq_path = '/home/pgl/data/S.lycopersicum ITAG5.0|Solyc01G000065|1:565133..584148.txt'
with open(seq_path, 'r') as seq_file:
    # readline() keeps the trailing newline; strip surrounding whitespace so
    # that no non-DNA character is passed to the tokenizer.
    seq = seq_file.readline().strip()

# Fail early with a clear message instead of tokenizing an empty sequence
if not seq:
    raise ValueError(f"No sequence found on the first line of {seq_path!r}")

# Preview the first 200 characters of the sequence
print(seq[:200])

# Tokenize the input sequence and remove the [SEP] token (last position)
seqs = dnaTokenizer.batch_encode_plus(
    [seq],
    return_tensors="pt")["input_ids"][:, :-1]

ATGATCTATTTCGAGAATGATCTTTATCCACGGTAACAGTAAATTCATAAAAATTATTTGCCTTAAACTTACTAAATTTTGCTCATTATTCTGATTAATGTCAACTAGTCTCCAGGATTCAAATATTTGCCCTTCAAAAGCAAACTTTTTCTCTCCAACAAGAGACTTTGTTTTCATGTTAGAATTTTATTATTACTTTA


Predict the annotation and convert it to GSF format.

In [6]:
# Seed PyTorch's random number generators: generation below samples
# (do_sample=True), so without a fixed seed the predicted annotation can
# differ between runs of this cell. The seed value itself is arbitrary.
torch.manual_seed(0)

# Switch the model to evaluation mode and use the GPU when one is available
model.eval()
if torch.cuda.is_available():
    seqs = seqs.to("cuda")
    model.to("cuda")

# Prediction: a single returned sequence, two beams with sampling,
# at most 2048 tokens
outputs = model.generate(
    inputs=seqs,
    num_return_sequences=1,
    max_length=2048,
    num_beams=2,
    do_sample=True,
    decoder_input_ids=None
)

# Convert to GSF: decode the generated token ids back to text
prediction = gffTokenizer.batch_decode(
    outputs.detach().cpu().numpy(),
    skip_special_tokens=True
)

print(prediction)

    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    
  queued_call()


['<s>1156|CDS1|4132|+|A;5459|CDS2|5756|+|A;6015|CDS3|6180|+|A;6252|CDS4|7944|+|A;16842|CDS5|17276|+|A>CDS1|CDS2|CDS3|CDS4|CDS5']


Convert the output to GFF for downstream analysis.

In [16]:
from transgenic.utils.gsf import gffString2GFF3

# Convert the first predicted string into GFF3 records. The remaining
# arguments are the sequence ID "1", the offset 565133 (start of the input
# region), and an extra attribute string attached to the records.
gff = gffString2GFF3(prediction[0], "1", 565133, "gene_model=Solyc01G000065.ITAG5.0")

# Show every GFF3 record
for record in gff:
    print(record)

# Save the records, one per line
with open('/home/pgl/results/transgenic/S.lycopersicum_ITAG5.0_01G000065_single_seq_results.gff3','w') as gff_file:
    gff_file.writelines(record + "\n" for record in gff)

1	transgenic	gene	566290	582409	.	+	.	ID=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0;gene_model=Solyc01G000065.ITAG5.0
1	transgenic	mRNA	566290	582409	.	+	.	ID=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1;Parent=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0;gene_model=Solyc01G000065.ITAG5.0
1	transgenic	CDS	566290	569265	.	+	0	ID=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1.CDS1;Parent=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1;gene_model=Solyc01G000065.ITAG5.0
1	transgenic	CDS	570593	570889	.	+	0	ID=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1.CDS2;Parent=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1;gene_model=Solyc01G000065.ITAG5.0
1	transgenic	CDS	571149	571313	.	+	0	ID=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1.CDS3;Parent=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1;gene_model=Solyc01G000065.ITAG5.0
1	transgenic	CDS	571386	573077	.	+	0	ID=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1.CDS4;Parent=095f5cf5-59e9-4c3f-8267-e76fb9a6baf0.t1;gene_model=Solyc01G000065.ITAG5.0
1	transgenic	CDS	581976	582409	.	+	0	ID=095f5cf5-59e9-4c3f-8267-e