In [4]:
#!/usr/bin/env python3
"""
Script to load and inspect pickle files from the phylogenetic GFlowNet repository.
This script can handle both sequence data and other pickle files in the repository.
"""

import pickle
import os
import sys
from typing import Any, Dict, List, Union
import numpy as np

def load_pickle_file(filepath: str) -> Any:
    """
    Load a pickle file and return its contents.
    
    Args:
        filepath (str): Path to the pickle file
        
    Returns:
        Any: Contents of the pickle file
    """
    try:
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        return data
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None

In [15]:
from transformers.models.bert.configuration_bert import BertConfig
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import os
import sys
from typing import Any, Dict, List, Union
import numpy as np

path = 'dataset/benchmark_datasets/DS1.pickle'
dnadata = load_pickle_file(path)
#print(data)
dnakeys = list(dnadata.keys())
print(dnakeys)

['Alligator_mississippiensis', 'Ambystoma_mexicanum', 'Amphiuma_tridactylum', 'Bufo_valliceps', 'Discoglossus_pictus', 'Eleutherodactylus_cuneatus', 'Gallus_gallus', 'Gastrophryne_carolinensis', 'Grandisonia_alternans', 'Heterodon_platyrhinos', 'Homo_sapiens', 'Hyla_cinerea', 'Hypogeophis_rostratus', 'Ichthyophis_bannanicus', 'Latimeria_chalumnae', 'Mus_musculus', 'Nesomantis_thomasseti', 'Oryctolagus_cuniculus', 'Plethodon_yonhalossee', 'Rattus_norvegicus', 'Scaphiopus_holbrooki', 'Sceloporus_undulatus', 'Siren_intermedia', 'Trachemys_scripta', 'Turdus_migratorius', 'Typhlonectes_natans', 'Xenopus_laevis']


In [16]:
import torch
from transformers import AutoTokenizer, AutoModel, BertConfig

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and move to GPU
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=config)
model = model.to(device)  # Move model to GPU

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

# Process your DNA sequence
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
dna = dnadata[dnakeys[0]]
inputs = tokenizer(dna, return_tensors='pt')["input_ids"]
inputs = inputs.to(device)  # Move inputs to GPU

# Forward pass
hidden_states = model(inputs)[0]
print(hidden_states.shape)
print(hidden_states)

Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 791, 768])
tensor([[[ 0.0868,  0.1897,  0.3563,  ...,  0.5004,  0.5196,  0.0335],
         [ 0.0525,  0.0992,  0.0924,  ...,  0.4640,  0.3608, -0.1479],
         [-0.1916,  0.0549,  0.0168,  ...,  0.2166,  0.2230, -0.0431],
         ...,
         [ 0.2057, -0.1602,  0.1408,  ...,  0.2269,  0.2780, -0.3086],
         [ 0.1618, -0.1540,  0.1913,  ...,  0.1918,  0.2205, -0.2679],
         [-0.0283,  0.2904,  0.4328,  ...,  0.1829,  0.0898, -0.1776]]],
       device='cuda:0', grad_fn=<ViewBackward0>)




In [7]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")
    
    # Test basic CUDA operations
    x = torch.randn(3, 3).cuda()
    y = torch.randn(3, 3).cuda()
    z = torch.mm(x, y)
    print("Basic CUDA operations work!")
    
print("PyTorch import and basic operations successful!")

PyTorch version: 2.5.1
CUDA available: True
CUDA version: 12.4
Number of GPUs: 2
Current GPU: NVIDIA GeForce RTX 4070 Ti SUPER
Basic CUDA operations work!
PyTorch import and basic operations successful!


In [8]:
import torch
import numpy as np
from ete3 import TreeNode
from torch import nn

# --- Helper Classes and Functions (Adapted from provided code & standard models) ---

class MockCfgNode:
    """A mock configuration class to replace OmegaConf/easydict objects."""
    def __init__(self, d=None):
        if d is None:
            d = {}
        for k, v in d.items():
            setattr(self, k, MockCfgNode(v) if isinstance(v, dict) else v)

CHARACTERS_MAPS = {
    'DNA': {
        'A': [1., 0., 0., 0.], 'C': [0., 1., 0., 0.],
        'G': [0., 0., 1., 0.], 'T': [0., 0., 0., 1.],
        'N': [1., 1., 1., 1.]
    }
}

class PhyloTreeReward:
    """Mock reward function class, not used in calculation but required for environment setup."""
    def __init__(self, reward_cfg):
        self.C = reward_cfg.C
        self.scale = reward_cfg.SCALE
    def __call__(self, log_score):
        return (self.C + log_score) / self.scale

class MockEdgeEnv:
    """Mock edge environment, as noise perturbation is not needed for likelihood calculation."""
    def generate_random_perturbation(self, edge_length, is_root):
        return 0.0

def build_edge_env(cfg):
    """Factory function for the mock edge environment."""
    return MockEdgeEnv()

class EvolutionModelTorch(nn.Module):
    """
    A PyTorch implementation of a DNA evolution model required by the environment.
    This example uses the Jukes-Cantor (JC69) model.
    """
    def __init__(self, model_name='JC69'):
        super().__init__()
        self.model_name = model_name
        # Rate matrix Q for JC69 model
        self.Q = torch.tensor([
            [-3, 1, 1, 1], [1, -3, 1, 1],
            [1, 1, -3, 1], [1, 1, 1, -3]
        ], dtype=torch.float32) / 4.0
        # Equilibrium frequencies pi for JC69 (uniform)
        self.pi = torch.tensor([0.25, 0.25, 0.25, 0.25], dtype=torch.float32)

    def get_transition_matrix(self, t):
        """Calculates the transition probability matrix P(t) = expm(Qt)."""
        return torch.matrix_exp(self.Q * t.item())

    def compute_partial_prob(self, data, at_root):
        """
        Implements Felsenstein's pruning algorithm to compute partial likelihoods at an internal node.
        
        Args:
            data (list): A list like [[child1_likelihoods, branch1_length], [child2_likelihoods, branch2_length]].
            at_root (bool): Flag indicating if the current node is the root of the tree.
        """
        child_probs = []
        for partial_likelihoods, branch_length in data:
            P = self.get_transition_matrix(branch_length)
            # Transformed likelihoods from child node: L_child @ P.T
            prob = torch.matmul(partial_likelihoods.squeeze(0), P.T)
            child_probs.append(prob)

        # Merge likelihoods from children by element-wise multiplication
        merged_likelihoods = child_probs[0]
        for i in range(1, len(child_probs)):
            merged_likelihoods *= child_probs[i]
        
        merged_likelihoods = merged_likelihoods.unsqueeze(0)

        if at_root:
            # For the root, calculate the final site likelihoods weighted by equilibrium frequencies
            likelihood_per_site = torch.sum(self.pi * merged_likelihoods.squeeze(0), dim=1)
            # Total log-likelihood is the sum of the log of site likelihoods
            log_score = torch.sum(torch.log(likelihood_per_site))
            return merged_likelihoods, log_score
        
        return merged_likelihoods, None

class PhylogenticTreeEnv(nn.Module):
    """
    The environment for phylogenetic tree operations, adapted from the provided source.
    This version is simplified to focus solely on the `compute_tree_log_score` method.
    """
    def __init__(self, cfg, sequences):
        super(PhylogenticTreeEnv, self).__init__()
        self.cfg = cfg
        self.sequences = sequences
        self.reward_fn = PhyloTreeReward(cfg.ENV.REWARD)
        self.chars_dict = CHARACTERS_MAPS[cfg.ENV.SEQUENCE_TYPE]
        seq_arrays = np.array([self.seq2array(seq) for seq in self.sequences])
        self.seq_arrays = torch.nn.Parameter(torch.tensor(seq_arrays, dtype=torch.float32), requires_grad=False)
        self.evolution_model = EvolutionModelTorch(cfg.ENV.EVOLUTION_MODEL)
        self.edge_env = build_edge_env(cfg)

    def seq2array(self, seq):
        """Converts a sequence string to a numpy array based on character maps."""
        return np.array([self.chars_dict[char] for char in seq])

    def compute_tree_log_score(self, ete_tree, with_noise):
        """
        Computes the log-likelihood of a given ete_tree object.

        Args:
            ete_tree (ete3.TreeNode): The tree to evaluate.
            with_noise (bool): If True, applies GFN-specific perturbations. Should be False for pure likelihood.

        Returns:
            A tuple containing the final log-likelihood tensor, the dictionary of feature vectors, 
            and the discrete factor (0 if with_noise is False).
        """
        feature_dict = {}
        final_log_score = None

        # Traverse the tree from leaves up to the root (postorder)
        for node in ete_tree.traverse("postorder"):
            node_id = int(node.name)
            if node.is_leaf():
                # For leaves, the feature is the one-hot encoded sequence data
                feature_dict[node_id] = self.seq_arrays[node_id].unsqueeze(0)
            else:
                # For internal nodes, gather data from children
                child_data = []
                for child in node.children:
                    child_id = int(child.name)
                    # Data includes child's partial likelihoods and its connecting branch length
                    child_data.append(
                        [feature_dict[child_id], torch.tensor([child.dist])]
                    )
                
                # Compute the partial likelihood for the current node
                feature, log_score = self.evolution_model.compute_partial_prob(child_data, node.is_root())
                feature_dict[node_id] = feature
                if node.is_root():
                    final_log_score = log_score
        
        return final_log_score, feature_dict, 0.0

# --- Main Evaluation Function ---

def evaluate_tree_log_probability(env, newick_tree_string):
    """
    High-level function to evaluate the log-likelihood of a phylogenetic tree.

    Args:
        env: object of PhylogenticTreeEnv that contains sequences and model
        newick_tree_string (str): The tree in Newick format. Leaf names must be
                                  string integers matching sequence indices.
    """
    

    # 3. Parse the Newick string into an ete3 tree object
    ete_tree = TreeNode(newick_tree_string)
    
    # 4. Name internal nodes, which is required for the feature dictionary keys
    internal_node_counter = num_sequences
    for node in ete_tree.traverse("preorder"):
        if not node.is_leaf() and not node.name:
            node.name = str(internal_node_counter)
            internal_node_counter += 1

    print("--- Evaluating Tree ---")
    print(ete_tree.get_ascii(attributes=["name", "dist"]))
    print("-" * 25)

    # 5. Compute the log score (log-likelihood)
    # Set with_noise=False to get the pure likelihood without GFN-related perturbations.
    log_score_tensor, _, _ = env.compute_tree_log_score(ete_tree, with_noise=False)
    log_likelihood = log_score_tensor.item()

    print(f"✅ Calculated Log-Likelihood: {log_likelihood:.4f}")
    return log_likelihood


if __name__ == '__main__':
    # --- Example Usage ---

    # 1. Define the aligned DNA sequences
    dna_sequences = [
        "AGAACT",  # Corresponds to leaf "0"
        "AGATGT",  # Corresponds to leaf "1"
        "CGAACT",  # Corresponds to leaf "2"
        "CGATGT",  # Corresponds to leaf "3"
    ]

    # 2. Define the tree in Newick format with branch lengths.
    # Note: Leaf names must match the indices of the sequences list.
    newick_tree = "((0:0.1, 1:0.1):0.2, (2:0.1, 3:0.1):0.2);"

    # 3. Evaluate the log-likelihood of the tree

    sequences = dna_sequences

    num_sequences = len(sequences)
    if not sequences:
        raise ValueError("Sequence list cannot be empty.")
    seq_len = len(sequences[0])

    # 3.1. Configure parameters for the environment
    cfg = MockCfgNode({
        'ENV': {
            'REWARD': {'C': 0, 'SCALE': 1.0},
            'SEQUENCE_TYPE': 'DNA',
            'EVOLUTION_MODEL': 'JC69'
        }
    })

    # 3.2. Initialize the phylogenetic environment
    env = PhylogenticTreeEnv(cfg, sequences)

    # 3.3. Evaluate the tree log-likelihood
    evaluate_tree_log_probability(
        env=env,
        newick_tree_string=newick_tree
    )

--- Evaluating Tree ---

            /-0, 0.1
      /5, 0.2
     |      \-1, 0.1
-4, 0.0
     |      /-2, 0.1
      \6, 0.2
            \-3, 0.1
-------------------------
✅ Calculated Log-Likelihood: -27.0373


In [9]:
#!/usr/bin/env python3
import torch
import numpy as np
import pickle
from ete3 import TreeNode
from torch import nn
from transformers import AutoTokenizer, AutoModel

# PyTorch Geometric for the Tree GNN
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data

# --- PART 1: DNA Sequence Embedding (Adapted from your notebook) ---

def get_dna_embeddings(sequences, model, tokenizer, device):
    """
    Generates embeddings for a list of DNA sequences using DNABERT.

    Args:
        sequences (list[str]): A list of DNA sequences.
        model: The pre-trained DNABERT model.
        tokenizer: The DNABERT tokenizer.
        device: The torch device (e.g., 'cuda' or 'cpu').

    Returns:
        torch.Tensor: A tensor of shape [num_sequences, embedding_dim]
    """
    print("--- Part 1: Generating DNA Sequence Embeddings ---")
    all_embeddings = []
    for i, seq in enumerate(sequences):
        print(f"  Processing sequence {i+1}/{len(sequences)}...")
        inputs = tokenizer(seq, return_tensors='pt')["input_ids"].to(device)
        with torch.no_grad():
            hidden_states = model(inputs)[0] # [1, sequence_length, 768]
        
        # Use mean pooling to get a fixed-size embedding for the sequence
        sequence_embedding = hidden_states.mean(dim=1)
        all_embeddings.append(sequence_embedding)
    
    print("✅ Embeddings generated.\n")
    return torch.cat(all_embeddings, dim=0)

# --- PART 2: Tree Embedding with a Graph Neural Network ---

class TreeGNN(nn.Module):
    """
    A Graph Neural Network to generate an embedding for a phylogenetic tree.
    It takes node features (from DNABERT) and the tree structure as input.
    """
    def __init__(self, input_dim, hidden_dim=128, output_dim=64):
        super(TreeGNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        """
        Forward pass for the GNN.

        Args:
            data (torch_geometric.data.Data): A graph data object with attributes:
                - x: Node feature matrix [num_nodes, input_dim]
                - edge_index: Graph connectivity [2, num_edges]
                - batch: Batch vector [num_nodes] for pooling
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # GCN layers
        x = self.relu(self.conv1(x, edge_index))
        x = self.relu(self.conv2(x, edge_index))

        # Global pooling to get a single vector for the entire graph (tree)
        graph_embedding = global_mean_pool(x, batch)
        
        # Final linear layer
        tree_vector = self.output_layer(graph_embedding)
        return tree_vector

def newick_to_graph_data(newick_string, leaf_embeddings):
    """
    Converts a Newick tree string and leaf embeddings into a PyG Data object.

    Args:
        newick_string (str): The tree in Newick format.
        leaf_embeddings (torch.Tensor): A tensor of embeddings for the leaf nodes.

    Returns:
        torch_geometric.data.Data: The graph representation for the GNN.
    """
    ete_tree = TreeNode(newick_string)
    
    # Map nodes to integer indices
    node_map = {node: i for i, node in enumerate(ete_tree.traverse("preorder"))}
    num_nodes = len(node_map)
    
    # Create the node feature matrix 'x'
    embedding_dim = leaf_embeddings.shape[1]
    x = torch.zeros((num_nodes, embedding_dim), dtype=torch.float32)

    # Build the edge index (adjacency list)
    edge_list = []
    for node in ete_tree.traverse():
        if not node.is_root():
            parent_idx = node_map[node.up]
            child_idx = node_map[node]
            edge_list.append([parent_idx, child_idx])
            edge_list.append([child_idx, parent_idx]) # Add edges in both directions

        # Assign pre-computed embeddings to leaf nodes
        if node.is_leaf():
            leaf_idx = int(node.name)
            x[node_map[node]] = leaf_embeddings[leaf_idx]

    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    
    return Data(x=x, edge_index=edge_index)


# --- PART 3: Likelihood Evaluation as Reward (Adapted from your notebook) ---
# NOTE: This section is a direct copy of the necessary classes and functions
# from your `evaluate_tree_log_probability` code for self-containment.

class MockCfgNode:
    def __init__(self, d=None):
        if d is None: d = {}
        for k, v in d.items():
            setattr(self, k, MockCfgNode(v) if isinstance(v, dict) else v)

CHARACTERS_MAPS = {'DNA': {'A': [1.,0.,0.,0.], 'C': [0.,1.,0.,0.], 'G': [0.,0.,1.,0.], 'T': [0.,0.,0.,1.], 'N': [1.,1.,1.,1.]}}

class PhyloTreeReward:
    def __init__(self, reward_cfg):
        self.C = reward_cfg.C
        self.scale = reward_cfg.SCALE
    def __call__(self, log_score):
        return (self.C + log_score) / self.scale

class EvolutionModelTorch(nn.Module):
    def __init__(self, model_name='JC69'):
        super().__init__()
        self.Q = torch.tensor([[-3,1,1,1],[1,-3,1,1],[1,1,-3,1],[1,1,1,-3]], dtype=torch.float32) / 4.0
        self.pi = torch.tensor([0.25, 0.25, 0.25, 0.25], dtype=torch.float32)
    def get_transition_matrix(self, t):
        return torch.matrix_exp(self.Q * t.item())
    def compute_partial_prob(self, data, at_root):
        child_probs = [torch.matmul(p.squeeze(0), self.get_transition_matrix(bl).T) for p, bl in data]
        merged_likelihoods = child_probs[0]
        for i in range(1, len(child_probs)): merged_likelihoods *= child_probs[i]
        merged_likelihoods = merged_likelihoods.unsqueeze(0)
        if at_root:
            log_score = torch.sum(torch.log(torch.sum(self.pi * merged_likelihoods.squeeze(0), dim=1)))
            return merged_likelihoods, log_score
        return merged_likelihoods, None

class PhylogenticTreeEnv(nn.Module):
    def __init__(self, cfg, sequences):
        super(PhylogenticTreeEnv, self).__init__()
        self.cfg = cfg
        self.sequences = sequences
        self.chars_dict = CHARACTERS_MAPS[cfg.ENV.SEQUENCE_TYPE]
        seq_arrays = np.array([np.array([self.chars_dict[c] for c in s]) for s in self.sequences])
        self.seq_arrays = torch.nn.Parameter(torch.tensor(seq_arrays, dtype=torch.float32), requires_grad=False)
        self.evolution_model = EvolutionModelTorch(cfg.ENV.EVOLUTION_MODEL)
    def compute_tree_log_score(self, ete_tree):
        feature_dict = {}
        final_log_score = None
        for node in ete_tree.traverse("postorder"):
            node_id = int(node.name)
            if node.is_leaf():
                feature_dict[node_id] = self.seq_arrays[node_id].unsqueeze(0)
            else:
                child_data = [[feature_dict[int(c.name)], torch.tensor([c.dist])] for c in node.children]
                feature, log_score = self.evolution_model.compute_partial_prob(child_data, node.is_root())
                feature_dict[node_id] = feature
                if node.is_root():
                    final_log_score = log_score
        return final_log_score

def get_tree_likelihood_reward(sequences, newick_tree_string):
    """
    Calculates the log-likelihood of a tree, which serves as the reward.
    """
    print("--- Part 3: Calculating Likelihood as Reward Signal ---")
    cfg = MockCfgNode({'ENV': {'REWARD': {'C': 0, 'SCALE': 1.0}, 'SEQUENCE_TYPE': 'DNA', 'EVOLUTION_MODEL': 'JC69'}})
    env = PhylogenticTreeEnv(cfg, sequences)
    
    ete_tree = TreeNode(newick_tree_string)
    internal_node_counter = len(sequences)
    for node in ete_tree.traverse("preorder"):
        if not node.is_leaf() and not node.name:
            node.name = str(internal_node_counter)
            internal_node_counter += 1
    
    log_score_tensor = env.compute_tree_log_score(ete_tree)
    reward = log_score_tensor.item()
    print(f"✅ Calculated Log-Likelihood (Reward): {reward:.4f}\n")
    return reward

# --- Main Execution ---

if __name__ == '__main__':
    # --- Setup ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}\n")
    
    # Use the example sequences and tree from your likelihood script
    dna_sequences = ["AGAACT", "AGATGT", "CGAACT", "CGATGT"]
    newick_tree = "((0:0.1, 1:0.1):0.2, (2:0.1, 3:0.1):0.2);"
    
    # --- Part 1: Generate Sequence Embeddings ---
    dnabert_model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True).to(device)
    dnabert_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
    
    leaf_node_embeddings = get_dna_embeddings(dna_sequences, dnabert_model, dnabert_tokenizer, device)

    # --- Part 2: Generate Tree Embedding ---
    print("--- Part 2: Generating Tree Vector Embedding ---")
    # Convert tree to graph format for the GNN
    graph_data = newick_to_graph_data(newick_tree, leaf_node_embeddings.to('cpu'))
    graph_data = graph_data.to(device) # Move graph data to the selected device
    graph_data.batch = torch.zeros(graph_data.num_nodes, dtype=torch.long).to(device) # Add batch vector

    # Initialize and run the TreeGNN model
    tree_gnn_model = TreeGNN(input_dim=leaf_node_embeddings.shape[1]).to(device)
    tree_vector_embedding = tree_gnn_model(graph_data)
    
    print("✅ Tree vector embedding generated.")
    print(f"   Shape: {tree_vector_embedding.shape}")
    print(f"   Vector: {tree_vector_embedding.detach().cpu().numpy()}\n")

    # --- Part 3: Get Reward Signal ---
    reward = get_tree_likelihood_reward(dna_sequences, newick_tree)

    print("--- Pipeline Complete ---")

Using device: cuda



Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Part 1: Generating DNA Sequence Embeddings ---
  Processing sequence 1/4...
  Processing sequence 2/4...
  Processing sequence 3/4...
  Processing sequence 4/4...
✅ Embeddings generated.

--- Part 2: Generating Tree Vector Embedding ---
✅ Tree vector embedding generated.
   Shape: torch.Size([1, 64])
   Vector: [[-0.09252521 -0.12469912 -0.11347823 -0.00265401  0.06644897  0.10902996
   0.01276676  0.06076688  0.06366131 -0.03113133 -0.06613523  0.07356571
   0.06505363  0.00751065  0.02706065  0.06118593 -0.08459453 -0.05586729
   0.12872213 -0.02909402 -0.071957   -0.00684902  0.0548784  -0.07346763
  -0.09493774 -0.06481383 -0.08488901  0.05296412  0.0423707   0.08750881
  -0.02373612  0.06513753 -0.01547156 -0.07664096 -0.01115914  0.11003086
  -0.08134507 -0.05298244 -0.08254381 -0.04761319 -0.0085787  -0.02501148
   0.11579509 -0.03822134  0.08364971 -0.06851368 -0.07431819 -0.11657837
  -0.01664385 -0.04384599 -0.05821709  0.05566616  0.03106859  0.06425551
   0.04265103  0.

In [10]:
import torch
import numpy as np
from torch import nn, optim
import torch.nn.functional as F
from torch.distributions import Categorical
from transformers import AutoTokenizer, AutoModel
from itertools import combinations
import math

# --- PART 1: REWARD CALCULATION (Adapted from your provided code) ---
# This section remains the same, as it's our "environment"

class MockCfgNode:
    def __init__(self, d=None):
        if d is None: d = {}
        for k, v in d.items(): setattr(self, k, MockCfgNode(v) if isinstance(v, dict) else v)

CHARACTERS_MAPS = {'DNA': {'A': [1.,0.,0.,0.], 'C': [0.,1.,0.,0.], 'G': [0.,0.,1.,0.], 'T': [0.,0.,0.,1.], 'N': [1.,1.,1.,1.]}}

class EvolutionModelTorch(nn.Module):
    def __init__(self, model_name='JC69'):
        super().__init__()
        self.Q = torch.tensor([[-3,1,1,1],[1,-3,1,1],[1,1,-3,1],[1,1,1,-3]], dtype=torch.float32) / 4.0
        self.pi = torch.tensor([0.25, 0.25, 0.25, 0.25], dtype=torch.float32)
    def get_transition_matrix(self, t):
        return torch.matrix_exp(self.Q * t.item())
    def compute_partial_prob(self, data, at_root):
        child_probs = [torch.matmul(p.squeeze(0), self.get_transition_matrix(bl).T) for p, bl in data]
        merged_likelihoods = child_probs[0]
        for i in range(1, len(child_probs)): merged_likelihoods *= child_probs[i]
        merged_likelihoods = merged_likelihoods.unsqueeze(0)
        if at_root:
            likelihood_per_site = torch.sum(self.pi * merged_likelihoods.squeeze(0), dim=1)
            # Add a small epsilon to prevent log(0)
            log_score = torch.sum(torch.log(likelihood_per_site + 1e-40))
            return merged_likelihoods, log_score
        return merged_likelihoods, None

class PhylogenticTreeEnv(nn.Module):
    def __init__(self, cfg, sequences, device):
        super(PhylogenticTreeEnv, self).__init__()
        self.device = device
        self.evolution_model = EvolutionModelTorch().to(device)
        self.chars_dict = CHARACTERS_MAPS[cfg.ENV.SEQUENCE_TYPE]
        seq_arrays = np.array([np.array([self.chars_dict[c] for c in s]) for s in sequences])
        self.seq_arrays = torch.nn.Parameter(torch.tensor(seq_arrays, dtype=torch.float32), requires_grad=False).to(device)
    def compute_tree_log_score(self, ete_tree):
        from ete3 import TreeNode # Import locally
        feature_dict = {}
        # Ensure all nodes have names
        internal_node_counter = self.seq_arrays.shape[0]
        for node in ete_tree.traverse("preorder"):
            if not node.is_leaf() and not node.name:
                node.name = str(internal_node_counter)
                internal_node_counter += 1
        
        for node in ete_tree.traverse("postorder"):
            node_id = int(node.name)
            if node.is_leaf():
                feature_dict[node_id] = self.seq_arrays[node_id].unsqueeze(0)
            else:
                child_data = [[feature_dict[int(c.name)], torch.tensor([c.dist], device=self.device)] for c in node.children]
                feature, log_score = self.evolution_model.compute_partial_prob(child_data, node.is_root())
                feature_dict[node_id] = feature
                if node.is_root(): return log_score
        return torch.tensor(float('-inf')) # Should not be reached

def get_reward(newick_string, sequences, env):
    from ete3 import TreeNode # Import locally
    if not newick_string or not newick_string.endswith(';'): return -math.inf, "Invalid Newick"
    try:
        tree = TreeNode(newick_string)
        # Check if the tree contains all leaves
        leaf_names_in_tree = set(tree.get_leaf_names())
        expected_leaf_names = set(str(i) for i in range(len(sequences)))
        if leaf_names_in_tree != expected_leaf_names:
            return -math.inf, "Tree missing leaves"
        ll = env.compute_tree_log_score(tree)
        return ll.item(), "OK"
    except Exception as e:
        return -math.inf, f"Reward Error: {e}"

# --- PART 2: THE REINFORCEMENT LEARNING AGENT ---

class TreeBuilder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim=128):
        super(TreeBuilder, self).__init__()
        # Network to score pairs for merging
        self.pair_scorer = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        # Network to predict branch lengths
        self.branch_predictor = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2), # Predicts 2 branch lengths (child1, child2)
            nn.Sigmoid() # Scale branch lengths between 0 and 1
        )
        # Network to create the embedding for the new parent node
        self.parent_embedding_creator = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim)
        )

    def forward(self, node_embeddings):
        # Create all possible pairs of nodes
        indices = list(combinations(range(len(node_embeddings)), 2))
        emb1_indices, emb2_indices = zip(*indices)
        
        emb1 = node_embeddings[list(emb1_indices)]
        emb2 = node_embeddings[list(emb2_indices)]
        
        # Concatenate pairs
        concatenated_pairs = torch.cat((emb1, emb2), dim=1)
        
        # Get scores (logits) for each pair
        pair_logits = self.pair_scorer(concatenated_pairs).squeeze(-1)
        
        return pair_logits, indices

# --- PART 3: TRAINING & INFERENCE SCRIPT ---

def get_dna_embeddings(sequences, model, tokenizer, device):
    all_embeddings = []
    with torch.no_grad():
        for seq in sequences:
            inputs = tokenizer(seq, return_tensors='pt')["input_ids"].to(device)
            hidden_states = model(inputs)[0]
            all_embeddings.append(hidden_states.mean(dim=1))
    return torch.cat(all_embeddings, dim=0)

def run_training():
    # --- Setup ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    DNA_SEQUENCES = ["AGAACT", "AGATGT", "CGAACT", "CGATGT"]
    n_leaves = len(DNA_SEQUENCES)
    
    # Hyperparameters
    epochs = 5000
    learning_rate = 1e-4
    reward_baseline = -35.0 # Helps stabilize training. Start with a reasonable guess.

    # --- Pre-compute embeddings ---
    print("Pre-computing DNABERT embeddings...")
    dnabert_model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True).to(device)
    dnabert_model.eval()
    dnabert_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
    leaf_embeddings = get_dna_embeddings(DNA_SEQUENCES, dnabert_model, dnabert_tokenizer, device)
    embedding_dim = leaf_embeddings.shape[1]
    
    # --- Initialize models and optimizer ---
    agent = TreeBuilder(embedding_dim).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=learning_rate)
    reward_env = PhylogenticTreeEnv(MockCfgNode({'ENV': {'SEQUENCE_TYPE': 'DNA'}}), DNA_SEQUENCES, device)
    
    print("--- Starting Reinforcement Learning Training ---\n")
    
    for epoch in range(epochs):
        # --- Run one episode (build one tree) ---
        
        # Reset for the new episode
        active_nodes = {i: str(i) for i in range(n_leaves)} # Map index to Newick string
        active_embeddings = leaf_embeddings.clone()
        
        all_log_probs = []

        # Build the tree step-by-step
        for _ in range(n_leaves - 1):
            pair_logits, pair_indices = agent(active_embeddings)
            
            # Convert logits to a probability distribution and sample an action
            action_distribution = Categorical(logits=pair_logits)
            chosen_pair_idx = action_distribution.sample()
            
            # Store the log probability of the chosen action
            all_log_probs.append(action_distribution.log_prob(chosen_pair_idx))
            
            # Get the indices of the nodes to merge
            idx1_in_active, idx2_in_active = pair_indices[chosen_pair_idx]
            
            # Get embeddings for the chosen pair
            emb1 = active_embeddings[idx1_in_active].unsqueeze(0)
            emb2 = active_embeddings[idx2_in_active].unsqueeze(0)
            
            # Predict branch lengths and create parent embedding
            with torch.no_grad():
                branch_lengths = agent.branch_predictor(torch.cat((emb1, emb2), dim=1)).squeeze()
                bl1, bl2 = branch_lengths[0].item(), branch_lengths[1].item()
                
                parent_embedding = agent.parent_embedding_creator(torch.cat((emb1, emb2), dim=1))

            # Update active nodes and Newick strings
            node1_newick = active_nodes.pop(list(active_nodes.keys())[idx1_in_active])
            # The second index shifts after the first pop
            node2_newick = active_nodes.pop(list(active_nodes.keys())[idx2_in_active-1])
            
            new_node_newick = f"({node1_newick}:{bl1:.4f},{node2_newick}:{bl2:.4f})"
            new_node_id = max(active_nodes.keys()) + 1 if active_nodes else 0
            active_nodes[new_node_id] = new_node_newick
            
            # Update active embeddings
            remaining_indices = [i for i in range(len(active_embeddings)) if i not in (idx1_in_active, idx2_in_active)]
            active_embeddings = torch.cat((active_embeddings[remaining_indices], parent_embedding), dim=0)

        # --- Get Reward ---
        final_newick = list(active_nodes.values())[0] + ";"
        reward, status = get_reward(final_newick, DNA_SEQUENCES, reward_env)

        # --- Calculate Loss and Update Policy ---
        if status == "OK":
            # REINFORCE loss: -log_prob * (reward - baseline)
            # We want to maximize reward, so we minimize the negative
            policy_loss = -torch.sum(torch.stack(all_log_probs)) * (reward - reward_baseline)
            
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()
            
            # Update baseline with moving average
            reward_baseline = 0.9 * reward_baseline + 0.1 * reward
        
        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Reward: {reward:.4f}, Baseline: {reward_baseline:.4f}, Tree: {final_newick}")

    # --- Inference: Build one final tree greedily ---
    print("\n--- Training Finished. Generating final tree greedily ---")
    agent.eval()
    active_nodes = {i: str(i) for i in range(n_leaves)}
    active_embeddings = leaf_embeddings.clone()
    for _ in range(n_leaves - 1):
        with torch.no_grad():
            pair_logits, pair_indices = agent(active_embeddings)
            # Greedily choose the best action
            chosen_pair_idx = torch.argmax(pair_logits)
            idx1_in_active, idx2_in_active = pair_indices[chosen_pair_idx]
            
            emb1 = active_embeddings[idx1_in_active].unsqueeze(0)
            emb2 = active_embeddings[idx2_in_active].unsqueeze(0)
            branch_lengths = agent.branch_predictor(torch.cat((emb1, emb2), dim=1)).squeeze()
            bl1, bl2 = branch_lengths[0].item(), branch_lengths[1].item()
            parent_embedding = agent.parent_embedding_creator(torch.cat((emb1, emb2), dim=1))
        
        node1_newick = active_nodes.pop(list(active_nodes.keys())[idx1_in_active])
        node2_newick = active_nodes.pop(list(active_nodes.keys())[idx2_in_active-1])
        new_node_newick = f"({node1_newick}:{bl1:.4f},{node2_newick}:{bl2:.4f})"
        new_node_id = max(active_nodes.keys()) + 1 if active_nodes else 0
        active_nodes[new_node_id] = new_node_newick
        
        remaining_indices = [i for i in range(len(active_embeddings)) if i not in (idx1_in_active, idx2_in_active)]
        active_embeddings = torch.cat((active_embeddings[remaining_indices], parent_embedding), dim=0)
    
    final_newick = list(active_nodes.values())[0] + ";"
    final_reward, _ = get_reward(final_newick, DNA_SEQUENCES, reward_env)
    print(f"Final Greedy Tree: {final_newick}")
    print(f"Final Log-Likelihood: {final_reward:.4f}")


if __name__ == '__main__':
    #run_training()
    print("Training function is defined. Uncomment run_training() to execute.")

Training function is defined. Uncomment run_training() to execute.


In [20]:
import torch
import numpy as np
from torch import nn, optim
import torch.nn.functional as F
from torch.distributions import Categorical
from transformers import AutoTokenizer, AutoModel
from itertools import combinations
import math

# --- PART 1: REWARD CALCULATION (UNCHANGED, BUT WITH DEBUGGING ADDED) ---

# --- PART 1: REWARD CALCULATION (CORRECTED) ---

# ... (MockCfgNode and CHARACTERS_MAPS are the same) ...

class EvolutionModelTorch(nn.Module):
    """
    This class now uses register_buffer to ensure its tensors (Q matrix, pi)
    are moved to the correct device when .to(device) is called.
    """
    def __init__(self, model_name='JC69'):
        super().__init__()
        # Use register_buffer to make these part of the module's state
        q_matrix = torch.tensor([[-3,1,1,1],[1,-3,1,1],[1,1,-3,1],[1,1,1,-3]], dtype=torch.float32) / 4.0
        pi_vector = torch.tensor([0.25, 0.25, 0.25, 0.25], dtype=torch.float32)
        self.register_buffer('Q', q_matrix)
        self.register_buffer('pi', pi_vector)

    def get_transition_matrix(self, t):
        # self.Q is now on the correct device
        return torch.matrix_exp(self.Q * t.item())
        
    def compute_partial_prob(self, data, at_root):
        # The matmul will now work as both tensors are on the same device
        child_probs = [torch.matmul(p.squeeze(0), self.get_transition_matrix(bl).T) for p, bl in data]
        merged_likelihoods = child_probs[0]
        for i in range(1, len(child_probs)):
            merged_likelihoods *= child_probs[i]
        merged_likelihoods = merged_likelihoods.unsqueeze(0)
        
        if at_root:
            likelihood_per_site = torch.sum(self.pi * merged_likelihoods.squeeze(0), dim=1)
            log_score = torch.sum(torch.log(likelihood_per_site + 1e-40))
            return merged_likelihoods, log_score
            
        return merged_likelihoods, None

class PhylogenticTreeEnv(nn.Module):
    def __init__(self, cfg, sequences, device):
        super(PhylogenticTreeEnv, self).__init__()
        self.device = device # Store the device
        # This will now correctly move the Q matrix and pi to the specified device
        self.evolution_model = EvolutionModelTorch(cfg.ENV.EVOLUTION_MODEL).to(device)
        self.chars_dict = CHARACTERS_MAPS[cfg.ENV.SEQUENCE_TYPE]
        seq_arrays = np.array([np.array([self.chars_dict[c] for c in s]) for s in sequences])
        self.seq_arrays = torch.nn.Parameter(torch.tensor(seq_arrays, dtype=torch.float32), requires_grad=False).to(device)

    def compute_tree_log_score(self, ete_tree):
        from ete3 import TreeNode
        feature_dict = {}
        internal_node_counter = self.seq_arrays.shape[0]
        for node in ete_tree.traverse("preorder"):
            if not node.is_leaf() and not node.name:
                node.name = str(internal_node_counter)
                internal_node_counter += 1
        
        for node in ete_tree.traverse("postorder"):
            node_id = int(node.name)
            if node.is_leaf():
                feature_dict[node_id] = self.seq_arrays[node_id].unsqueeze(0)
            else:
                # **FIX APPLIED HERE**: Create the branch length tensor on the correct device.
                child_data = [[feature_dict[int(c.name)], torch.tensor([c.dist], device=self.device)] for c in node.children]
                feature, log_score = self.evolution_model.compute_partial_prob(child_data, node.is_root())
                feature_dict[node_id] = feature
                if node.is_root():
                    return log_score
        return torch.tensor(float('-inf'))

def get_reward(newick_string, sequences, env):
    """Calculates the log-likelihood reward, now with error printing."""
    from ete3 import TreeNode
    if not newick_string or not newick_string.endswith(';'):
        return -math.inf, "Invalid Newick format"
    try:
        tree = TreeNode(newick_string)
        leaf_names_in_tree = set(tree.get_leaf_names())
        expected_leaf_names = set(str(i) for i in range(len(sequences)))
        if leaf_names_in_tree != expected_leaf_names:
            return -math.inf, f"Tree missing leaves. Expected {expected_leaf_names}, got {leaf_names_in_tree}"
        
        ll = env.compute_tree_log_score(tree)
        
        # Check for non-finite rewards
        if not math.isfinite(ll.item()):
             return -math.inf, "Log-likelihood is not finite"

        return ll.item(), "OK"
    except Exception as e:
        # **ADDED**: Print the specific error for easier debugging.
        print(f"[Debug] Reward Error: {e} for tree: {newick_string}") 
        return -math.inf, f"Reward Error: {e}"

# ... (The TreeBuilder class and get_dna_embeddings function are the same) ...


# --- PART 3: TRAINING & INFERENCE SCRIPT (WITH CORRECTIONS) ---

def run_training(DNA_SEQUENCES=None):
    # --- Setup ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if DNA_SEQUENCES is None:
        DNA_SEQUENCES = ["AGAACT", "AGATGT", "CGAACT", "CGATGT"]
    n_leaves = len(DNA_SEQUENCES)
    
    # Hyperparameters
    epochs = 5000
    learning_rate = 1e-4
    # **MODIFIED**: Initialize baseline as None. We'll set it to the first valid reward.
    reward_baseline = None 

    # --- Pre-compute embeddings ---
    print("Pre-computing DNABERT embeddings...")
    dnabert_model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True).to(device)
    dnabert_model.eval()
    dnabert_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
    leaf_embeddings = get_dna_embeddings(DNA_SEQUENCES, dnabert_model, dnabert_tokenizer, device)
    embedding_dim = leaf_embeddings.shape[1]
    
    # --- Initialize models and optimizer ---
    agent = TreeBuilder(embedding_dim).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=learning_rate)
    cfg = MockCfgNode({
        'ENV': {
            'REWARD': {'C': 0, 'SCALE': 1.0},
            'SEQUENCE_TYPE': 'DNA',
            'EVOLUTION_MODEL': 'JC69'
        }
    })
    reward_env = PhylogenticTreeEnv(cfg, DNA_SEQUENCES, device)
    
    print("--- Starting Reinforcement Learning Training ---\n")
    
    for epoch in range(epochs):
        agent.train() # Set agent to training mode
        active_nodes = {i: str(i) for i in range(n_leaves)}
        active_embeddings = leaf_embeddings.clone()
        all_log_probs = []

        for _ in range(n_leaves - 1):
            pair_logits, pair_indices = agent(active_embeddings)
            action_distribution = Categorical(logits=pair_logits)
            chosen_pair_idx = action_distribution.sample()
            all_log_probs.append(action_distribution.log_prob(chosen_pair_idx))
            
            idx1_in_active, idx2_in_active = pair_indices[chosen_pair_idx]
            
            emb1 = active_embeddings[idx1_in_active].unsqueeze(0)
            emb2 = active_embeddings[idx2_in_active].unsqueeze(0)
            
            # The branch predictor is now used inside the training loop to apply scaling
            branch_lengths_raw = agent.branch_predictor(torch.cat((emb1, emb2), dim=1)).squeeze()
            
            # **FIX APPLIED HERE**: Rescale and add a minimum value (epsilon)
            # This prevents branch lengths from being too close to zero.
            bl1 = branch_lengths_raw[0].item() * 0.5 + 0.01 
            bl2 = branch_lengths_raw[1].item() * 0.5 + 0.01

            parent_embedding = agent.parent_embedding_creator(torch.cat((emb1, emb2), dim=1))

            node1_newick = active_nodes.pop(list(active_nodes.keys())[idx1_in_active])
            node2_newick = active_nodes.pop(list(active_nodes.keys())[idx2_in_active-1 if idx2_in_active > idx1_in_active else idx2_in_active])
            
            new_node_newick = f"({node1_newick}:{bl1:.4f},{node2_newick}:{bl2:.4f})"
            new_node_id = max(active_nodes.keys()) + 1 if active_nodes else 0
            active_nodes[new_node_id] = new_node_newick
            
            remaining_indices = sorted([i for i in range(len(active_embeddings)) if i not in (idx1_in_active, idx2_in_active)])
            active_embeddings = torch.cat((active_embeddings[remaining_indices], parent_embedding), dim=0)

        final_newick = list(active_nodes.values())[0] + ";"
        reward, status = get_reward(final_newick, DNA_SEQUENCES, reward_env)

        if status == "OK":
            # **MODIFIED**: Smart baseline initialization
            if reward_baseline is None:
                reward_baseline = reward
            
            policy_loss = -torch.sum(torch.stack(all_log_probs)) * (reward - reward_baseline)
            
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()
            
            reward_baseline = 0.95 * reward_baseline + 0.05 * reward
        
        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Reward: {reward:.4f}, Baseline: {reward_baseline or -1:.4f}")

    # ... (Inference part remains the same, but also needs the branch length fix) ...
    print("\n--- Training Finished. Generating final tree greedily ---")
    agent.eval()
    active_nodes = {i: str(i) for i in range(n_leaves)}
    active_embeddings = leaf_embeddings.clone()
    for _ in range(n_leaves - 1):
        with torch.no_grad():
            pair_logits, pair_indices = agent(active_embeddings)
            chosen_pair_idx = torch.argmax(pair_logits)
            idx1_in_active, idx2_in_active = pair_indices[chosen_pair_idx]
            
            emb1 = active_embeddings[idx1_in_active].unsqueeze(0)
            emb2 = active_embeddings[idx2_in_active].unsqueeze(0)
            branch_lengths_raw = agent.branch_predictor(torch.cat((emb1, emb2), dim=1)).squeeze()
            
            # **FIX APPLIED HERE TOO**
            bl1 = branch_lengths_raw[0].item() * 0.5 + 0.01
            bl2 = branch_lengths_raw[1].item() * 0.5 + 0.01

            parent_embedding = agent.parent_embedding_creator(torch.cat((emb1, emb2), dim=1))
        
        node1_newick = active_nodes.pop(list(active_nodes.keys())[idx1_in_active])
        node2_newick = active_nodes.pop(list(active_nodes.keys())[idx2_in_active-1 if idx2_in_active > idx1_in_active else idx2_in_active])
        new_node_newick = f"({node1_newick}:{bl1:.4f},{node2_newick}:{bl2:.4f})"
        new_node_id = max(active_nodes.keys()) + 1 if active_nodes else 0
        active_nodes[new_node_id] = new_node_newick
        
        remaining_indices = sorted([i for i in range(len(active_embeddings)) if i not in (idx1_in_active, idx2_in_active)])
        active_embeddings = torch.cat((active_embeddings[remaining_indices], parent_embedding), dim=0)
    
    final_newick = list(active_nodes.values())[0] + ";"
    final_reward, _ = get_reward(final_newick, DNA_SEQUENCES, reward_env)
    print(f"Final Greedy Tree: {final_newick}")
    print(f"Final Log-Likelihood: {final_reward:.4f}")


if __name__ == '__main__':
    dnaseq = [dnadata[x] for x in dnakeys]
    print("--- Running the complete training and inference pipeline ---")
    print("length of DNA sequences:", len(dnaseq))
    print("Running training with DNA sequences:", dnaseq)
    run_training(DNA_SEQUENCES= dnaseq)

--- Running the complete training and inference pipeline ---
length of DNA sequences: 27
Running training with DNA sequences: ['--CCTGGTTGATCCTGCCAGTAGCATA-GCTTGTCTCAAAGATTAAGCCATGCATGTCTAAGTACACACGGCCGGTACAGTGAAACTGCGAATGGCTCATTAAATCAGTTATGGTTCCTT--GTCGCTCCAACCGT---TACTTGGATAACTGTGGT--TTCTA-AGCTAATACATGCCGACGAGCGCTGACCT-C-----------GGGGAT-CGTG-ATTTATCAGACCAAAACCAACGGGCTCGCCCGGCC----------------------------------------------------------------------------------------------GCT-TGGTGACTCTAGATAACC-CGGGCCGATCGCA-GC-CC-CGTGGCGGCGACGACGCATTCGAATGTCT-CCCTATCAACTTTC-ATGGTACTTTCTGTGCCTACCATGGTGACC-CGGGTA-CGGGGA-TCA-GGTTCGATTC-GG-GAGGGA-CCTGAGAAACGGCTACCACATCC-AGGA-GGCAGCA-GCGCG---ATTACCCACTCCCGAC--GGGGA--TAGT-AC-AAAAATAACAATACAGGACTCT---GAGGCCCTGT-ATTGGAATGAGTACACTTTAAATCCTTTAACGAGGA-C-ATTGGAGGGC-AG------------------------------------------------------------------GC-CG-AG---GA-C-TG----------GGATCGAGC--G--GTCCGCCGCGAG-CGACGTACCGCC-GTCCC--GCCCCC-G--TCTCGGCGC-CCCTTGATGCTCTT-AC--AGTGTCCTG--GG-GTCCGA

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: '-'