In [1]:
import onnxruntime_genai as og
from difflib import SequenceMatcher

def calculate_sequence_similarity(seq1, seq2):
    """Return the difflib similarity ratio between two sequences.

    The score is ``SequenceMatcher.ratio()``: 2 * matches / (len(seq1) + len(seq2)),
    a float between 0.0 (nothing in common) and 1.0 (identical sequences).
    No junk predicate is supplied.
    """
    matcher = SequenceMatcher(isjunk=None, a=seq1, b=seq2)
    return matcher.ratio()

def calculate_token_accuracy(predicted_tokens, expected_tokens):
    """Return the fraction of expected tokens matched position-by-position.

    Tokens are compared pairwise from the start of both sequences; comparison
    stops at the shorter one, so missing predictions count as misses and
    surplus predictions are ignored. The denominator is always
    ``len(expected_tokens)``.

    Args:
        predicted_tokens: Sequence of predicted token IDs.
        expected_tokens: Sequence of reference token IDs.

    Returns:
        Accuracy in the range [0, 1]. Returns 0.0 when ``expected_tokens`` is
        empty instead of raising ``ZeroDivisionError``.
    """
    # Use len() rather than truthiness so array-like inputs work as well.
    total = len(expected_tokens)
    if total == 0:
        # Nothing to match against: define accuracy as 0.0 rather than divide by zero.
        return 0.0

    correct = sum(p == e for p, e in zip(predicted_tokens, expected_tokens))
    return correct / total

def evaluate_model_without_logits(model, tokenizer, prompt, answers, expected_tokens):
    """Score the model's generated token IDs for each candidate answer.

    For every entry in ``answers`` the text ``prompt + ' ' + answer`` is
    tokenized and passed to ``model.generate``; the first returned sequence is
    then compared with ``expected_tokens[i]`` using token-level accuracy and
    a difflib-based sequence similarity.

    Args:
        model: Model object accepted by ``og.GeneratorParams`` and exposing
            ``generate(params)``.
        tokenizer: Tokenizer exposing ``encode(text)``.
        prompt: Prompt text placed before each answer.
        answers: Candidate answer strings.
        expected_tokens: Reference token sequences, indexed in parallel with
            ``answers``.

    Returns:
        A list with one dict per answer, holding the keys ``'answer'``,
        ``'token_accuracy'``, ``'sequence_similarity'`` and
        ``'predicted_tokens'``.
    """
    evaluation_results = []

    for i, answer in enumerate(answers):
        # The model is fed the prompt and the candidate answer as one text.
        full_input = prompt + ' ' + answer
        input_tokens = tokenizer.encode(full_input)

        params = og.GeneratorParams(model)
        params.input_ids = input_tokens

        output = model.generate(params)

        # Assumes the first element of the output is a sequence of token IDs.
        # NOTE(review): in the recorded run the predicted tokens appear to begin
        # with the input tokens, so both metrics compare a prompt-prefixed
        # sequence against the answer-only reference -- confirm whether the
        # input prefix should be stripped before scoring.
        token_predictions = output[0]
        reference_tokens = expected_tokens[i]

        token_accuracy = calculate_token_accuracy(token_predictions, reference_tokens)
        sequence_similarity = calculate_sequence_similarity(token_predictions, reference_tokens)

        evaluation_results.append({
            'answer': answer,
            'token_accuracy': token_accuracy,
            'sequence_similarity': sequence_similarity,
            'predicted_tokens': token_predictions,
        })

    return evaluation_results

# Example usage: load the model and its tokenizer from the local ONNX build.
model = og.Model('onnx-built-models/phi-3-mini-4k-instruct-onnx-cpu')
tokenizer = og.Tokenizer(model)

prompt = "What is the capital of France?"
answers = ["A. Paris", "B. London", "C. Rome", "D. Madrid"]

# Reference token IDs: one encoding per candidate answer, in the same order.
expected_tokens = [tokenizer.encode(choice) for choice in answers]

evaluation_results = evaluate_model_without_logits(
    model, tokenizer, prompt, answers, expected_tokens
)

# Report each answer's metrics as percentages, followed by a blank separator line.
for result in evaluation_results:
    print(
        f"Answer: {result['answer']}",
        f"Token Accuracy: {result['token_accuracy'] * 100:.2f}%",
        f"Sequence Similarity: {result['sequence_similarity'] * 100:.2f}%",
        f"Predicted Tokens: {result['predicted_tokens']}",
        "",
        sep="\n",
    )


Answer: A. Paris
Token Accuracy: 25.00%
Sequence Similarity: 32.00%
Predicted Tokens: [1, 1724, 338, 278, 7483, 310, 3444, 29973, 319, 29889, 3681, 350, 29889, 4517, 315, 29889, 5115, 360, 29889, 9669, 32000]

Answer: B. London
Token Accuracy: 25.00%
Sequence Similarity: 0.27%
Predicted Tokens: [1, 1724, 338, 278, 7483, 310, 3444, 29973, 350, 29889, 4517, 315, 29889, 3681, 360, 29889, 5115, 382, 29889, 9669, 13, 13, 8809, 436, 310, 278, 1494, 338, 451, 263, 7601, 2927, 29973, 319, 29889, 4367, 350, 29889, 7646, 315, 29889, 10924, 360, 29889, 26048, 382, 29889, 612, 4743, 13, 13, 5618, 338, 278, 10150, 286, 4850, 284, 297, 278, 3186, 29973, 319, 29889, 11715, 8317, 561, 424, 350, 29889, 10924, 806, 744, 315, 29889, 402, 3055, 17615, 360, 29889, 2043, 279, 23606, 382, 29889, 4406, 424, 349, 5863, 13, 13, 5618, 338, 278, 1045, 6504, 1298, 310, 4094, 472, 7205, 3233, 29973, 319, 29889, 29871, 29896, 29900, 29900, 30073, 29907, 350, 29889, 29871, 29929, 29900, 30073, 1783, 29889, 29871, 299