In [2]:
import onnxruntime_genai as og
from difflib import SequenceMatcher


In [3]:

def calculate_sequence_similarity(seq1, seq2):
    """Return the difflib similarity ratio between two sequences.

    Args:
        seq1: First sequence of hashable items (e.g. token IDs or characters).
        seq2: Second sequence of hashable items.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the pair, computed with
        no junk filter: a float in [0, 1], where 1.0 means the sequences are
        identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=seq1, b=seq2)
    return matcher.ratio()

def calculate_token_accuracy(predicted_tokens, expected_tokens):
    """Calculate position-wise token accuracy against a reference sequence.

    Tokens are compared pairwise by position. The number of matching
    positions is divided by the length of ``expected_tokens``, so predictions
    shorter than the reference are penalised and any predicted tokens beyond
    the reference length are ignored.

    Args:
        predicted_tokens: Iterable of predicted tokens.
        expected_tokens: Sized sequence of reference tokens.

    Returns:
        The fraction of reference positions that were predicted correctly.
        When ``expected_tokens`` is empty the result is 1.0 if
        ``predicted_tokens`` is empty too and 0.0 otherwise (the same
        convention ``SequenceMatcher.ratio()`` uses), instead of raising
        ``ZeroDivisionError``.
    """
    total = len(expected_tokens)
    if total == 0:
        # No reference positions to score: avoid dividing by zero. Iterating
        # (rather than calling len/bool) keeps this safe for any iterable.
        has_predictions = any(True for _ in predicted_tokens)
        return 0.0 if has_predictions else 1.0

    # zip stops at the shorter input, so only overlapping positions can match.
    correct = sum(p == e for p, e in zip(predicted_tokens, expected_tokens))
    return correct / total

def evaluate_model_without_logits(model, tokenizer, prompt, answers, expected_tokens):
    """Score the model's generated tokens against a reference for each answer.

    For every candidate answer, the prompt and the answer are joined with a
    single space, tokenized, and passed to ``model.generate``. The first
    returned sequence is then compared with ``expected_tokens[i]`` using
    ``calculate_token_accuracy`` and ``calculate_sequence_similarity``.

    Args:
        model: Model passed to ``og.GeneratorParams`` and used for ``generate``.
        tokenizer: Object whose ``encode`` turns text into input token IDs.
        prompt: Text placed in front of every answer.
        answers: Iterable of candidate answer strings.
        expected_tokens: Indexable collection where ``expected_tokens[i]`` is
            the reference token sequence for the i-th answer.

    Returns:
        A list with one dict per answer, in input order, holding the keys
        ``'answer'``, ``'token_accuracy'``, ``'sequence_similarity'`` and
        ``'predicted_tokens'``.

    NOTE(review): the sequence returned by ``generate`` may begin with the
    input tokens (prompt + answer). If it does, both metrics compare the
    echoed prompt against the reference rather than newly generated tokens.
    Confirm against the onnxruntime-genai version in use and slice the output
    if that is not the intent.
    """
    evaluation_results = []

    for i, answer in enumerate(answers):
        # Build the model input: the prompt followed by this candidate answer.
        full_input = prompt + ' ' + answer
        input_tokens = tokenizer.encode(full_input)

        params = og.GeneratorParams(model)
        params.input_ids = input_tokens

        # Only the first returned sequence is scored (assumed to be token IDs).
        token_predictions = model.generate(params)[0]
        reference_tokens = expected_tokens[i]

        # Position-wise match rate against the reference tokens.
        token_accuracy = calculate_token_accuracy(token_predictions, reference_tokens)

        # difflib matching-block ratio (not an edit distance).
        sequence_similarity = calculate_sequence_similarity(token_predictions, reference_tokens)

        evaluation_results.append({
            'answer': answer,
            'token_accuracy': token_accuracy,
            'sequence_similarity': sequence_similarity,
            'predicted_tokens': token_predictions
        })

    return evaluation_results


In [4]:

# Example usage: load the ONNX Runtime GenAI model and the tokenizer built from it.
# MODEL_DIR is a relative path; point it at another model folder to evaluate
# a different build without touching the loading code.
MODEL_DIR = 'onnx-built-models/phi-3-mini-4k-instruct-onnx-cpu'
model = og.Model(MODEL_DIR)
tokenizer = og.Tokenizer(model)


In [None]:
%%time

prompt = "What is the capital of France?"
answers = ["A. Paris", "B. London", "C. Rome", "D. Madrid"]

# Reference tokens: each answer string encoded with the same tokenizer.
# Derived from `answers` so the two lists cannot drift out of sync.
expected_tokens = [tokenizer.encode(answer) for answer in answers]

evaluation_results = evaluate_model_without_logits(model, tokenizer, prompt, answers, expected_tokens)

# Report both metrics as percentages, followed by the raw predicted token IDs.
for result in evaluation_results:
    print(f"Answer: {result['answer']}")
    print(f"Token Accuracy: {result['token_accuracy'] * 100:.2f}%")
    print(f"Sequence Similarity: {result['sequence_similarity'] * 100:.2f}%")
    print(f"Predicted Tokens: {result['predicted_tokens']}")
    print()
