# This notebook finds models on OpenRouter that return real top-20 logprobs

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from anycache import anycache

import dotenv
import os
import numpy as np


import json
import sys
from loguru import logger

# Read a local .env file (if any) into the process environment, then pick up
# the OpenRouter key from there. The value is None when the variable is unset.
dotenv.load_dotenv()
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Drop the loguru sinks registered so far and log INFO and above to stderr.
logger.remove()
logger.add(sys.stderr, level="INFO")


First, let's get the models from the top providers that support logprobs.

In [3]:
import requests

# @anycache(cachedir="../.anycache2")
def get_openrouter_models(timeout=30):
    """Download the OpenRouter model catalogue and return it as a DataFrame.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, a stalled connection would block the notebook indefinitely.

    Returns:
        A ``pd.DataFrame`` built from the ``data`` entry of the JSON response.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` when the server
            answers with an error status.
        KeyError: when the JSON payload has no ``data`` entry.
    """
    url = "https://openrouter.ai/api/v1/models"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    df_models = pd.DataFrame.from_dict(data['data'])
    return df_models


# Fetch the catalogue, then flatten the nested `pricing` and `architecture`
# dict columns into prefixed top-level columns. Pricing values are coerced to
# numbers; anything unparseable becomes NaN.
df_models = get_openrouter_models()

pricing_cols = (
    pd.DataFrame.from_records(df_models['pricing'].values)
    .rename(columns=lambda x: f"pricing_{x}")
    .apply(pd.to_numeric, errors='coerce')
)
architecture_cols = (
    pd.DataFrame.from_records(df_models['architecture'].values)
    .rename(columns=lambda x: f"architecture_{x}")
)
df_models = pd.concat([df_models, pricing_cols, architecture_cols], axis=1)
df_models = df_models.drop(columns=['pricing', 'architecture'])

# remove all free, too unreliable
df_models = df_models[df_models['pricing_prompt'] > 0].sort_values(by='pricing_prompt', ascending=False)

# text->text models only
# .copy() makes this an independent frame, so the column assignment below does
# not write into a filtered slice (which raises SettingWithCopyWarning).
df_models = df_models[df_models['architecture_modality'] == 'text->text'].copy()

# main authors: the part of the model id before the first '/'
df_models['author'] = df_models['id'].str.split('/').str[0]
print('all model authors, before filter', df_models['author'].value_counts().index.values)
author_whitelist = [
    'openai', 'x-ai', 'meta-llama', 'mistralai', 'qwen', 'deepseek', 'google',
    'mistral', 'perplexity', 'microsoft', 'nousresearch', 'cognitivecomputations',
    'allenai', 'moonshotai', 'cohere', 'anthropic', 'nvidia', 'eleutherai',
]
df_models = df_models[df_models['author'].isin(author_whitelist)]

# must have top_logprobs; a missing (non-collection) supported_parameters
# value counts as "not supported" instead of raising TypeError on `in`.
has_top_logprobs = df_models['supported_parameters'].apply(
    lambda params: isinstance(params, (list, tuple, set)) and 'top_logprobs' in params
)
df_models = df_models[has_top_logprobs].sort_values('created', ascending=False).copy()

# `created` is a unix timestamp in seconds
df_models['created'] = pd.to_datetime(df_models['created'], unit='s')

df_models

all model authors, before filter ['qwen' 'mistralai' 'openai' 'deepseek' 'meta-llama' 'microsoft'
 'nousresearch' 'thedrummer' 'x-ai' 'sao10k' 'cohere' 'z-ai' 'arcee-ai'
 'perplexity' 'nvidia' 'aion-labs' 'neversleep' 'moonshotai' 'google'
 'inception' 'morph' 'anthracite-org' 'inflection' 'ai21' 'liquid'
 'cognitivecomputations' 'tngtech' 'baidu' 'raifle' 'mancer' 'deepcogito'
 'alpindale' 'switchpoint' 'minimax' 'undi95' 'eleutherai' 'alfredpros'
 'relace' 'meituan' 'bytedance' 'allenai' 'gryphe' 'alibaba' 'shisa-ai'
 'thudm' 'amazon' 'tencent' 'arliai' 'agentica-org']


Unnamed: 0,id,canonical_slug,hugging_face_id,name,created,description,context_length,top_provider,per_request_limits,supported_parameters,...,pricing_internal_reasoning,pricing_input_cache_read,pricing_input_cache_write,pricing_audio,architecture_modality,architecture_input_modalities,architecture_output_modalities,architecture_tokenizer,architecture_instruct_type,author
2,deepseek/deepseek-v3.2-exp,deepseek/deepseek-v3.2-exp,deepseek-ai/DeepSeek-V3.2-Exp,DeepSeek: DeepSeek V3.2 Exp,2025-09-29 12:54:41,DeepSeek-V3.2-Exp is an experimental large lan...,163840,"{'context_length': 163840, 'max_completion_tok...",,"[frequency_penalty, include_reasoning, logit_b...",...,0.0,,,,text->text,[text],[text],DeepSeek,deepseek-v3.1,deepseek
12,deepseek/deepseek-v3.1-terminus,deepseek/deepseek-v3.1-terminus,deepseek-ai/DeepSeek-V3.1-Terminus,DeepSeek: DeepSeek V3.1 Terminus,2025-09-22 13:37:55,DeepSeek-V3.1 Terminus is an update to [DeepSe...,163840,"{'context_length': 163840, 'max_completion_tok...",,"[frequency_penalty, include_reasoning, logit_b...",...,0.0,,,,text->text,[text],[text],DeepSeek,deepseek-v3.1,deepseek
20,qwen/qwen3-next-80b-a3b-thinking,qwen/qwen3-next-80b-a3b-thinking-2509,Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen: Qwen3 Next 80B A3B Thinking,2025-09-11 17:38:04,Qwen3-Next-80B-A3B-Thinking is a reasoning-fir...,262144,"{'context_length': 262144, 'max_completion_tok...",,"[frequency_penalty, include_reasoning, logit_b...",...,0.0,,,,text->text,[text],[text],Qwen3,,qwen
21,qwen/qwen3-next-80b-a3b-instruct,qwen/qwen3-next-80b-a3b-instruct-2509,Qwen/Qwen3-Next-80B-A3B-Instruct,Qwen: Qwen3 Next 80B A3B Instruct,2025-09-11 17:36:53,Qwen3-Next-80B-A3B-Instruct is an instruction-...,262144,"{'context_length': 262144, 'max_completion_tok...",,"[frequency_penalty, logit_bias, logprobs, max_...",...,0.0,,,,text->text,[text],[text],Qwen3,,qwen
28,moonshotai/kimi-k2-0905,moonshotai/kimi-k2-0905,moonshotai/Kimi-K2-Instruct-0905,MoonshotAI: Kimi K2 0905,2025-09-04 21:25:47,Kimi K2 0905 is the September update of [Kimi ...,262144,"{'context_length': 262144, 'max_completion_tok...",,"[frequency_penalty, logit_bias, logprobs, max_...",...,0.0,,,,text->text,[text],[text],Other,,moonshotai
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,openai/gpt-3.5-turbo-instruct,openai/gpt-3.5-turbo-instruct,,OpenAI: GPT-3.5 Turbo Instruct,2023-09-28 00:00:00,This model is a variant of GPT-3.5 Turbo tuned...,4095,"{'context_length': 4095, 'max_completion_token...",,"[frequency_penalty, logit_bias, logprobs, max_...",...,0.0,,,,text->text,[text],[text],GPT,chatml,openai
324,openai/gpt-3.5-turbo-16k,openai/gpt-3.5-turbo-16k,,OpenAI: GPT-3.5 Turbo 16k,2023-08-28 00:00:00,This model offers four times the context lengt...,16385,"{'context_length': 16385, 'max_completion_toke...",,"[frequency_penalty, logit_bias, logprobs, max_...",...,0.0,,,,text->text,[text],[text],GPT,,openai
330,openai/gpt-4-0314,openai/gpt-4-0314,,OpenAI: GPT-4 (older v0314),2023-05-28 00:00:00,GPT-4-0314 is the first version of GPT-4 relea...,8191,"{'context_length': 8191, 'max_completion_token...",,"[frequency_penalty, logit_bias, logprobs, max_...",...,0.0,,,,text->text,[text],[text],GPT,,openai
329,openai/gpt-4,openai/gpt-4,,OpenAI: GPT-4,2023-05-28 00:00:00,"OpenAI's flagship model, GPT-4 is a large-scal...",8191,"{'context_length': 8191, 'max_completion_token...",,"[frequency_penalty, logit_bias, logprobs, max_...",...,0.0,,,,text->text,[text],[text],GPT,,openai


## Helpers

In [7]:
# from speechmap_morality.open_router.logprobs import get_top_logprobs_param, openrouter_completion_wlogprobs, get_logprobs, get_logprobs_choices
# from speechmap_morality.open_router.models import get_openrouter_models, get_logp_endpoints

from openrouter_wrapper.logprobs import get_top_logprobs_param, openrouter_completion_wlogprobs, get_logprobs, get_logprobs_choices
from openrouter_wrapper.models import get_openrouter_models, get_logp_endpoints

# For provider check cache and logprobs

Now find the providers that support logprobs, and check whether they return sensible logprobs.

In [None]:
from tqdm.auto import tqdm


ImportError: cannot import name 'format_rubric' from 'speechmap_morality.judge' (/media/wassname/SGIronWolf/projects5/speechmap_morality/src/speechmap_morality/judge.py)

In [9]:
# User turn: ask for a single digit, with any commentary deferred until after it.
prompt = """
Return a number between 0 and 9, inclusive. Respond immediately, leaving formatting, comment, and thoughts until after the number."""

# Pre-written reasoning block placed at the start of the assistant turn.
# The text is part of the prompt sent to the model, so its wording (spelling included) is kept verbatim.
think_prefill = (
    "<think>\n"
    "Okay, the user asked me to only return a number, so I should stop thinking and immediatly return a random number\n"
    "</think>\n\n"
)
# think_prefill = "<think>\n\n</think>\n\n"

# Start of the visible answer, left open for the model to continue.
assistant_prefill = "The number is: "


def _chat_with_prefill(prefill):
    """Return a two-turn chat: the user prompt, then a partial assistant turn holding `prefill`."""
    return [
        # {"role": "system", "content": "You are a helpful assistant who can simulate bash commands."},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": prefill},
    ]


# `messages` prefills the think block plus the answer stem; `messages2` only the answer stem.
messages = _chat_with_prefill(think_prefill + assistant_prefill)
messages2 = _chat_with_prefill(assistant_prefill)

In [10]:
# r_data['choices'][0]['logprobs']['content']
# completion = [t['token'] for t in r_data['choices'][0]['logprobs']['content']] if 'choices' in r_data and len(r_data['choices']) > 0 else []
# completion
# [t['token'] for t in r_data['choices'][0]['logprobs']['content']]

In [None]:
rdata= []
provider_errors = []



for model_id in tqdm(df_models['id'].values):
    print(f"Getting endpoints for {model_id}")
    df_end, data = get_logp_endpoints(model_id)
    df_end = df_end.to_pandas()
    df_end = df_end.query(
        """(price_prompt < 0.00001) and top_logprobs and logprobs and (uptime_last_30m>95)"""
    )
    provider_names = df_end.provider_name.unique().tolist()

    print(f"Model `{model_id}` has endpoints for providers: {provider_names}")
    for provider in provider_names:
        # print(f"Testing provider {provider} for model {model_id}")

        provider_error = ""
        try:
            r_data = openrouter_completion_wlogprobs(messages, model_id=model_id, provider_whitelist= [provider], max_completion_tokens=5)        
        except Exception as e:
            err = e
            provider_errors.append({
                "model_id": model_id,
                "provider": provider,
                "error": str(e)
            })
            
            try:
                errjs = e.response.json()['error']
                if 'Provider returned error' in errjs['message']:
                    logger.error(f"Provider error for {errjs}")
            except:
                # raise e
                logger.error(f"Error occurred for {provider}//{model_id}: e={e}")
            
            if hasattr(e, 'response') and e.response is not None:
                logger.error(f"Error occurred for {provider}//{model_id}: e={e}, status_code={e.response.status_code}")
                match e.response.status_code:
                    case 403:
                        logger.error(f"Model {model_id} is not available for provider {provider}. Skipping")
                        continue
                    case 404: # HTTPError('404 Client Error: Not Found for url: https://openrouter.ai/api/v1/chat/completions')
                        logger.error(f"Model {model_id} not found for provider {provider}. Skipping")
                        continue

            if isinstance(e, AssertionError):
                if "no logprobs capability" in str(e):
                    logger.error(f"Model {model_id} does not support logprobs for provider {provider}. Skipping")
                    continue
                else:
                    raise


            # if it's 429...
            if 'code' in str(e) and '429' in str(e):
                # how to make this a proper http error
                pass


            # logger.error(f"Error occurred for {provider}//{model_id}: e={e}")
            # try:
            #     logger.error(err.response.json()['error']['message'])
            # except Exception as e:
            #     pass

            continue

        choice_logprobs_permuted_d, ps_dict = get_logprobs_choices(r_data, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])    

        try:
            cost = r_data['usage']['cost']
        except Exception as e:
            logger.error(f"Error occurred for {provider}//{model_id} on cost: e={e}")
            # if we cannot get cost, we assume it's not available
            cost = np.nan

        cached_tokens = 0
        complete_choices2 = np.nan

        # do it second time to test caching
        try:
            r_data2 = openrouter_completion_wlogprobs(messages2, model_id=model_id, provider_whitelist= [provider], max_completion_tokens=5)
            choice_logprobs_permuted_d2, ps_dict2 = get_logprobs_choices(r_data2, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])      
            choice_logprobs2 = np.array(list(choice_logprobs_permuted_d2.values()))
            choice_prob2 = np.exp(choice_logprobs2).sum()
            complete_choices2 = (choice_logprobs2 > -1000).sum() / 10

            cached_tokens = r_data['usage']['prompt_tokens_details']['cached_tokens']
        except Exception as e:
            logger.error(f"Error occurred for 2nd run {provider}//{model_id} on second run: e={e}")

        choice_logprobs = np.array(list(choice_logprobs_permuted_d.values()))
        choice_prob = np.exp(choice_logprobs).sum()
        complete_choices = (choice_logprobs > -1000).sum() / 10

        completion = [t['token'] for t in r_data['choices'][0]['logprobs']['content']] if 'choices' in r_data and len(r_data['choices']) > 0 else []
        completion2 = [t['token'] for t in r_data2['choices'][0]['logprobs']['content']] if 'choices' in r_data2 and len(r_data2['choices']) > 0 else []
        print(f"Model {model_id} provider {provider} choice_prob={choice_prob}, completion={completion}, completion2={completion2}")

        
        if choice_prob < 0.1:
            err_msg = f"Low probability on {choice_prob} for {provider}//{model_id}. It didn't put much probability on our provided choices. Instead got {ps_dict}. This implies the model is not working well with this prompt setup, or it's a thinking model and it didn't finish thinking."
            logger.warning(err_msg)
            provider_errors.append({
                "model_id": model_id,
                "provider": provider,
                "error": err_msg
            })
            continue

        rdata.append({
            "model_id": model_id,
            "provider": provider,
            "prompt": prompt,
            # "choice_logprobs_permuted_d": choice_logprobs_permuted_d,
            "choice_logprobs": choice_logprobs,
            "complete_choices": complete_choices,
            "complete_choices2": complete_choices2,
            "ps_dict": list(ps_dict.keys()),
            "choice_prob": choice_prob,
            "cost": cost,
            "cached_tokens": cached_tokens,
            "completion": completion,
            "completion2": completion2,
        })

  0%|          | 0/67 [00:00<?, ?it/s]

Getting endpoints for deepseek/deepseek-v3.2-exp
Model `deepseek/deepseek-v3.2-exp` has endpoints for providers: ['DeepSeek']
Model deepseek/deepseek-v3.2-exp provider DeepSeek choice_prob=1.0, completion=['7'], completion2=['7']
Getting endpoints for deepseek/deepseek-v3.1-terminus
Model `deepseek/deepseek-v3.1-terminus` has endpoints for providers: ['Chutes']




Model deepseek/deepseek-v3.1-terminus provider Chutes choice_prob=0.0, completion=['import', ' React', ',', ' {'], completion2=['mas', 'urement', 'Tracker', ' =']
Getting endpoints for qwen/qwen3-next-80b-a3b-thinking
Model `qwen/qwen3-next-80b-a3b-thinking` has endpoints for providers: ['Chutes']


[32m2025-10-02 18:21:18.937[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [31m[1mError occurred for Chutes//qwen/qwen3-next-80b-a3b-thinking: e=('qwen/qwen3-next-80b-a3b-thinking has no logprobs capability', {'id': 'gen-1759400477-KZ7AVgvNQySS3kExjE6e', 'provider': 'Chutes', 'model': 'qwen/qwen3-next-80b-a3b-thinking', 'object': 'chat.completion', 'created': 1759400477, 'choices': [{'logprobs': None, 'finish_reason': 'length', 'native_finish_reason': 'length', 'index': 0, 'message': {'role': 'assistant', 'content': '', 'refusal': None, 'reasoning': 'Okay, the user wants', 'reasoning_details': [{'type': 'reasoning.text', 'text': 'Okay, the user wants', 'format': 'unknown', 'index': 0}]}}], 'usage': {'prompt_tokens': 77, 'completion_tokens': 5, 'total_tokens': 82, 'cost': 1.17e-05, 'is_byok': False, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'cost_details': {'upstream_inference_cost': None, 'upstream_inference_prompt_cost': 7.7e

Getting endpoints for qwen/qwen3-next-80b-a3b-instruct
Model `qwen/qwen3-next-80b-a3b-instruct` has endpoints for providers: ['Hyperbolic', 'Chutes']
Model qwen/qwen3-next-80b-a3b-instruct provider Hyperbolic choice_prob=0.9999993888101503, completion=['7'], completion2=['7']
Model qwen/qwen3-next-80b-a3b-instruct provider Chutes choice_prob=0.9999991226697079, completion=['7'], completion2=['7']
Getting endpoints for moonshotai/kimi-k2-0905
Model `moonshotai/kimi-k2-0905` has endpoints for providers: ['Fireworks', 'Chutes']
Model moonshotai/kimi-k2-0905 provider Fireworks choice_prob=0.9701212312700219, completion=[' **', '7', '**\n\n', '(', '7'], completion2=[' **', '7', '**.']
Model moonshotai/kimi-k2-0905 provider Chutes choice_prob=0.9867023475382645, completion=['7'], completion2=['7']
Getting endpoints for qwen/qwen3-30b-a3b-thinking-2507
Model `qwen/qwen3-30b-a3b-thinking-2507` has endpoints for providers: ['Chutes']


[32m2025-10-02 18:21:33.539[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [31m[1mError occurred for Chutes//qwen/qwen3-30b-a3b-thinking-2507: e=('qwen/qwen3-30b-a3b-thinking-2507 has no logprobs capability', {'id': 'gen-1759400491-9tKG2PRwKtGBQfF9yh8y', 'provider': 'Chutes', 'model': 'qwen/qwen3-30b-a3b-thinking-2507', 'object': 'chat.completion', 'created': 1759400491, 'choices': [{'logprobs': None, 'finish_reason': 'length', 'native_finish_reason': 'length', 'index': 0, 'message': {'role': 'assistant', 'content': '', 'refusal': None, 'reasoning': 'Okay, the user wants', 'reasoning_details': [{'type': 'reasoning.text', 'text': 'Okay, the user wants', 'format': 'unknown', 'index': 0}]}}], 'usage': {'prompt_tokens': 77, 'completion_tokens': 5, 'total_tokens': 82, 'cost': 7.61e-06, 'is_byok': False, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'cost_details': {'upstream_inference_cost': None, 'upstream_inference_prompt_cost': 6.16

Getting endpoints for x-ai/grok-code-fast-1
Model `x-ai/grok-code-fast-1` has endpoints for providers: ['xAI']
Model x-ai/grok-code-fast-1 provider xAI choice_prob=0.9999980676165939, completion=[' First', ',', ' the', ' user', ' said', ':', ' "', 'Return', ' a', ' number', ' between', ' ', '0', ' and', ' ', '9', ',', ' inclusive', '.', ' Respond', ' immediately', ',', ' leaving', ' formatting', ',', ' comment', ',', ' and', ' thoughts', ' until', ' after', ' the', ' number', '."\n\n', 'This', ' seems', ' like', ' a', ' test', ' to', ' see', ' if', ' I', ' can', ' respond', ' without', ' extra', ' commentary', '.', ' But', ' in', ' my', ' construction', ',', ' I', ' have', ' a', ' thinking', ' block', ',', ' and', ' then', ' a', ' response', '.\n\n', 'My', ' previous', ' response', ' in', ' the', ' simulation', ' was', ' to', ' think', ' and', ' then', ' say', ' "', 'The', ' number', ' is', ':"', ' but', ' actually', ',', ' I', ' need', ' to', ' respond', ' immediately', ' with', ' jus

In [None]:
# Show every recorded failure in full, ordered by provider.
# Passing `columns=` fixes the column set up front, so the cell still works
# when `provider_errors` is empty; a column-less frame would make
# sort_values('provider') raise KeyError.
df_err = pd.DataFrame(provider_errors, columns=['provider', 'model_id', 'error']).sort_values('provider')
with pd.option_context('display.max_colwidth', None,
                       'display.max_rows', None):
    display(df_err)

In [None]:
# Number of probe results collected in `rdata`.
len(rdata)

In [None]:
# Unfiltered table of probe results: one row per dict in `rdata`.
df = pd.DataFrame(rdata)
df

In [None]:
# Model ids containing any of these size tags are dropped.
SMALL_MODEL_PATTERN = '-3b|-8b|-32b|-49b|-70b|-72b'
# Minimum `complete_choices` value for a row to be kept.
MIN_COMPLETE_CHOICES = 0.1

df = pd.DataFrame(rdata)

# Drop the models whose id carries one of the size tags above.
is_small_model = df['model_id'].str.contains(SMALL_MODEL_PATTERN)
df = df[~is_small_model]

# Keep a row when either run scored above the threshold.
first_run_ok = df['complete_choices'] > MIN_COMPLETE_CHOICES
second_run_ok = df['complete_choices2'] > MIN_COMPLETE_CHOICES
df = df[first_run_ok | second_run_ok]

# Summary view: most expensive first, then by model id, then by highest choice_prob.
summary_columns = ['model_id', 'provider', 'choice_prob', 'complete_choices', 'complete_choices2', 'cost']
df[summary_columns].sort_values(
    by=['cost', 'model_id', 'choice_prob'],
    ascending=[False, True, False],
)

In [None]:
# df.sort_values('cost', ascending=False)[['model_id', 'provider', 'choice_prob', 'complete_choices', 'complete_choices2', 'cost', 'cached_tokens']]
