In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv
import requests
from openai import OpenAI
import numpy as np
import httpx
from tqdm import tqdm
import re

# Read the local .env file so its entries are visible through the environment.
load_dotenv()

# API credentials; each one is None when the variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
XAI_API_KEY = os.environ.get("XAI_API_KEY")

In [None]:
def get_ai_action(prompt, model, timeout=3600.0):
    """Send ``prompt`` to ``model`` and return the text of its reply.

    Models whose id contains ``'grok'`` are delegated to ``get_grok_action``;
    every other model is called through the OpenRouter chat-completions
    endpoint.

    Args:
        prompt: Full user prompt to send as a single user message.
        model: Model id, e.g. ``'openai/gpt-4o'``.
        timeout: Seconds to wait for the OpenRouter HTTP call before giving
            up. Defaults to the same generous limit used for Grok so slow
            reasoning models are not cut off, while a dead connection cannot
            hang the notebook forever.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        RuntimeError: If the response is not JSON or does not contain
            ``choices[0].message.content`` (for example an API error payload).
            The HTTP status and the start of the body are included.
    """
    # grok 4 has some specific differences that require streaming and a large timeout
    if 'grok' in model:
        return get_grok_action(prompt, model)

    # all other models
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        # "reasoning": {
        #     "effort": "high"  # Use high reasoning effort
        # }
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)

    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Surface the actual API answer instead of failing later on an
        # unbound local; the body is truncated to keep the message readable.
        raise RuntimeError(
            f'unexpected response from {model} '
            f'(HTTP {response.status_code}): {response.text[:500]}'
        ) from e


# grok 4 can take 7 - 15 minutes sometimes
def get_grok_action(prompt, model):
    """Query a Grok model on the xAI API and return the complete reply text.

    The request is streamed and the streamed text deltas are concatenated, with
    a one-hour client timeout to accommodate slow reasoning models.

    Args:
        prompt: Full user prompt to send as a single user message.
        model: Model id, normally provider-prefixed (``'x-ai/grok-4'``). The
            segment after the first ``/`` is sent to the API; a name without a
            ``/`` is sent unchanged.

    Returns:
        The concatenated text of every streamed chunk ('' if none had text).
    """
    client = OpenAI(
        api_key=XAI_API_KEY,
        base_url="https://api.x.ai/v1",
        timeout=httpx.Timeout(3600.0),  # Timeout after 3600s for reasoning models
    )

    # 'x-ai/grok-4' -> 'grok-4'; tolerate an id that has no provider prefix.
    name_parts = model.split('/')
    api_model = name_parts[1] if len(name_parts) > 1 else model

    stream = client.chat.completions.create(
        model=api_model,
        messages=[
            {"role": "user", "content": prompt},
        ],
        stream=True  # Set streaming here
    )

    pieces = []
    for chunk in stream:
        # OpenAI-compatible streams may emit chunks with an empty `choices`
        # list (e.g. usage-only chunks); skip them rather than index into them.
        if not chunk.choices:
            continue
        chunk_text = chunk.choices[0].delta.content
        if chunk_text is not None:
            pieces.append(chunk_text)

    return ''.join(pieces)

def build_prompt(board, rack, invalid=None):
    """Assemble the Scrabble move prompt for one board position.

    Args:
        board: Board representation, interpolated verbatim into the prompt.
        rack: The player's letters, interpolated verbatim into the prompt.
        invalid: Optional sequence of previously rejected words (strings);
            when non-empty they are listed under an "Invalid Words" heading.

    Returns:
        The full prompt string: rules, board and rack, the optional
        invalid-word list, then the special squares and response format.
    """
    prompt = f'''
You are playing Scrabble and it is your turn. The rules are:
2 to 4 players try to score the most points by forming words on a 15x15 board.
There are 100 tiles: 98 letter tiles with points, and 2 blank wild tiles worth 0 points.

Letter point values:
0: blank
1: A, E, I, L, N, O, R, S, T, U
2: D, G
3: B, C, M, P
4: F, H, V, W, Y
5: K
8: J, X
10: Q, Z

Bonuses apply only the first time a tile is placed on them.
First player is the one who draws the letter closest to "A" (blank beats all). Tiles are then returned to the bag.
On a turn, a player must play a word.
Using all 7 tiles in one turn earns a 50 point bonus.

Scoring:
Valid words are those found in a standard dictionary or Official Scrabble Dictionary.
Invalid words include abbreviations, prefixes/suffixes, capitalized words, and words with hyphens or apostrophes.        

Current Board:
{board}

Your Letters: 
{rack}'''
    
    # track failed guesses
    invalid_str = ''
    if invalid is not None and len(invalid):
        invalid_str = '\n\nInvalid Words:\n'+'\n\n'.join(invalid)

    # The example below must itself be valid JSON with both required keys,
    # otherwise the model is being shown a malformed answer to imitate.
    end_prompt = '''
Special Squares: {"triple-word":["0-0","0-7","0-14","7-0","7-14","14-0","14-7","14-14"],"double-word":["1-1","2-2","3-3","4-4","1-13","2-12","3-11","4-10","13-1","12-2","11-3","10-4","13-13","12-12","11-11","10-10"],"triple-letter":["1-5","1-9","5-1","5-5","5-9","5-13","9-1","9-5","9-9","9-13","13-5","13-9"],"double-letter":["0-3","0-11","2-6","2-8","3-0","3-7","3-14","6-2","6-6","6-8","6-12","7-3","7-11","8-2","8-6","8-8","8-12","11-0","11-7","11-14","12-6","12-8","14-3","14-11"],"center":["7-7"]}
The board indexes as col,row with 0,0 at the top left corner and 7,7 at the center.

On Your Turn, You Can:
Play a Word:
Use 1 or more tiles from your rack to form a word in a straight line (horizontally or vertically).
Must connect to existing words, and any words created by those connections must be valid.
Score points for the word and any additional words formed.
    
Respond ONLY with valid JSON that describes your move.

THE JSON must have the fields:
- "word" (string, the word you wish to play)
- "reason" (string, 1-3 sentences max explaining why this word was chosen)

Example:
{"word": "dog", "reason": "I wanted to connect to the double letter score on W."}
    '''

    # print(prompt+invalid_str+end_prompt)
    return prompt+invalid_str+end_prompt

def parse_word(msg):
    """Extract the move object from a model reply.

    The reply is first parsed as a whole JSON document. If that does not yield
    an object, the text is scanned for embedded JSON objects (which may span
    several lines or contain nested braces, e.g. inside a fenced code block)
    and the last one that has a ``'word'`` key is used.

    Args:
        msg: Raw reply text. Non-string input is treated as "no answer".

    Returns:
        A dict: the whole reply when it is a JSON object, otherwise the last
        embedded object containing ``'word'``, otherwise ``{}``.
    """
    if not isinstance(msg, str):
        return {}

    try:
        whole = json.loads(msg)
    except ValueError:  # json.JSONDecodeError is a ValueError
        whole = None
    if isinstance(whole, dict):
        return whole

    # fallback plan: decode an object starting at each '{' in turn
    decoder = json.JSONDecoder()
    found = []
    start = msg.find('{')
    while start != -1:
        try:
            obj, end = decoder.raw_decode(msg, start)
        except ValueError:
            # not valid JSON from this brace; try the next one
            start = msg.find('{', start + 1)
            continue
        if isinstance(obj, dict) and 'word' in obj:
            found.append(obj)
            start = msg.find('{', end)
        else:
            # keep looking inside this object for a nested candidate
            start = msg.find('{', start + 1)

    return found[-1] if found else {}

In [None]:
# Parse the benchmark positions straight from the JSON file handle.
with open('dataset.json', 'rb') as file:
    dataset = json.load(file)

In [None]:
# Candidate model ids, each written as '<provider>/<model>'.
models = [
    'moonshotai/kimi-k2',
    'google/gemini-2.0-flash-lite-001',
    'x-ai/grok-4',
    'anthropic/claude-sonnet-4',
    'google/gemini-2.5-flash',
    'deepseek/deepseek-chat-v3-0324',
    'openrouter/horizon-beta',
    'deepseek/deepseek-r1-0528:free',
    'openai/o3',
    'openai/gpt-4.1',
    'openai/gpt-oss-120b',
    'anthropic/claude-opus-4.1',
    'google/gemini-2.5-pro',
    'openai/gpt-4o',
    'openai/o4-mini',
    'openai/o4-mini-high',
    'x-ai/grok-3-mini',
    'x-ai/grok-3',
    'google/gemini-2.0-flash-001',
    'google/gemini-2.5-flash-lite-preview-06-17',
    'google/gemma-3n-e4b-it',
    'qwen/qwen3-30b-a3b-instruct-2507',
    'qwen/qwen3-235b-a22b-thinking-2507',
    'openai/gpt-5-nano',
    'openai/gpt-5-mini',
    'openai/gpt-5',
    'anthropic/claude-opus-4',
    'openai/gpt-5-chat',
]

In [None]:
# Randomly select the boards to evaluate.
# - Sampling is WITHOUT replacement so every round_num is unique; a repeated
#   round would produce duplicate (round_num, model) rows, which the later
#   df.pivot(index='round_num', columns='model', ...) cannot reshape.
# - The generator is seeded so a restart picks the same boards.
# - The sample size is capped at the dataset size (required when not replacing).
N_ROUNDS = 50
SEED = 42
rng = np.random.default_rng(SEED)
rounds = rng.choice(len(dataset), size=min(N_ROUNDS, len(dataset)), replace=False)

# Models evaluated in this run (replaces the full candidate list defined earlier).
models = ['x-ai/grok-3', 'x-ai/grok-4']
# models = ['anthropic/claude-sonnet-4','anthropic/claude-opus-4.1']

In [None]:
# Accumulator for result records: the evaluation loop appends one dict per (round, model) pair.
res = []

In [None]:
# Maximum number of times a model is prompted for a single board.
MAX_ATTEMPTS = 3

for round_num in tqdm(rounds):
    # Each dataset entry is indexed as [0]=board, [1]=rack, [2]=playable moves,
    # where a move is indexed as [0]=word, [1]=score.
    board = dataset[round_num][0]
    rack = dataset[round_num][1]
    moves = dataset[round_num][2]

    if len(moves) == 0:
        continue

    # Best achievable score per (upper-cased) word. Build the Series from
    # parallel lists rather than a dict: a dict would silently keep only the
    # LAST score of a repeated word, leaving nothing for max() to choose from.
    possible_words = pd.Series(
        [move[1] for move in moves],
        index=[move[0].upper() for move in moves],
    )
    possible_words = possible_words.groupby(level=0).max()

    for model in models:
        print(round_num, model, len(possible_words))

        retry_count = 0
        score = 0
        chosen_word = None   # stays None if the model never returns a usable word
        invalid_words = []
        round_prompt = None

        while retry_count < MAX_ATTEMPTS:
            retry_count += 1
            round_prompt = build_prompt(board=board, rack=rack, invalid=invalid_words)

            action = None
            try:
                action = get_ai_action(round_prompt, model)
                word = parse_word(action).get('word')
            except Exception as e:
                # A failed request or unparsable reply costs one attempt
                # instead of aborting the whole benchmark run.
                print(f'{model} retrying due to {e}')
                print(action)
                continue

            if not isinstance(word, str) or not word.strip():
                print(f'{model} retrying due to missing "word" in reply')
                print(action)
                continue

            chosen_word = word.strip().upper()

            if chosen_word not in possible_words.index:
                # Fed back to the model through the next prompt.
                invalid_words.append(chosen_word)
                continue

            score = possible_words.loc[chosen_word]

            if score > 0:
                break

        res.append({
            'model': model,
            '# attempts': retry_count,
            'score': score if score > 0 else np.nan,
            'word': chosen_word,
            'num_possible_words': len(moves),
            'word_percentile': round(len(possible_words[possible_words <= score])/len(possible_words),3),
            'round_num': round_num,
            'attempts': invalid_words,
            'rack': rack,
            'prompt': round_prompt,
        })

        if len(rounds) > 10:
            print('\t', model, score)

In [None]:
# NOTE(review): displays the sampled round indices from position 17 onward. This looks like a
# leftover inspection cell with a hard-coded offset — confirm whether it is still needed.
rounds[17:]

In [None]:
# Tabulate the collected result records and preview the first ten rows.
df = pd.DataFrame(data=res)
df.head(n=10)

In [None]:
# Score table: one row per round, one column per model.
# DataFrame.pivot raises on duplicate (round_num, model) pairs, which occur
# whenever a round is evaluated more than once (e.g. the loop cell was re-run),
# so keep only the most recent record for each pair before reshaping.
latest = df.drop_duplicates(subset=['round_num', 'model'], keep='last')
sc = latest.pivot(index='round_num', columns='model', values='score')

# Only the last expression of a cell is displayed, so print the summary explicitly.
print(sc.describe().round(1))

# Show rounds without a valid score as '-' (display only; `sc` stays numeric).
sc.fillna('-')

In [None]:
# Per-model averages of the headline metrics, best mean score first.
(
    df
    .groupby('model')[['# attempts', 'score', 'word_percentile', 'num_possible_words']]
    .mean()
    .sort_values('score', ascending=False)
)