# Wordle

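This notebook searches for the best first word to guess for Wordle: the candidate whose score pattern has the highest entropy over the word list.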

- https://www.nytimes.com/games/wordle/index.html

In [None]:
import lzma
from collections import Counter
from collections import namedtuple
from collections import defaultdict
import math
from functools import cache
import random
from tqdm.auto import tqdm
from enum import IntEnum

In [None]:
def load_word_list():
    """Read the xz-compressed word list and return its entries.

    The file '5-gram.txt.xz' is opened in text mode relative to the current
    working directory.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace stripped.
    """
    with lzma.open('5-gram.txt.xz', mode='rt') as handle:
        return list(map(str.strip, handle))

# Load the word list into the notebook-level `words` and report how many
# entries it contains.
words = load_word_list()
print(len(words))

In [None]:
class ScoreState(IntEnum):
    """Feedback colour assigned to a single guessed letter."""

    YELLOW = 1  # letter is in the secret, but not at this position
    GREEN = 2   # letter is in the secret at exactly this position
    GRAY = 3    # no unaccounted-for occurrence of the letter remains


# One scored guess letter. GRAY entries always carry position -1.
ScoreInfo = namedtuple('ScoreInfo', ['letter', 'position', 'state'])


def compute_score(guess, secret):
    """Score `guess` against `secret` using Wordle colouring rules.

    Args:
        guess: The guessed word.
        secret: The word being guessed.

    Returns:
        A sorted tuple of ScoreInfo, one entry per letter of `guess`:
        * GREEN  - same letter at the same index; position is that index.
        * YELLOW - the letter has an occurrence in `secret` that is not
          matched in place; position is the index in `guess`. When a letter
          is repeated in `guess`, the earliest non-green occurrences are
          coloured yellow first.
        * GRAY   - everything else; position is always -1.
    """
    # For every letter, count the occurrences in the secret that are NOT
    # matched in place by the guess; these are what yellows may consume.
    unmatched = defaultdict(int)
    for idx, letter in enumerate(secret):
        if idx >= len(guess) or guess[idx] != letter:
            unmatched[letter] += 1

    marks = []
    for idx, letter in enumerate(guess):
        if idx < len(secret) and secret[idx] == letter:
            marks.append(ScoreInfo(letter, idx, ScoreState.GREEN))
        elif unmatched[letter] > 0:
            # Walking the guess left to right hands the remaining budget to
            # the earliest misplaced occurrences.
            unmatched[letter] -= 1
            marks.append(ScoreInfo(letter, idx, ScoreState.YELLOW))
        else:
            marks.append(ScoreInfo(letter, -1, ScoreState.GRAY))

    # Sorting makes the score independent of the order it was built in.
    marks.sort()
    return tuple(marks)

def format_score(score):
    """Normalise a hand-written score into a sorted tuple.

    Args:
        score: Iterable of (letter, position, state) entries in any order.

    Returns:
        A tuple holding the same entries in ascending order.
    """
    ordered = list(score)
    ordered.sort()
    return tuple(ordered)
    
def apply_conditions(words, conditions):
    """Keep only the words consistent with every observed guess result.

    Args:
        words: Candidate secret words.
        conditions: Mapping from a guessed word to the score observed for it,
            given as an iterable of (letter, position, state) entries.

    Returns:
        The candidates for which compute_score(guess, candidate) equals the
        normalised observed score, for every guess in `conditions`. When
        `conditions` is empty, `words` itself is returned.
    """
    remaining = words
    for guess, observed in conditions.items():
        expected = format_score(observed)
        survivors = []
        for candidate in remaining:
            if compute_score(guess, candidate) == expected:
                survivors.append(candidate)
        remaining = survivors
    return remaining

def compute_entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: Iterable of probabilities. Zero entries contribute nothing
            (the usual ``0 * log(0) = 0`` convention) instead of raising.

    Returns:
        ``-sum(p * log2(p))`` over the non-zero entries; ``0`` when there
        are none.

    Raises:
        ValueError: If any probability is negative.
    """
    # math.log2 is more accurate than math.log(p, 2), and skipping p == 0
    # avoids the math domain error that log(0) raises.
    return -sum(p * math.log2(p) for p in probs if p != 0)

def compute_guess(words):
    """Pick the candidate whose score pattern is most informative.

    Every word in `words` is tried as a guess against every word in `words`
    as the secret. The guess whose resulting scores have the highest entropy
    is returned.

    Args:
        words: Sequence of candidate words; also used as the pool of guesses.

    Returns:
        The guess with the highest entropy (the earliest one on ties), the
        single word itself when only one candidate is left, or None when
        `words` is empty.
    """
    N = len(words)
    if N == 0:
        return None
    if N == 1:
        # Nothing to compare: the only remaining candidate is the answer.
        return words[0]

    best_word = None
    # Start below any achievable entropy so that a non-empty list always
    # yields a word, even when every guess has zero entropy (for example a
    # list made only of duplicate entries).
    best_gain = float('-inf')
    for guess in tqdm(words):
        counts = Counter(compute_score(guess, w) for w in words)
        entropy = compute_entropy([v / N for v in counts.values()])
        # Strict comparison keeps the earliest word among equal entropies.
        if entropy > best_gain:
            best_gain = entropy
            best_word = guess
    return best_word

In [None]:
# Sanity checks for compute_score, all against the secret 'tract'.
# Expected scores go through format_score so they can be listed in guess
# order; GRAY entries use position -1.

# Misplaced letters ('a', 't') are YELLOW, 'r' is GREEN in place, and letters
# absent from the secret ('e', 's') are GRAY.
assert compute_score('arets', 'tract') == format_score([('a', 0, ScoreState.YELLOW), 
                                          ('r', 1, ScoreState.GREEN),
                                          ('e', -1, ScoreState.GRAY),
                                          ('t', 3, ScoreState.YELLOW),
                                          ('s', -1, ScoreState.GRAY)])
# Repeated guess letter: the secret has a single 'r', so only the first 'r'
# of the guess is YELLOW and the second one is GRAY.
assert compute_score('rears', 'tract') == format_score([('r', 0, ScoreState.YELLOW), 
                                          ('e', -1, ScoreState.GRAY),
                                          ('a', 2, ScoreState.GREEN),
                                          ('r', -1, ScoreState.GRAY),
                                          ('s', -1, ScoreState.GRAY)])
# Three letters in place, the other two absent from the secret.
assert compute_score('krait', 'tract') == format_score([('k', -1, ScoreState.GRAY), 
                                          ('r', 1, ScoreState.GREEN),
                                          ('a', 2, ScoreState.GREEN),
                                          ('i', -1, ScoreState.GRAY),
                                          ('t', 4, ScoreState.GREEN)])
# Repeated letter with one in place: the 't' at 4 is GREEN, and the 't' at 3
# is YELLOW because the secret has another 't' (at position 0).
assert compute_score('pratt', 'tract') == format_score([('p', -1, ScoreState.GRAY), 
                                          ('r', 1, ScoreState.GREEN),
                                          ('a', 2, ScoreState.GREEN),
                                          ('t', 3, ScoreState.YELLOW),
                                          ('t', 4, ScoreState.GREEN)])
# Exact match: every letter is GREEN.
assert compute_score('tract', 'tract') == format_score([('t', 0, ScoreState.GREEN), 
                                          ('r', 1, ScoreState.GREEN),
                                          ('a', 2, ScoreState.GREEN),
                                          ('c', 3, ScoreState.GREEN),
                                          ('t', 4, ScoreState.GREEN)])

In [None]:
%%time
# Best first word to guess.
# Runs compute_guess over the full, unfiltered word list.
words = load_word_list()
# No feedback has been observed yet: this dict is left empty and is not
# passed to anything in this cell.
conditions = {
}
best_first_guess = compute_guess(words)
print(best_first_guess)

In [None]:
%%time
# Guess word
# Reload the full list, filter it by the feedback observed so far, then ask
# compute_guess for the next guess among the remaining candidates.
words = load_word_list()
# Maps each word already guessed to the feedback it received, one
# (letter, position, state) entry per letter. GRAY entries use position -1,
# which is what compute_score emits for gray letters.
conditions = {
    'arets': [('a', -1, ScoreState.GRAY), 
              ('r', -1, ScoreState.GRAY),
              ('e', -1, ScoreState.GRAY),
              ('t', -1, ScoreState.GRAY),
              ('s', -1, ScoreState.GRAY)],
    'colin': [('c', -1, ScoreState.GRAY), 
              ('o', 1, ScoreState.YELLOW),
              ('l', 2, ScoreState.YELLOW),
              ('i', -1, ScoreState.GRAY),
              ('n', -1, ScoreState.GRAY)],
    'bloop': [('b', -1, ScoreState.GRAY), 
              ('l', 1, ScoreState.GREEN),
              ('o', 2, ScoreState.GREEN),
              ('o', 3, ScoreState.GREEN),
              ('p', -1, ScoreState.GRAY)]    
    
}
# Keep only the words whose score against every guess matches the feedback.
words = apply_conditions(words, conditions)
print(compute_guess(words))

In [None]:
%%time
# Guess word
# NOTE(review): this cell is identical to the previous one (same guesses and
# feedback) - presumably a scratch copy to edit for the next game; confirm.
words = load_word_list()
# Maps each word already guessed to the feedback it received, one
# (letter, position, state) entry per letter. GRAY entries use position -1.
conditions = {
    'arets': [('a', -1, ScoreState.GRAY), 
              ('r', -1, ScoreState.GRAY),
              ('e', -1, ScoreState.GRAY),
              ('t', -1, ScoreState.GRAY),
              ('s', -1, ScoreState.GRAY)],
    'colin': [('c', -1, ScoreState.GRAY), 
              ('o', 1, ScoreState.YELLOW),
              ('l', 2, ScoreState.YELLOW),
              ('i', -1, ScoreState.GRAY),
              ('n', -1, ScoreState.GRAY)],
    'bloop': [('b', -1, ScoreState.GRAY), 
              ('l', 1, ScoreState.GREEN),
              ('o', 2, ScoreState.GREEN),
              ('o', 3, ScoreState.GREEN),
              ('p', -1, ScoreState.GRAY)]    
    
}
# Keep only the words whose score against every guess matches the feedback.
words = apply_conditions(words, conditions)
print(compute_guess(words))