In [28]:
import util

import numpy, pandas, scipy, scipy.special
from frozendict import frozendict

import collections, functools, itertools, math, typing


In [9]:
GUESSABLE_WORDS_FILE = './allowed_guesses.txt'

# Read the word list inside a `with` block so the file handle is closed as
# soon as the list is built. Blank lines are skipped: an empty string would
# sort first and make TARGET_LENGTH below evaluate to 0.
with open(GUESSABLE_WORDS_FILE, 'r') as guessable_words_file:
    guessable_words = sorted(
        stripped for stripped in map(str.strip, guessable_words_file)
        if stripped)

# Assumes every word in the list has the same length as the first one.
TARGET_LENGTH = len(guessable_words[0])

len(guessable_words), TARGET_LENGTH


(12953, 5)

In [10]:
UNIGRAM_FREQUENCIES_FILE = './useful_stats/unigram_freq.csv'

# keep_default_na=False stops pandas from turning NA-looking strings into
# NaN -- presumably so that words such as "null" or "nan" stay as text.
unigram_frequencies_df = pandas.read_csv(
    UNIGRAM_FREQUENCIES_FILE,
    keep_default_na=False,
)

# Rows of the frequency table whose word is an allowed guess.
guessable_words_frequencies_df = unigram_frequencies_df.loc[
    unigram_frequencies_df['word'].isin(guessable_words)
]

# Immutable word -> count lookup covering the whole frequency table
# (not only the guessable words).
unigram_frequencies: frozendict[str, int] = frozendict(
    dict(zip(unigram_frequencies_df['word'],
             unigram_frequencies_df['count'])))

guessable_words_frequencies_df


Unnamed: 0,word,count
35,about,1226734006
45,other,978481319
56,which,810514085
57,their,782849411
62,there,701170205
...,...,...
331059,sadis,12774
331869,maist,12738
331922,hully,12736
332433,intil,12721


In [11]:
class LetterClue(typing.NamedTuple):
    """What is known about a single letter of the target word.

    Instances are produced per guessed letter by `judge`, merged by
    `combine_clues`, and checked against candidate words by
    `is_word_fitting`.
    """
    # The target contains at least this many copies of the letter.
    min_count: int
    # True when a guess contained more copies than the target does, which
    # makes `min_count` the exact number of copies.
    bounded: bool
    # 0-based positions where the letter is known to be.
    in_positions: frozenset[int]
    # 0-based positions where the letter is known NOT to be. Left empty when
    # the letter is bounded and `in_positions` already accounts for every
    # copy, since the exact count then rules out all other positions.
    not_in_positions: frozenset[int]


# Mapping from a guessed letter to its LetterClue. Calling `Clues()` with no
# arguments is used elsewhere in this notebook to build an empty clue set.
Clues = frozendict[str, LetterClue]


def judge(target: str, guess: str) -> Clues:
    """Score `guess` against `target` and return one clue per guessed letter.

    For every distinct letter of `guess` the clue records:
      * min_count: how many copies are confirmed (the smaller of the counts
        in `guess` and in `target`);
      * bounded: whether `guess` has more copies than `target`, i.e. the
        confirmed count is exact;
      * in_positions: indices where `guess` and `target` both have the letter;
      * not_in_positions: indices where `guess` has the letter but `target`
        does not -- dropped when the letter is bounded and every confirmed
        copy is already pinned to a position.
    """

    def positions_of(letter: str, word: str) -> set[int]:
        # Indices at which `letter` occurs in `word`.
        return {idx for idx, ch in enumerate(word) if ch == letter}

    per_letter: dict[str, LetterClue] = {}
    for letter in set(guess):
        guess_spots = positions_of(letter, guess)
        target_spots = positions_of(letter, target)

        confirmed = min(len(guess_spots), len(target_spots))
        is_bounded = len(guess_spots) > len(target_spots)
        exact_hits = guess_spots & target_spots

        if is_bounded and len(exact_hits) == confirmed:
            # Every copy is located, so "not here" information is redundant.
            misses = set()
        else:
            misses = guess_spots - target_spots

        per_letter[letter] = LetterClue(
            confirmed, is_bounded, frozenset(exact_hits), frozenset(misses))
    return frozendict(per_letter)


def combine_clues(c1: Clues, c2: Clues) -> Clues:
    """Merge two clue sets about the same target into a single clue set.

    Letters present in only one input are copied over unchanged. For a letter
    present in both:
      * min_count is the larger of the two lower bounds;
      * bounded is True if either clue is bounded;
      * in_positions is the union of both;
      * not_in_positions is the union of both, unless the merged clue is
        bounded and in_positions already accounts for every copy, in which
        case it is emptied.

    Raises:
        AssertionError: if the clues contradict each other, i.e. a bounded
            (exact) count is smaller than the other clue's lower bound, or
            two bounded clues state different exact counts.
    """
    combined_clues = dict(c1)
    for letter, new_clues in c2.items():
        if letter not in combined_clues:
            combined_clues[letter] = new_clues
            continue

        old_clues = combined_clues[letter]
        min_count = max(old_clues.min_count, new_clues.min_count)
        bounded = old_clues.bounded or new_clues.bounded

        # Only a *bounded* clue states the exact count, so only bounded clues
        # must agree with the merged count. An unbounded clue may legitimately
        # carry a smaller lower bound: for target 'elder', guess 'quest'
        # yields e >= 1 while guess 'melee' yields e == 2. Requiring both
        # min_counts to be equal would reject that consistent pair.
        assert all(clue.min_count == min_count
                   for clue in (old_clues, new_clues) if clue.bounded), (
            f'contradictory clues for {letter!r}: {old_clues} vs {new_clues}')

        in_positions = old_clues.in_positions | new_clues.in_positions
        if bounded and len(in_positions) == min_count:
            # All copies are pinned; nothing else needs to be excluded.
            not_in_positions = frozenset()
        else:
            not_in_positions = frozenset(old_clues.not_in_positions
                                         | new_clues.not_in_positions)
        combined_clues[letter] = LetterClue(min_count, bounded,
                                            in_positions, not_in_positions)
    return frozendict(combined_clues)


def is_word_fitting(clues: Clues, word: str, *, hard_mode=False) -> bool:
    """Return whether `word` is compatible with `clues`.

    Always enforced, per letter: `word` has at least `min_count` copies and
    carries the letter at every index in `in_positions`.

    Enforced only when `hard_mode` is False: a bounded letter must not occur
    more than `min_count` times, and the letter must not sit at any index in
    `not_in_positions`. With `hard_mode=True` the check is therefore the
    looser one -- presumably matching Wordle's hard-mode rule that revealed
    hints must be reused, rather than testing for a possible answer.
    """
    for letter, clue in clues.items():
        occurrences = word.count(letter)
        if occurrences < clue.min_count:
            return False
        if any(word[pos] != letter for pos in clue.in_positions):
            return False
        if hard_mode:
            # The remaining checks only apply to the strict (default) mode.
            continue
        if clue.bounded and occurrences > clue.min_count:
            return False
        if any(word[pos] == letter for pos in clue.not_in_positions):
            return False
    return True


def gen_fitting_words(
        clues: Clues,
        candidate_words: typing.Iterable[str]) -> typing.Iterator[str]:
    """Lazily yield, in input order, the candidates that fit `clues`.

    Uses the default (strict) mode of `is_word_fitting`.
    """
    yield from (candidate for candidate in candidate_words
                if is_word_fitting(clues, candidate))


# Example: play a fixed sequence of guesses against a known target, printing
# whether each guess fits the clues gathered so far and the clues afterwards.
# Feedback legend for the per-guess comments below:
#   '#' letter in the right spot, '^' letter present but elsewhere,
#   '-' letter absent.
target = 'melee'
clues = Clues()
print(f'{target=}')
for guess in (
        'quest',  # --^--
        'panel',  # ---#^
        'elder',  # ^^-#-
        'levee',  # ^#-##
):
    print(f'{guess=} {is_word_fitting(clues, guess)=}')
    clues = combine_clues(clues, judge(target, guess))
    print(f'{clues=}')


target='melee'
guess='quest' is_word_fitting(clues, guess)=True
clues=frozendict.frozendict({'e': LetterClue(min_count=1, bounded=False, in_positions=frozenset(), not_in_positions=frozenset({2})), 'q': LetterClue(min_count=0, bounded=True, in_positions=frozenset(), not_in_positions=frozenset()), 't': LetterClue(min_count=0, bounded=True, in_positions=frozenset(), not_in_positions=frozenset()), 'u': LetterClue(min_count=0, bounded=True, in_positions=frozenset(), not_in_positions=frozenset()), 's': LetterClue(min_count=0, bounded=True, in_positions=frozenset(), not_in_positions=frozenset())})
guess='panel' is_word_fitting(clues, guess)=True
clues=frozendict.frozendict({'e': LetterClue(min_count=1, bounded=False, in_positions=frozenset({3}), not_in_positions=frozenset({2})), 'q': LetterClue(min_count=0, bounded=True, in_positions=frozenset(), not_in_positions=frozenset()), 't': LetterClue(min_count=0, bounded=True, in_positions=frozenset(), not_in_positions=frozenset()), 'u': LetterClue(m

In [12]:
# A "trivial plural" is a guessable word ending in a single 's' whose
# s-less base is common enough relative to it.
#
# Assume that any word is always "around as common as" its trivial plural:
# the base must be more than half as frequent as the word itself. This
# filters out stuff like "oasis", where "oasi" is somehow also in the
# frequency table but is far rarer than "oasis".
trivial_plurals = set()
for word in guessable_words:
    if word.endswith('s') and not word.endswith('ss'):
        base = word[:-1]
        if (unigram_frequencies.get(base, 0) >
                0.5 * unigram_frequencies.get(word, 0)):
            trivial_plurals.add(word)

# Spot checks: two non-plurals followed by two real plurals.
tuple(sample in trivial_plurals
      for sample in ('oasis', 'bonus', 'gears', 'sofas'))


(False, False, True, True)

In [13]:
OFFENSIVE_WORDS_FILE = './useful_stats/offensive_words.txt'

# I had a surprisingly difficult time finding a good list for these. The list
# in the current file also includes fairly normal words like "spook", which I
# don't see why couldn't be a real Wordle answer. For now this list is unused:
# `offensive_words` stays empty, so membership checks against it never match.

offensive_words = set()
# Disabled loader, kept for reference. If re-enabled it would add every line
# of OFFENSIVE_WORDS_FILE that is exactly TARGET_LENGTH characters long and
# uses only letters that occur in guessable_words.
# alphabet = set(itertools.chain(*guessable_words))
# for line in open(OFFENSIVE_WORDS_FILE, 'r').readlines():
#     line = line.strip()
#     if len(line) != TARGET_LENGTH: continue
#     if any(c not in alphabet for c in line): continue
#     offensive_words.add(line)

offensive_words, len(offensive_words)


(set(), 0)

In [75]:
def _logistic_word_probability(word: str, midpoint: float,
                               scale: float) -> float:
    """Map a word's unigram count to a score in [0, 1] via a logistic curve.

    Returns 0.0 for trivial plurals, offensive words, and words missing from
    `unigram_frequencies`; otherwise expit((count - midpoint) / scale), which
    is 0.5 when the count equals `midpoint`.
    """
    if word in trivial_plurals or word in offensive_words:
        return 0.0
    count = unigram_frequencies.get(word)
    if count is None:
        return 0.0
    return scipy.special.expit((count - midpoint) / scale)


@functools.cache
def target_probability(word: str) -> float:
    """Score for `word` being a plausible target (answer) word.

    Logistic in the unigram count, centred on 4E5 with scale 1.5E5. Results
    are memoised per word, so later changes to the word sets or frequency
    table are not reflected for words already scored.
    """
    return _logistic_word_probability(word, midpoint=4E5, scale=1.5E5)


@functools.cache
def familiar_probability(word: str) -> float:
    """Score for `word` being familiar; stricter than `target_probability`.

    Logistic in the unigram count, centred on 16E5 with scale 4E5. Results
    are memoised per word.
    """
    return _logistic_word_probability(word, midpoint=16E5, scale=4E5)

# Scenario to inspect: the hidden target and the guesses played so far.
target, guesses = 'spice', ['coast']
# Alternative scenario:
# target, guesses = 'scrub', ['roach']

# Accumulate the feedback from every guess.
clues = Clues()
for guess in guesses:
    clues = combine_clues(clues, judge(target, guess))

# Collect (word, target probability, familiar probability) for each
# still-possible word with a non-zero target probability.
candidates = []
for word in gen_fitting_words(clues, guessable_words):
    p = target_probability(word)
    if p <= 0:
        continue
    candidates.append((word, p, familiar_probability(word)))

# Most probable targets first (stable, so ties keep alphabetical order).
candidates = sorted(candidates, key=lambda entry: entry[1], reverse=True)

# Show the ranking, its size, and how many entries score above 0.5.
(candidates,
 len(candidates),
 sum(1 for entry in candidates if entry[1] > 0.5))

([('music', 1.0, 1.0),
  ('scene', 1.0, 1.0),
  ('screw', 1.0, 0.9999999277517385),
  ('since', 1.0, 1.0),
  ('specs', 1.0, 1.0),
  ('spice', 1.0, 0.9999989850650284),
  ('slice', 0.9999999999994293, 0.9994855756537293),
  ('spicy', 0.9999999905013234, 0.9806788085967384),
  ('hicks', 0.9999991872704823, 0.9053876348115549),
  ('slick', 0.9999980562574173, 0.8734247516346668),
  ('scrub', 0.9999978896613175, 0.8699761044913237),
  ('psych', 0.9989085760072909, 0.3910722019058516),
  ('speck', 0.9157853674908787, 0.10860259956113578),
  ('feces', 0.8314968874123442, 0.08306619219055851),
  ('synch', 0.7956730034022881, 0.07654805846378952),
  ('mucus', 0.7919049897404956, 0.07594006665008601),
  ('sucre', 0.7557060011706569, 0.0706649261715832),
  ('scrum', 0.5430101090651521, 0.050434693807072876),
  ('scrip', 0.38355328475759504, 0.04000456333479656),
  ('ficus', 0.3084121104473478, 0.03547414559351664),
  ('snuck', 0.30223025699759926, 0.035102060537835315),
  ('scuff', 0.23706351796