# Import Corpus

In [1]:
import nltk
from nltk import FreqDist
from collections import defaultdict
import pandas as pd
import numpy as np

In [2]:
# Fetch the Brown corpus (a no-op when it is already present locally).
nltk.download('brown')
from nltk.corpus import brown

# Case-insensitive occurrence count of every token in the corpus.
frequency_list = FreqDist(token.lower() for token in brown.words())

# Plain word -> count mapping, built from the most-common-first ordering.
frequency_dict = dict(frequency_list.most_common())

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


# Wordle Solver Utils

In [5]:
def is_valid_wordle(s):
  """Return True if `s` is a usable solver word.

  Only five-character, purely alphabetic words whose characters are all
  distinct are considered.
  """
  if len(s) != 5:
    return False
  # a repeated character makes the set smaller than the word itself
  if len(set(s)) != len(s):
    return False
  return s.isalpha()

def initialize_candidates():
  """Build the starting candidate list from the corpus.

  Walks the corpus words from most to least frequent and keeps those that
  pass `is_valid_wordle`, so the returned list preserves frequency order.
  """
  ranked_words = (entry[0] for entry in frequency_list.most_common())
  return list(filter(is_valid_wordle, ranked_words))

def initialize_wordle_df(candidates):
  """Create a table of character counts by position.

  Args:
    candidates: sequence of five-letter words.

  Returns:
    A DataFrame of ints with one row per position (labelled 'c1'..'c5') and
    one column per character seen in `candidates`; each cell is the number of
    candidates that have that character at that position.
  """
  positions = ['c1', 'c2', 'c3', 'c4', 'c5']
  chars = pd.DataFrame([list(word) for word in candidates], columns=positions)

  # Key the per-position counts explicitly by position label. Relying on the
  # name of each value_counts() Series for the row label breaks on pandas >= 2.0,
  # where that name is always 'count' and all five rows end up with the same label.
  counts = pd.DataFrame({pos: chars[pos].value_counts() for pos in positions}).T

  # characters never seen at a position come out as NaN after alignment
  return counts.fillna(0).astype(int)

def choose_wordle_candidate(wordle_df):
  """Pick the highest-scoring word among the module-level `candidates`.

  A word's score is the sum, over its five positions, of the count stored in
  `wordle_df` for that word's character at that position (the diagonal of the
  position-by-character sub-table). Ties on score are broken in favour of the
  word with the larger `frequency_dict` count.

  Args:
    wordle_df: position-by-character count table, as produced by
      `initialize_wordle_df`.

  Returns:
    A tuple `(best_candidate, max_score, total_score)` where `total_score` is
    the sum of the scores of all candidates. With no candidates this is
    `(None, 0, 0)`.
  """
  best_candidate = None
  max_score = 0
  total_score = 0
  row_labels = wordle_df.index

  for word in candidates:
    word_score = np.diag(wordle_df.loc[row_labels, list(word)]).sum()
    total_score += word_score

    if best_candidate is None or word_score > max_score:
      is_better = True
    elif word_score == max_score:
      # equal score: prefer the word that is more common in the corpus
      is_better = frequency_dict[word] > frequency_dict[best_candidate]
    else:
      is_better = False

    if is_better:
      best_candidate, max_score = word, word_score

  return best_candidate, max_score, total_score

def prune_wordle_df(candidates, word, result):
  """Use the result array to prune the remaining candidates.

  Args:
    candidates: iterable of five-letter candidate words.
    word: the word that was guessed.
    result: per-position feedback for `word`: 0 = miss, 1 = partial match
      (letter is in the answer but not at this position), 2 = perfect match.

  Returns:
    A new list holding the candidates that are still consistent with the
    feedback, in their original order.
  """
  required_chars = {c for c, r in zip(word, result) if r >= 1}
  # The guess may repeat a letter. If one copy scored 1 or 2 and another copy
  # scored 0, the letter is still in the answer, so it must not be treated as
  # invalid (doing so would reject every candidate).
  invalid_chars = {c for c, r in zip(word, result) if r == 0} - required_chars

  valid_words = []
  for candidate in candidates:
    letters = set(candidate)

    # candidate cannot contain any invalid chars
    if letters & invalid_chars:
      continue

    # candidate must contain every required char
    if not required_chars <= letters:
      continue

    # A perfect match pins the character at that position. Any other feedback
    # (partial match or miss) means the guessed character is not at that position.
    positions_ok = all(
        (cand_char == guess_char) if r == 2 else (cand_char != guess_char)
        for cand_char, guess_char, r in zip(candidate, word, result))
    if positions_ok:
      valid_words.append(candidate)

  return valid_words


def query_wordle(best_candidate):
  """Return the entry stored under `best_candidate` in `wordle_cache`.

  NOTE(review): `wordle_cache` is not defined in the visible cells of this
  notebook; calling this raises NameError unless it is defined elsewhere.
  """
  cached_entry = wordle_cache[best_candidate]
  return cached_entry

# Run Solver Below

In [6]:
# Interactive solver: suggest a word, read the word actually played and the
# feedback it received, prune the candidates, and repeat until solved.
candidates = initialize_candidates()
while True:
  # check for exhaustion first; there is nothing to tabulate or score otherwise
  if not candidates:
    print("Something went wrong, there are no candidates left...")
    print("Aborting...")
    break

  wordle_df = initialize_wordle_df(candidates)
  best_candidate, score, total_score = choose_wordle_candidate(wordle_df)
  print(
      f"\nCandidates remaining = {len(candidates)}"
      f"\nBest Candidate: {best_candidate}"
      f"\nLikelihood: {(100 * score / total_score):.2f}%")

  # parse chosen word from user
  is_valid_word = False
  while not is_valid_word:
    word = input(f"\nEnter your chosen five-letter word (i.e. {best_candidate}): ")
    # candidates are lower-case, so normalise what the user typed
    word = word.strip().lower()
    is_valid_word = (len(word) == 5) and word.isalpha()
    if not is_valid_word:
      print("Improper Input!")

  # The user may play a word containing letters that no remaining candidate
  # has; such letters have no column in the table and contribute a count of 0
  # instead of raising KeyError.
  word_score = sum(
      wordle_df[char].iloc[position] if char in wordle_df.columns else 0
      for position, char in enumerate(word))
  print(f"Likelihood: {100 * word_score / total_score:.2f}%")

  # parse chosen result from user
  is_valid_result = False
  while not is_valid_result:
    raw_result = input(
        "\n0 = Miss, 1 = Partial Match, 2 = Perfect Match "
        "\nEnter your result as a comma-separated list "
        "(i.e. 0,0,0,1,2): ")
    try:
      result = [int(token) for token in "".join(raw_result.split()).split(",")]
    except ValueError:
      # non-numeric input is treated like any other malformed result
      result = []
    is_valid_result = (len(result) == 5) and all(0 <= i <= 2 for i in result)
    if not is_valid_result:
      print("Improper Input!")

  # you won wordle
  if all(i == 2 for i in result):
    print(f"Congrats! The word was {word}")
    break

  candidates = prune_wordle_df(candidates, word, result)


Candidates remaining = 2684
Best Candidate: cares
Likelihood: 0.07%

Enter your chosen five-letter word (i.e. brown): cares
Likelihood: 0.07%

0 = Miss, 1 = Partial Match, 2 = Perfect Match 
Enter your result as a comma-separated list (i.e. 0,0,0,1,2): 0,0,1,0,0

Candidates remaining = 72
Best Candidate: front
Likelihood: 2.08%

Enter your chosen five-letter word (i.e. brown): front
Likelihood: 2.08%

0 = Miss, 1 = Partial Match, 2 = Perfect Match 
Enter your result as a comma-separated list (i.e. 0,0,0,1,2): 0,2,0,2,0

Candidates remaining = 6
Best Candidate: drink
Likelihood: 18.03%

Enter your chosen five-letter word (i.e. brown): drink
Likelihood: 18.03%

0 = Miss, 1 = Partial Match, 2 = Perfect Match 
Enter your result as a comma-separated list (i.e. 0,0,0,1,2): 2,2,2,2
Improper Input!

0 = Miss, 1 = Partial Match, 2 = Perfect Match 
Enter your result as a comma-separated list (i.e. 0,0,0,1,2): 2,2,2,2,2
Congrats! The word was drink
