# process_dictionary

Process a dictionary file in json format and extract all words between `min_word_length` and `max_word_length`. 

This produces a json file that contains the following:

* dictonary of all matching words with
    - score for 1, 2, 3 letter frequency matches 


In [1]:
word_list = 'valid-wordle-words.txt'
local_path = './'
min_word_length = 5
max_word_length = 5
max_letter_combinations = 3
rounding_places = 8


In [2]:
from pathlib import Path
import json
import random
from collections import OrderedDict
from operator import getitem
from itertools import product 
import csv

In [3]:
try:
  # from google.colab import drive
  # drive.mount('/content/gdrive/', force_remount=True)
  from pydrive.auth import GoogleAuth
  from google.colab import drive
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials

  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)
  file_id = {'id': "1-4QvbyHQAJoJTzk-LbU_nJpmW-I_eK-X"}
  fileDownloaded = drive.CreateFile(file_id)
  fileDownloaded.GetContentFile(word_list)

except ImportError as e:
  word_file = 'local_path' + word_list

word_file = Path(local_path)/word_list

In [4]:
def range_char(start, stop):
    '''generator for all characters in range `start:stop`
    
    works effectively for upper:upper or lower:lower case letters'''
    return (chr(n) for n in range(ord(start), ord(stop) + 1))

In [5]:
def sample_dict(d, n):
    r = {}
    for i in random.sample(sorted(d), n):
        r[i] = d[i]
    return r

In [6]:
def flatten_dict(d, h):
    fd = []

    for key, values in d.items():
        entry = {}
        entry[h] = key
        entry.update(values)
        fd.append(entry)
        
    return fd

In [7]:
all_words = {}
with open(word_file, 'r') as f:
    for line in f:
        all_words[line.strip()] = len(line.strip())
    
    

In [8]:
# sort dictionary by word length
new_d = {}
for k in sorted(all_words, key=len, reverse=False):
    new_d[k] = all_words[k]


In [9]:
accepted_words = {}
for k in new_d:
    # pull words of matching length
    if len(k) >= min_word_length and len(k) <= max_word_length:
        accepted_words[k] = {}
        
    # stop after first word that is too long
    if len(k) > max_word_length:
        break

In [10]:
# create dictionary of letter combination dictionaries with all cartesian product of characters a..z:
c_product_dicts = {}
for length in range(1, max_letter_combinations+1):
    # create a dictionary for each length
    c_product_dicts[length] = {}
    for i in product(range_char('a', 'z'), repeat = length):
        c_product_dicts[length][''.join(i)] = {'total': 0, 'percent': 0, 'score': None}

In [11]:
for word in accepted_words:
    for i, letter in enumerate(word):
        for j in range(1, max_letter_combinations+1):
            if i+j > len(word):
                pass
            else:
                w_slice=(f'{word[i:i+j]}')
                c_product_dicts[j][w_slice]['total'] += 1

In [12]:
dictionary_bins = {}
for dictionary, letter_space in c_product_dicts.items():
    # create bins for values
    dictionary_bins[dictionary] = {'total': 0, 'bins': []}
    # sum up the total occurences of each value in the letter space
    for key, value in letter_space.items():
        dictionary_bins[dictionary]['total'] += value['total']
    # assign a percent score for each value
    value_list = []
    for key, value in letter_space.items():
        value['percent'] = round(value['total']/dictionary_bins[dictionary]['total'], rounding_places)
        value_list.append(value['percent'])
    # re-order the space by highest percent value first
    c_product_dicts[dictionary] = OrderedDict(
        sorted(letter_space.items(), key=lambda x: getitem(x[1], 'percent'), reverse=True))
    dictionary_bins[dictionary]['bins'] = sorted(set(value_list), reverse=True)
    
    # score each value in letter_space using bin index (lower scores are better)
    for key, value in letter_space.items():
        value['score'] = dictionary_bins[dictionary]['bins'].index(value['percent'])

In [13]:
# pull N random words from dictionary

# test_words = sample_dict(accepted_words, 10)
# test_words

In [14]:
# this is expensive to run!
# calculate the score of each word for each value in the letter spaces
for word, data in accepted_words.items():
    for dictionary, letter_space in c_product_dicts.items():
        score = 0
        for item in letter_space:
            count = word.count(item)
            score += letter_space[item]['score'] * count           
        data[dictionary] = score

In [15]:
# create CSV friendly rendering of frequency data
freqeuncy_csv = []
for d, values in c_product_dicts.items():
    freqeuncy_csv.extend(flatten_dict(values, 'letter'))
accepted_words_csv = flatten_dict(accepted_words, 'word')

# remove all words with duplicate characters
accepted_words_no_dupes_csv = []
for word in accepted_words_csv:
    if len(set(word['word'])) == len(word['word']):
        accepted_words_no_dupes_csv.append(word)
        

# sort based lowest score for first three keys
accepted_words_sorted_csv = sorted(accepted_words_csv, key=lambda d: (d[1], d[2], d[3]))
accepted_words_no_dupes_sorted_csv = sorted(accepted_words_no_dupes_csv, key=lambda d: (d[1], d[2], d[3]))


In [16]:
csv_files = [
    {'file_name': 'frequency.csv', 'var': freqeuncy_csv, 'fieldnames': freqeuncy_csv[0].keys()},
    {'file_name': 'accepted_words.csv', 'var': accepted_words_csv, 'fieldnames': accepted_words_csv[0].keys()},
    {'file_name': 'accepted_words_sorted.csv', 'var': accepted_words_sorted_csv, 'fieldnames': accepted_words_sorted_csv[0].keys()},
    {'file_name': 'accepted_words_no_dupes_sorted.csv', 'var': accepted_words_no_dupes_sorted_csv, 'fieldnames': accepted_words_no_dupes_csv[0].keys()}
]

In [17]:
for file in csv_files:
    with open(file['file_name'], 'w') as csv_file:
        writer = csv.DictWriter(csv_file, delimiter=',', fieldnames=file['fieldnames'])
        writer.writeheader()
        writer.writerows(file['var'])
    

In [None]:
# def string_to_dict(w, c=1):
#     '''break string w into sequential fragments of size c
    
#     Returns: dictionary'''
#     d = {}
#     for i, v in enumerate(w1):
#         if i+c > len(w1):
#             break
#         else:
#             d[i] = w[i:i+c]
#     return d

In [18]:
answer = 'flora'
# enter the results of a guess in order using 0 for no match, 1 for partial and 2 for full
guess = {'o': 1, 'p': 0, 'e': 0, 'r': 2, 'a': 2} 

In [24]:
answer = 'pitas'
guess = {'i': 1, 'a': 1, 'p': 1}
guess = {'p': 2, 'i': 2, 'l': 0, 'a': 2, 'f': 0}

In [25]:
# check for possible matching words based on guess
possible_matches = []

# create a reference index of the dictionary
guess_index = list(guess.keys())

# select only letters that are matches
check = {}

for k, v in guess.items():
    if v > 0:
        check[k] = v

for word in accepted_words:
    # basic check that word has all the letters in any order
    if all(x in word for x in list(check.keys())):
        fail = False
        for letter, value in check.items():
            if guess_index.index(letter) != word.index(letter) and value == 2:
                fail = True
                break
            if guess_index.index(letter) == word.index(letter) and value == 1:
                fail = True 
                break
#             
        if not fail:
            possible_matches.append(word)
                

print(possible_matches)

['pibal', 'pical', 'picas', 'pikas', 'pikau', 'pilae', 'pilaf', 'pilao', 'pilar', 'pilau', 'pilaw', 'pimas', 'pinas', 'pipal', 'pipas', 'pirai', 'pitas']


In [21]:
print(accepted_words)

{'aahed': {1: 30, 2: 411, 3: 183}, 'aalii': {1: 20, 2: 456, 3: 161}, 'aargh': {1: 39, 2: 532, 3: 171}, 'aarti': {1: 20, 2: 390, 3: 147}, 'abaca': {1: 35, 2: 274, 3: 159}, 'abaci': {1: 38, 2: 390, 3: 162}, 'aback': {1: 51, 2: 312, 3: 120}, 'abacs': {1: 33, 2: 425, 3: 166}, 'abaft': {1: 47, 2: 481, 3: 165}, 'abaka': {1: 41, 2: 353, 3: 163}, 'abamp': {1: 48, 2: 312, 3: 150}, 'aband': {1: 39, 2: 251, 3: 119}, 'abase': {1: 22, 2: 174, 3: 94}, 'abash': {1: 36, 2: 213, 3: 107}, 'abask': {1: 39, 2: 273, 3: 126}, 'abate': {1: 29, 2: 195, 3: 104}, 'abaya': {1: 34, 2: 399, 3: 158}, 'abbas': {1: 38, 2: 340, 3: 143}, 'abbed': {1: 47, 2: 337, 3: 159}, 'abbes': {1: 37, 2: 333, 3: 144}, 'abbey': {1: 48, 2: 448, 3: 177}, 'abbot': {1: 46, 2: 387, 3: 173}, 'abcee': {1: 33, 2: 446, 3: 198}, 'abeam': {1: 36, 2: 226, 3: 145}, 'abear': {1: 26, 2: 185, 3: 106}, 'abele': {1: 27, 2: 188, 3: 141}, 'abers': {1: 24, 2: 216, 3: 103}, 'abets': {1: 27, 2: 228, 3: 150}, 'abhor': {1: 41, 2: 375, 3: 171}, 'abide': {1: 3

In [None]:
!jupyter nbconvert --to python --template python_clean process_dictionary.ipynb
