# process_dictionary

Process a word list (a plain-text file with one word per line) and extract all words whose length is between `min_word_length` and `max_word_length`, inclusive.

This produces a json file that contains the following:

* dictionary of all matching words with
    - score for 1, 2, 3 letter frequency matches 


In [60]:
# path of the word list to process; it is read as plain text, one word per line
word_list = './valid-wordle-words.txt'
# inclusive bounds on the length of the words that are kept
min_word_length = 5
max_word_length = 5
# letter combinations of length 1..max_letter_combinations are counted and scored
max_letter_combinations = 3
# number of decimal places kept when a combination's share of the total is rounded
rounding_places = 8


In [153]:
import json
import random
from collections import OrderedDict
from operator import getitem
from itertools import product 
import csv

In [73]:
# local
from pathlib import Path
# wrap the configured word list location in a Path; this is the file opened for reading
word_file = Path(word_list)


In [4]:
def range_char(start, stop):
    '''Return a generator over the characters from `start` through `stop`, inclusive.

    Characters are produced in code-point order, so the range is only
    meaningful when both ends share a case (upper:upper or lower:lower).'''
    first_code = ord(start)
    last_code = ord(stop)
    return (chr(code) for code in range(first_code, last_code + 1))

In [207]:
def sample_dict(d, n):
    '''Return a new dict holding `n` randomly chosen items of `d`.

    The keys are picked by `random.sample` from the sorted list of keys,
    and each picked key keeps the value it has in `d`.'''
    chosen_keys = random.sample(sorted(d), n)
    return {key: d[key] for key in chosen_keys}

In [165]:
def flatten_dict(d, h):
    '''Turn a dict of dicts into a list of flat row dicts.

    Each row starts with the outer key stored under the column name `h`
    and is then updated with that key's inner dict, so an inner entry
    named `h` overrides the outer key.'''
    def _row(key, values):
        row = {h: key}
        row.update(values)
        return row

    return [_row(key, values) for key, values in d.items()]

In [80]:
# map every line of the word list, stripped of surrounding whitespace, to its length
all_words = {}
with open(word_file, 'r') as f:
    for raw_line in f:
        entry = raw_line.strip()
        all_words[entry] = len(entry)
    
    

In [82]:
# Re-key the loaded words shortest-first. `sorted` is stable, so words of equal
# length keep the order in which they were read from the word list.
# The source mapping is `all_words` (word -> length), built when the word list
# is loaded; `all_words_dict` is not defined anywhere in this notebook.
new_d = {k: all_words[k] for k in sorted(all_words, key=len)}


In [84]:
# keep only the words whose length lies within the configured inclusive bounds;
# each accepted word gets an empty dict that later receives its scores
accepted_words = {}
for candidate in new_d:
    size = len(candidate)
    if size > max_word_length:
        # new_d is ordered shortest-first, so no later word can match
        break
    if size >= min_word_length:
        accepted_words[candidate] = {}

In [9]:
# Build one "letter space" per combination length 1..max_letter_combinations.
# Each letter space maps every a..z combination of that length (cartesian
# product, so repeats such as 'aa' are included) to its zeroed statistics.
c_product_dicts = {}
for size in range(1, max_letter_combinations + 1):
    c_product_dicts[size] = {
        ''.join(letters): {'total': 0, 'percent': 0, 'score': None}
        for letters in product(range_char('a', 'z'), repeat=size)
    }

In [90]:

# tally every slice of length 1..max_letter_combinations found in each accepted
# word; overlapping slices are all counted
for word in accepted_words:
    word_len = len(word)
    for start in range(word_len):
        for size in range(1, max_letter_combinations + 1):
            end = start + size
            # slices that would run past the end of the word are skipped
            if end <= word_len:
                c_product_dicts[size][word[start:end]]['total'] += 1

In [129]:
# For every letter space: total up the occurrences, turn each combination's
# total into a share of that sum ('percent'), rank the distinct shares, and
# store each combination's rank as its 'score'.
dictionary_bins = {}
for dictionary, letter_space in c_product_dicts.items():
    # sum up the total occurrences of each value in the letter space
    space_total = sum(value['total'] for value in letter_space.values())

    # assign a percent score for each value; a letter space with no
    # occurrences at all gets 0.0 everywhere instead of dividing by zero
    for value in letter_space.values():
        if space_total:
            value['percent'] = round(value['total'] / space_total, rounding_places)
        else:
            value['percent'] = 0.0

    # bins are the distinct percent values, highest first
    bins = sorted({value['percent'] for value in letter_space.values()}, reverse=True)
    dictionary_bins[dictionary] = {'total': space_total, 'bins': bins}

    # re-order the space by highest percent value first; the per-combination
    # dicts are shared with `letter_space`, so the scores set below show up
    # in the re-ordered space as well
    c_product_dicts[dictionary] = OrderedDict(
        sorted(letter_space.items(), key=lambda x: getitem(x[1], 'percent'), reverse=True))

    # score each value in letter_space using bin index (lower scores are better);
    # a one-off percent -> index map replaces a list search per combination
    bin_index = {percent: index for index, percent in enumerate(bins)}
    for value in letter_space.values():
        value['score'] = bin_index[value['percent']]

In [98]:
# consider stripping out all values in letter space with score of 0

In [170]:
# pull N random words from dictionary

# test_words = sample_dict(accepted_words, 10)
# test_words

In [177]:
# calculate the score of each word for each letter space: the sum, over the
# combinations the word contains, of the combination's score multiplied by
# word.count(combination) (str.count counts non-overlapping occurrences).
# Combinations that do not occur in the word contribute 0, so only the word's
# own distinct slices are visited rather than every combination in the space.
for word, data in accepted_words.items():
    for dictionary, letter_space in c_product_dicts.items():
        # letter spaces are keyed by combination length, so `dictionary` is
        # also the slice width; the range is empty when the word is shorter
        slices = {word[i:i + dictionary] for i in range(len(word) - dictionary + 1)}
        score = 0
        for item in slices:
            entry = letter_space.get(item)
            # slices outside the letter space never contributed to the score
            if entry is not None:
                score += entry['score'] * word.count(item)
        data[dictionary] = score

In [286]:
# create CSV friendly rendering of frequency data: one flat row per letter
# combination (all letter spaces concatenated) and one flat row per word
freqeuncy_csv = []
for values in c_product_dicts.values():
    freqeuncy_csv.extend(flatten_dict(values, 'letter'))
accepted_words_csv = flatten_dict(accepted_words, 'word')

# remove all words with duplicate characters
accepted_words_no_dupes_csv = [
    row for row in accepted_words_csv
    if len(set(row['word'])) == len(row['word'])
]


def _score_sort_key(row):
    '''Return the row's scores ordered by combination length (1, 2, ...).'''
    # follows the configured number of combination lengths instead of assuming
    # exactly three, so a smaller max_letter_combinations does not raise KeyError
    return tuple(row[length] for length in range(1, max_letter_combinations + 1))


# sort based on lowest score, comparing the 1-letter score first, then 2-letter, ...
accepted_words_sorted_csv = sorted(accepted_words_csv, key=_score_sort_key)
accepted_words_no_dupes_sorted_csv = sorted(accepted_words_no_dupes_csv, key=_score_sort_key)


In [287]:
# display the sorted rows for words without repeated letters
accepted_words_no_dupes_sorted_csv

[{'word': 'oreas', 1: 10, 2: 65, 3: 85},
 {'word': 'arose', 1: 10, 2: 74, 3: 92},
 {'word': 'seora', 1: 10, 2: 225, 3: 165},
 {'word': 'arise', 1: 12, 2: 55, 3: 83},
 {'word': 'serai', 1: 12, 2: 76, 3: 112},
 {'word': 'aries', 1: 12, 2: 84, 3: 111},
 {'word': 'osela', 1: 12, 2: 85, 3: 115},
 {'word': 'alose', 1: 12, 2: 86, 3: 114},
 {'word': 'raise', 1: 12, 2: 101, 3: 141},
 {'word': 'solea', 1: 12, 2: 180, 3: 97},
 {'word': 'aloes', 1: 12, 2: 203, 3: 174},
 {'word': 'aesir', 1: 12, 2: 297, 3: 203},
 {'word': 'lares', 1: 13, 2: 16, 3: 28},
 {'word': 'rales', 1: 13, 2: 21, 3: 71},
 {'word': 'seral', 1: 13, 2: 28, 3: 114},
 {'word': 'laser', 1: 13, 2: 35, 3: 89},
 {'word': 'lears', 1: 13, 2: 137, 3: 68},
 {'word': 'reals', 1: 13, 2: 138, 3: 121},
 {'word': 'slare', 1: 13, 2: 147, 3: 69},
 {'word': 'arles', 1: 13, 2: 173, 3: 128},
 {'word': 'arsle', 1: 13, 2: 233, 3: 152},
 {'word': 'aotes', 1: 13, 2: 270, 3: 134},
 {'word': 'earls', 1: 13, 2: 292, 3: 131},
 {'word': 'stoae', 1: 13, 2: 34

In [280]:
# output file name -> rows to write to it
csv_files = {
    'frequency.csv': freqeuncy_csv,
    'accepted_words.csv': accepted_words_csv,
    'accepted_words_sorted.csv': accepted_words_sorted_csv,
    # the *_sorted file must carry the sorted rows, not the unsorted list
    'accepted_words_no_dupes_sorted.csv': accepted_words_no_dupes_sorted_csv,
}

In [205]:
# write out CSV files
# The csv module writes its own line terminators, so the files are opened with
# newline='' to stop the text layer translating them again (which produces
# blank rows between records on platforms that map '\n' to '\r\n').
for csv_name, rows in (('frequency.csv', freqeuncy_csv),
                       ('accepted_words.csv', accepted_words_csv)):
    with open(csv_name, 'w', newline='') as csv_file:
        # the column headers come from the keys of the first row
        writer = csv.DictWriter(csv_file, delimiter=',', fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)

In [211]:
# pull 10 random words (with their score dicts) from the accepted words
test_dict = sample_dict(accepted_words, 10)

In [213]:
# flatten a fresh random sample of 10 accepted words into row dicts keyed by
# 'word', then display the rows
flat_test = flatten_dict(sample_dict(accepted_words, 10), 'word')
flat_test

[{'word': 'yucca', 1: 43, 2: 583, 3: 209},
 {'word': 'flang', 1: 50, 2: 205, 3: 66},
 {'word': 'devon', 1: 43, 2: 371, 3: 218},
 {'word': 'mould', 1: 41, 2: 350, 3: 178},
 {'word': 'japer', 1: 42, 2: 258, 3: 130},
 {'word': 'engin', 1: 39, 2: 196, 3: 194},
 {'word': 'veeps', 1: 39, 2: 306, 3: 189},
 {'word': 'fetes', 1: 30, 2: 188, 3: 146},
 {'word': 'shack', 1: 46, 2: 217, 3: 115},
 {'word': 'delim', 1: 35, 2: 162, 3: 132}]

In [221]:
# sort the sampled rows ascending by their 1-, then 2-, then 3-letter scores
nl = sorted(flat_test, key=lambda d: (d[1], d[2], d[3]))

In [288]:
# display the letter space for single letters
c_product_dicts[1]

OrderedDict([('a', {'total': 33568, 'percent': 0.10544038, 'score': 0}),
             ('e', {'total': 31200, 'percent': 0.09800226, 'score': 1}),
             ('s', {'total': 26148, 'percent': 0.08213343, 'score': 2}),
             ('o', {'total': 20876, 'percent': 0.06557356, 'score': 3}),
             ('r', {'total': 20572, 'percent': 0.06461867, 'score': 4}),
             ('i', {'total': 20268, 'percent': 0.06366378, 'score': 5}),
             ('l', {'total': 16984, 'percent': 0.05334841, 'score': 6}),
             ('t', {'total': 16756, 'percent': 0.05263224, 'score': 7}),
             ('n', {'total': 16172, 'percent': 0.05079784, 'score': 8}),
             ('u', {'total': 13444, 'percent': 0.04222892, 'score': 9}),
             ('d', {'total': 11244, 'percent': 0.03531851, 'score': 10}),
             ('c', {'total': 10976, 'percent': 0.03447669, 'score': 11}),
             ('y', {'total': 10084, 'percent': 0.03167483, 'score': 12}),
             ('m', {'total': 9976, 'percent': 0.