# Words with more than 5 letters

## Create Dataset

In [10]:
# Bucket the vocabulary by word length: len2list[n] holds every n-letter word.
len2list = [list() for _ in range(32)]
with open('wordle/vocabulary.txt', 'r') as vocabulary_file:
    for line in vocabulary_file:
        word = line.rstrip()
        if not word:
            # A blank line is not a word; skipping it avoids a bogus
            # zero-letter bucket.
            continue
        # Grow the table on demand so a word of 32+ letters cannot raise
        # IndexError.
        while len(len2list) <= len(word):
            len2list.append([])
        len2list[len(word)].append(word)

# Print "<length> <count>" for every word length that actually occurs.
for i, lst in enumerate(len2list):
    val = len(lst)
    if val != 0:
        print(i, val)

1 26
2 427
3 2130
4 7186
5 15920
6 29874
7 41998
8 51627
9 53402
10 45872
11 37539
12 29125
13 20944
14 14149
15 8846
16 5182
17 2967
18 1471
19 760
20 359
21 168
22 74
23 31
24 12
25 8
27 3
28 2
29 2
31 1


In [11]:
from wordle.wordlenp import Wordle


# Load both reference word lists as plain Python lists.
gue, ans = (
    Wordle._load_vocabulary(path, astype=list)
    for path in ('wordle/guesses.txt', 'wordle/answers.txt')
)

# Size of the answer list relative to the guess list; shown as the cell output.
ratio = len(ans) / len(gue)
ratio

0.1784613012642615

In [14]:
import numpy as np
from math import ceil


# For every word length that occurs in the vocabulary, write two files:
#   wordle/guesses-<n>.txt : all words of that length
#   wordle/answers-<n>.txt : a sorted random sample of them, sized by `ratio`
for i, lst in enumerate(len2list):
    if not lst:
        continue

    guesses = np.array(lst)
    # Round up so that every non-empty length gets at least one answer.
    n_answers = ceil(len(guesses) * ratio)
    # Sample without replacement so that no answer is duplicated.
    answers_ind = np.random.choice(len(guesses), size=n_answers, replace=False)
    answers = np.sort(guesses[answers_ind])

    # Context managers guarantee the files are flushed and closed.
    with open(f'wordle/guesses-{i}.txt', 'w') as guesses_out:
        guesses_out.write('\n'.join(guesses))
    with open(f'wordle/answers-{i}.txt', 'w') as answers_out:
        answers_out.write('\n'.join(answers))

## Load Data

In [23]:
import bisect


def load(n_letters):
    """Load the guess/answer vocabularies for words of ``n_letters`` letters.

    Reads ``wordle/answers-<n_letters>.txt`` and
    ``wordle/guesses-<n_letters>.txt`` through ``Wordle._load_vocabulary``.

    Returns a 5-tuple ``(wordle_list, guesses, answers, indices, in_answers)``:
      * ``wordle_list`` - the guesses converted to a plain list,
      * ``guesses``     - the guesses as loaded with ``astype=np.array``,
      * ``answers``     - the answers as loaded with ``astype=list``,
      * ``indices``     - ``arange(len(guesses))`` shuffled after seeding the
                          global NumPy RNG with 0,
      * ``in_answers``  - positions in ``guesses`` whose word is found in
                          ``answers``.

    The membership lookup is a binary search, so it relies on ``answers``
    being sorted.
    """
    answers = Wordle._load_vocabulary(f'wordle/answers-{n_letters}.txt', astype=list)
    guesses = Wordle._load_vocabulary(f'wordle/guesses-{n_letters}.txt', astype=np.array)

    def is_answer(candidate):
        # Binary search: the insertion point holds the word iff it is present.
        pos = bisect.bisect_left(answers, candidate)
        return pos < len(answers) and answers[pos] == candidate

    in_answers = [pos for pos, candidate in enumerate(guesses) if is_answer(candidate)]
    wordle_list = guesses.copy().tolist()

    # Reseed the global RNG so the shuffled order is the same on every call.
    np.random.seed(0)
    indices = np.arange(len(guesses))
    np.random.shuffle(indices)

    return wordle_list, guesses, answers, indices, in_answers

In [24]:
from collections import defaultdict
from glob import glob
import re

# Per-word-length containers, keyed by the number of letters.
wordle_list = defaultdict(list)
guesses = defaultdict(list)
answers = defaultdict(list)
indices = defaultdict(list)
in_answers = defaultdict(list)

answers_files = glob('wordle/answers-*.txt')
guesses_files = glob('wordle/guesses-*.txt')

# glob() gives no ordering guarantee, so zipping the two listings can pair
# unrelated files and silently drop lengths when the counts differ. Instead,
# take the word length from each file name ("...-<n>.txt") and load exactly
# the lengths for which both files exist.
answers_lengths = {int(re.split(r'[-\.]', path)[-2]) for path in answers_files}
guesses_lengths = {int(re.split(r'[-\.]', path)[-2]) for path in guesses_files}

for n_letters in sorted(answers_lengths & guesses_lengths):
    # load() returns its values in the same order as the containers below.
    for lst, dct in zip(load(n_letters), [wordle_list, guesses, answers, indices, in_answers]):
        dct[n_letters] = lst

In [None]:
def train_test_split(n_guesses, overfit, indices, in_answers):
    """Split the first ``n_guesses`` shuffled words into train/test answers.

    The words are read from the module-level ``guesses``, which must support
    NumPy-style indexing with an index sequence.
    NOTE(review): elsewhere in this notebook ``guesses`` is bound to a dict
    keyed by word length; this function needs the array for a single length.
    Confirm which binding is intended before calling.

    Args:
        n_guesses: how many leading entries of ``indices`` to use.
        overfit: if true, the train answers are the same words as the test
            answers; otherwise they are the selected words that are not
            answers.
        indices: ordering of positions into ``guesses``.
        in_answers: positions in ``guesses`` that are answer words.

    Returns:
        ``(train_answers_cur, test_answers_cur, guesses_cur)``.
    """
    selected = indices[:n_guesses]
    guesses_cur = guesses[selected]

    # Build the lookup once: testing membership against the original list is
    # a linear scan per element, i.e. quadratic overall.
    answer_positions = set(in_answers)

    train_indices = []
    test_indices = []
    for i_guess in selected:
        if i_guess in answer_positions:
            test_indices.append(i_guess)
        else:
            train_indices.append(i_guess)

    test_answers_cur = guesses[test_indices]
    if overfit:
        # Train on exactly the words that will be evaluated.
        train_answers_cur = guesses[test_indices]
    else:
        train_answers_cur = guesses[train_indices]

    print(
        f'guesses: {len(guesses_cur)}',
        f'train answers: {len(train_answers_cur)}',
        f'test answers: {len(test_answers_cur)}' + (' (overfit strategy)' if overfit else ''),
        sep='\n'
    )

    return train_answers_cur, test_answers_cur, guesses_cur

## Train Configuration

In [None]:
%load_ext autoreload
%autoreload 2

from functools import partial
from collections import defaultdict
import pickle

from wordle.wordlenp import Wordle
from environment.environment import Environment, StateYesNo, StateVocabulary
from environment.action import ActionVocabulary, ActionLetters, ActionCombLetters
from dqn.agent import Agent
from dqn.train import Trainer
from replay_buffer.cpprb import PrioritizedReplayBuffer, ReplayBuffer

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
import torch
import numpy as np
# Seed NumPy's global random number generator with 0.
np.random.seed(0)

# Bare expression: evaluates to the device string ("cuda:0" when CUDA is
# available, otherwise "cpu"). The value is not assigned to any name here.
"cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Build the `ohe_matrix` of ActionCombLetters for k=1 and k=2 and show their
# shapes.
# NOTE(review): `wordle_list` is passed as the vocabulary as-is - confirm that
# ActionCombLetters accepts the container built by the loading cell.
ohe1, ohe2 = (
    ActionCombLetters(vocabulary=wordle_list, k=comb_size).ohe_matrix
    for comb_size in (1, 2)
)
print(ohe1.shape, ohe2.shape)

# Reward table and a nested results store (outer key -> dict).
step_rewards = {
    'B': 0,
    'Y': 1,
    'G': 1,
    'win': 10,
    'lose': -10,
    'step': -5,
}
tasks_results = defaultdict(dict)

In [None]:
def get_data(n_guesses, n_letters, overfit):
    """Run ``Trainer.train_test_split`` on the data for ``n_letters``-letter words.

    Looks up the module-level ``guesses``, ``indices`` and ``in_answers``
    entries for ``n_letters`` and forwards them, together with ``n_guesses``
    and ``overfit``, returning whatever the trainer's split returns.
    """
    guesses_n = guesses[n_letters]
    indices_n = indices[n_letters]
    in_answers_n = in_answers[n_letters]
    return Trainer.train_test_split(n_guesses, overfit, guesses_n, indices_n, in_answers_n)

## Example for 7 letters

In [None]:
# Example setup: 2000 guesses of 7-letter words, with the overfit split.
n_guesses, n_letters = 2000, 7
data = get_data(n_guesses=n_guesses, n_letters=n_letters, overfit=True)

In [None]:
# Start a training run on the split prepared above; the call's return value
# is kept in `nickname`.
# NOTE(review): the meaning of each keyword is defined by
# Trainer.train_comb_letters, which is not visible here - the hints below are
# assumptions to confirm.
nickname = Trainer.train_comb_letters(
    data=data,
    
    # presumably the total and warm-up batch counts - confirm
    n_batches=40000,
    n_batches_warm=10,
    
    # presumably the exploration (epsilon) schedule: start, floor, decay - confirm
    eps_start=1,
    eps_end=0.01,
    eps_decay=0.95,
    
    n_envs=8,
    # presumably the same `k` as in ActionCombLetters(k=...) - confirm
    k=1, 
    optimize_interval=8,

    # No agent path is supplied for this run.
    agent_path=None
)