# Implementing DQN with embeddings

## Input Convex Network

The forward pass computes $Q(s,a)$ and maximizes it with respect to one of its inputs (the action $a$) using L-BFGS.

In [1]:
from dqn.model import ConvexQNetwork


# Build the input-convex Q-network used by the demo cells of this section.
model = ConvexQNetwork(
    state_size=300,   # width of the state vector; matches the torch.randn(3, 300) batch fed to forward
    emb_size=3,       # presumably the action-embedding width (the returned `a` has 3 columns) - TODO confirm
    hidden_size=256,
    optim_steps=20    # presumably the number of L-BFGS steps per forward pass - TODO confirm
)

In [3]:
import torch


# Run the network on a random batch of 3 states with 300 features each; the
# returned pair is unpacked into `qfunc` and `a`.
# NOTE(review): the input is unseeded, so the displayed outputs change on every run.
# NOTE(review): if ConvexQNetwork is a torch.nn.Module, calling `model(...)` instead
# of `.forward` would also run registered hooks - confirm before changing.
qfunc, a = model.forward(torch.randn(3, 300))

In [4]:
# First output of the forward pass (the Q-value per the section intro); it
# displays as a (3, 1) tensor, i.e. one row per input state.
qfunc

tensor([[ 2.0559],
        [10.5624],
        [ 2.9934]], grad_fn=<MulBackward0>)

In [5]:
# Second output of the forward pass; it displays as a (3, 3) tensor -
# presumably the maximizing action embedding for each state (TODO confirm).
a

tensor([[ -0.9577,  -0.5398,  -3.1173],
        [ -9.1446,  -9.7397, -10.8742],
        [ -1.4427,  -1.6735,  -4.2116]])

In [6]:
# Inspect the `conv` attribute of the network: it displays as nested lists of
# 20 floats each - presumably the per-state optimisation trace, one value per
# L-BFGS step (TODO confirm).
# NOTE(review): the full dump is long; consider showing only a slice of it.
model.conv

[[7.000731468200684,
  -0.034524381160736084,
  -0.12732195854187012,
  -0.22011947631835938,
  -0.3129172921180725,
  -0.4057149887084961,
  -0.49851250648498535,
  -0.5913101434707642,
  -0.6718336343765259,
  -0.8345907926559448,
  -0.9973480701446533,
  -1.1601054668426514,
  -1.3228628635406494,
  -1.4856199026107788,
  -1.6483771800994873,
  -1.7574591636657715,
  -1.8171570301055908,
  -1.876854658126831,
  -1.9365522861480713,
  -1.9962501525878906],
 [319.8938903808594,
  197.17987060546875,
  -9.929808616638184,
  -9.96495532989502,
  -10.000100135803223,
  -10.035244941711426,
  -10.070390701293945,
  -10.105536460876465,
  -10.140681266784668,
  -10.175826072692871,
  -10.210970878601074,
  -10.246116638183594,
  -10.281262397766113,
  -10.316407203674316,
  -10.35155200958252,
  -10.386697769165039,
  -10.421843528747559,
  -10.456988334655762,
  -10.492134094238281,
  -10.527278900146484],
 [13.005784034729004,
  1.4321315288543701,
  0.3737785220146179,
  -1.026635050773

## word2vec embeddings

Given a word, predict another word that shares letters with it.

### torch dataset

In [102]:
from environment.action import WordPairsDataset
from wordle.wordlenp import Wordle
import numpy as np

# Read the list of allowed guesses; `astype=np.array` asks the loader for a NumPy array.
vocabulary = Wordle._load_vocabulary(
    'wordle/guesses.txt',
    astype=np.array,
)

# Word-pair dataset over that vocabulary.
# NOTE(review): generate=False presumably reuses data already stored under
# 'word_pairs_dataset' instead of regenerating it - confirm.
data = WordPairsDataset(
    vocabulary,
    'word_pairs_dataset',
    generate=False,
)

In [108]:
# Show the dataset size together with one sample item (index 2).
len(data), data[2]

204834634

### model

In [2]:
from environment.action import Embedding


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Embedding network moved to the chosen device, a summed (not averaged)
# cross-entropy loss, and Adam with its default hyper-parameters.
model = Embedding()
model = model.to(device)
loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters())

### train

In [120]:
from torch.utils.data import DataLoader


# Shuffled mini-batches of 64 word pairs, loaded by two worker processes.
dataloader = DataLoader(data, batch_size=64, shuffle=True, num_workers=2)

# One pass over the loader using the model's own training routine.
model.train_epoch(
    dataloader,
    loss_fn,
    optimizer,
    device,
)

cpu


### save

In [None]:
# Persist only the weights (state_dict) of `model` to 'embedding_model.pth'.
torch.save(model.state_dict(), 'embedding_model.pth')

# LET'S GO

In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

from collections import defaultdict
import pickle

from wordle.wordlenp import Wordle
from environment.environment import Environment, StateYesNo, StateVocabulary
from environment.action import ActionEmbedding, ActionLetters
from dqn.agent import Agent
from dqn.train import Trainer
from replay_buffer.cpprb import PrioritizedReplayBuffer, ReplayBuffer

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to the figures created in this notebook.
sns.set(style='whitegrid')
import torch
import numpy as np

# Seed every random number generator the experiments rely on. NumPy drives the
# vocabulary sampling in this notebook; torch drives weight initialisation and
# any torch-side sampling, so seeding NumPy alone does not make runs repeatable.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Displayed as the cell output: the device string the current machine would use.
"cuda:0" if torch.cuda.is_available() else "cpu"

'cpu'

In [2]:
# Full list of allowed guesses; `astype=np.array` asks the loader for a NumPy array.
word_list = Wordle._load_vocabulary(
    'wordle/guesses.txt',
    astype=np.array,
)

def make_data(n_answers, n_guesses, vocabulary=None):
    """Sample a Wordle sub-task: a guess list and an answer list drawn from it.

    Args:
        n_answers: number of distinct answer words to draw from the guesses.
        n_guesses: number of distinct guess words to draw from the vocabulary.
        vocabulary: array-like of candidate words. Defaults to the
            notebook-level ``word_list`` when omitted.

    Returns:
        Tuple ``(answers, guesses)`` of NumPy arrays; every answer is also a
        guess and neither array contains repeated draws.

    Raises:
        ValueError: if ``n_answers`` exceeds ``n_guesses``, or if
            ``n_guesses`` exceeds the vocabulary size (raised by NumPy).
    """
    if vocabulary is None:
        # Keep the original behaviour: fall back to the notebook-wide word list.
        vocabulary = word_list
    if n_answers > n_guesses:
        # Fail before touching the RNG, with a message that names the arguments.
        raise ValueError(
            f'n_answers ({n_answers}) cannot exceed n_guesses ({n_guesses})'
        )

    # Sampling uses NumPy's global RNG, so results follow np.random.seed.
    guesses = np.random.choice(vocabulary, size=n_guesses, replace=False)
    answers = np.random.choice(guesses, size=n_answers, replace=False)
    return answers, guesses

# Two tasks of different difficulty: 10 answers out of 100 guesses, and a task
# where all 100 guesses are also possible answers.
answers_10_100, guesses_10_100 = make_data(n_answers=10, n_guesses=100)
answers_100_100, guesses_100_100 = make_data(n_answers=100, n_guesses=100)

# Reward table handed to each Environment.
# NOTE(review): 'B'/'Y'/'G' are presumably the black/yellow/green letter
# feedback of Wordle - confirm against Environment.
step_rewards = {
    'B': 0,
    'Y': 1,
    'G': 1,
    'win': 10,
    'lose': -10,
    'step': -5,
}

# Nested dict for collecting results per task.
tasks_results = defaultdict(dict)

In [3]:
def experiment(answers, guesses, n_batches, n_batches_warm,
               n_envs=8, emb_size=10, batch_size=4, optimize_interval=2,
               eps_decay=0.99):
    """Train a DQN agent with embedding actions on one Wordle task.

    Args:
        answers: words that may be the hidden answer.
        guesses: words the agent is allowed to play (the action vocabulary).
        n_batches: value forwarded to ``Trainer(n_batches=...)``.
        n_batches_warm: value forwarded to ``Trainer(n_batches_warm=...)``.
        n_envs: how many identical environments to construct (default 8).
            Only the first one is passed to the trainer, which is created
            with ``is_parallel=False``.
        emb_size: value forwarded to ``ActionEmbedding(emb_size=...)``.
        batch_size: value forwarded to ``ReplayBuffer(batch_size=...)``.
        optimize_interval: value forwarded to ``Agent(optimize_interval=...)``.
        eps_decay: value forwarded to ``Trainer.train(eps_decay=...)``.

    Returns:
        Whatever ``Trainer.train`` returns.

    Raises:
        ValueError: if ``n_envs`` is smaller than 1.
    """
    if n_envs < 1:
        raise ValueError('n_envs must be at least 1')

    # Uses the notebook-level `step_rewards` table for every environment.
    env_list = [
        Environment(
            rewards=step_rewards,
            wordle=Wordle(vocabulary=guesses, answers=answers),
            state_instance=StateYesNo(),
        )
        for _ in range(n_envs)
    ]

    # Size the agent from the environment that is actually trained on,
    # instead of relying on a loop variable that outlives its loop.
    train_env = env_list[0]
    state_size = train_env.state.size

    agent = Agent(
        state_size=state_size,
        action_instance=ActionEmbedding(vocabulary=guesses, emb_size=emb_size),
        replay_buffer=ReplayBuffer(state_size=state_size, batch_size=batch_size),
        optimize_interval=optimize_interval,
    )

    trainer = Trainer(
        train_env, agent,
        n_batches=n_batches,
        n_batches_warm=n_batches_warm,
        is_parallel=False,
    )

    # The nickname encodes the task size, e.g. 'embtest-10-100'.
    return trainer.train(
        eps_decay=eps_decay,
        nickname=f'embtest-{len(answers)}-{len(guesses)}',
    )

In [4]:
# Run on the 10-answer / 100-guess task; the return value is shown as the cell output.
experiment(
    answers=answers_10_100,
    guesses=guesses_10_100,
    n_batches=200,
    n_batches_warm=50,
)

WARM BATCHES:   0%|          | 0/50 [00:00<?, ?it/s]

TRAIN BATCHES:   0%|          | 0/200 [00:00<?, ?it/s]

  mean_steps = steps[success.astype(bool)].mean()
  ret = ret.dtype.type(ret / rcount)



Batch   25	Time: 12 s	Agent Eps: 0.78	Train Win Rate: 0.00%	Test Win Rate: 0.00%	Test Mean Steps: nan

Batch   50	Time: 26 s	Agent Eps: 0.61	Train Win Rate: 4.00%	Test Win Rate: 0.00%	Test Mean Steps: nan

Batch   75	Time: 41 s	Agent Eps: 0.47	Train Win Rate: 4.00%	Test Win Rate: 0.00%	Test Mean Steps: nan


KeyboardInterrupt: 