In [1]:
import os
import json
import numpy as np
import random
from collections import Counter, defaultdict
from tqdm import tqdm
import time
import math

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.dpi'] = 300



import torch
import torch.nn as nn
import torch.optim as optim
from models.transformer_new import *

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myuqinzhou[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Creat Data

In [35]:
SOS_token = 0
EOS_token = 1
pad_idx = 2

class Format:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS", pad_idx: "pad"}
        self.n_words = 3  

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
            self.word2count[word] = 1
        else:
            self.word2count[word] += 1


def readFile(filename):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(filename, encoding='utf-8').read().strip().split('\n')

    # exclude "IN: "
    pairs = [s[4:].split(' OUT: ') for s in lines]

    input_lang = Format("input")
    output_lang = Format("output")

    return input_lang, output_lang, pairs
  
def prepareData(filename):
    input_lang, output_lang, pairs = readFile(filename)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


# obtain word indices in a sentence
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


# transform to tensor format and add a special token
def tensorFromSentence_input(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    # indexes.insert(0, SOS_token)
    # indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorFromSentence_output(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.insert(0, SOS_token)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

# for input and target (one-hot vectors)
def tensorsFromPair(pair):
    input_tensor = tensorFromSentence_input(input_lang, pair[0])
    target_tensor = tensorFromSentence_output(output_lang, pair[1])
    return (input_tensor, target_tensor)


def calculate_mean_std(acc_dict):
    mean = []
    error = []
    keys = sorted(acc_dict[0])
    num_runs = len(acc_dict)
    
    for key in keys:
        t = []
        for d in acc_dict:
            t.append(d[key])
        mean.append(np.mean(t))
        error.append(np.std(t) / np.sqrt(num_runs))
    return np.array(mean), np.array(error), keys


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [36]:
%pwd

'/Users/zhouyuqin/Desktop/ATNLP/transformer_scan'

In [42]:
# input_lang, output_lang, pairs = prepareData("./data/scan/simple_split/tasks_train_simple.txt")
# input_lang_test, output_lang_test, pairs_test = prepareData("./data/scan/simple_split/tasks_test_simple.txt")

# input_lang, output_lang, pairs = prepareData("./data/scan/simple_split/size_variations/tasks_train_simple_p16.txt")
# input_lang_test, output_lang_test, pairs_test = prepareData("./data/scan/simple_split/size_variations/tasks_test_simple_p16.txt")


# input_lang, output_lang, pairs = prepareData("/Users/zhouyuqin/Desktop/ATNLP/SCAN/add_prim_split/tasks_train_addprim_jump.txt")
# input_lang_test, output_lang_test, pairs_test = prepareData("/Users/zhouyuqin/Desktop/ATNLP/SCAN/add_prim_split/tasks_test_addprim_jump.txt")



input_lang, output_lang, pairs = prepareData("/Users/zhouyuqin/Desktop/ATNLP/SCAN/length_split/tasks_train_length.txt")
input_lang_test, output_lang_test, pairs_test = prepareData("/Users/zhouyuqin/Desktop/ATNLP/SCAN/length_split/tasks_test_length.txt")



print(random.choice(pairs))
print(len(pairs), type(pairs))

Reading lines...
Read 16990 sentence pairs
Counting words...
Counted words:
input 16
output 9
Reading lines...
Read 3920 sentence pairs
Counting words...
Counted words:
input 16
output 9
['turn opposite right twice and look opposite right twice', 'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_TURN_RIGHT I_LOOK']
16990 <class 'list'>


In [44]:
training_pairs = [tensorsFromPair(pair) for pair in pairs]
test_pairs = [tensorsFromPair(pair) for pair in pairs_test]

In [45]:
batch = training_pairs[4]

In [48]:
for i in batch[0]:
    print(input_lang.index2word[i.item()], end = ' ')

run 

In [49]:
for i in batch[1]:
    print(output_lang.index2word[i.item()], end = ' ')

SOS I_RUN EOS 

### Hyperparameters

In [50]:
class Lang:
  ''' 
   ### Training and Data ###
    num_runs: Number of runs to do.
    num_epochs: Number of training epochs
    
    ### Models ### 
    d_model: Dimension of inputs/outputs in transformer
    nhead: Number of heads in transformer multihead attention
    num_encoder_layers: Number of layers in transformer encoder
    num_decoder_layers: Number of layers in transformer decoder
    dim_feedforward: Dimension of feedforward layers in transformer
    dropout: Dropout rate
              
    ### Optimization ### 
    learning_rate: Fixed learning rate for Adam optimizer


    ### Output options ###
    results_dir: Results subdirectory to save results
    out_data_file: Name of output data file with training loss data
    checkpoint_path: Path to output saved weights.
    checkpoint_every: Epochs before evaluating model and saving weights
    record_loss_every: iters before printing and recording loss
  '''

  def __init__(self, num_runs = 1, num_epochs = 2, d_model = 32, nhead = 8, num_decoder_layers = 2, 
               num_encoder_layers = 2,  d_feedforward = 256, learning_rate = 0.0005,
               dropout = 0.1, checkpoint_every = 1, record_loss_every = 1000, clip = 5, teacher_forcing_ratio = 0):
    
    self.num_runs = num_runs
    self.num_epochs = num_epochs
    self.d_model = d_model
    self.nhead = nhead
    self.num_encoder_layers = num_encoder_layers
    self.num_decoder_layers = num_decoder_layers
    self.d_feedforward = d_feedforward
    self.learning_rate = learning_rate
    self.dropout = dropout
    self.checkpoint_every = checkpoint_every
    self.record_loss_every = record_loss_every
    self.clip = clip
    self.teacher_forcing_ratio = teacher_forcing_ratio

In [51]:
args = Lang()

In [52]:
def train(args):
    model = Seq2Seq(Encoder(input_lang.n_words, 
                        args.d_model, 
                        args.num_encoder_layers, 
                        args.nhead, 
                        args.d_feedforward, 
                        args.dropout, 
                        device), 
                Decoder(output_lang.n_words, 
                        args.d_model, 
                        args.num_decoder_layers, 
                        args.nhead, 
                        args.d_feedforward, 
                        args.dropout, 
                        device), pad_idx, pad_idx, device).to(device)
                        
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr = args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(args.num_epochs):
        start_time = time.time()
        epoch_loss = 0
        print_loss_total = 0 

        for iter, batch in enumerate(training_pairs):
            loss = 0
            optimizer.zero_grad()

            ### Encoder ###
            src = batch[0].T
            src_mask = model.make_src_mask(src)
            enc_src = model.encoder(src, src_mask)
            
            ### Decoder ### 
            trg = batch[1].T
            trg_in = trg[:,:-1] ##[<SOS>, y_1, ..., y_2] 
            trg_out = trg[:,1:] ##[y_1,..., y_2, <EOS>]
            
            use_teacher_forcing = True if random.random() < args.teacher_forcing_ratio else False
            
            if not use_teacher_forcing:
                trg_mask = model.make_trg_mask(trg_in)
                output, _ = model.decoder(trg_in, enc_src, trg_mask, src_mask)       
                loss = criterion(output[0], trg_out.view(-1))    
                trg_indexes =  output[0].argmax(1)
            
            else:
                trg_indexes = [SOS_token]
                for i in range(trg_in.shape[1]):
                    trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
                    trg_mask = model.make_trg_mask(trg_tensor)
                    output, _ = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
                    # only consider the prediction at time t 
                    output = output[0][i]
                    
                    ### gready search
                    pred_token = output.argmax(0).item()
                    trg_indexes.append(pred_token)
                    
                    ## calculate loss
                    loss += criterion(output, trg_out[0][i])

                    # break if the new term is <EOS> 
                    if pred_token == EOS_token:
                        break

                trg_indexes = trg_indexes[1:]
                loss = loss / trg_in.shape[1]

            
            loss.backward()
            wandb.log({"train/loss": loss.data.item(), "train/iter": iter})
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            
            # Record loss
            print_loss_total += loss.data.item()
            epoch_loss += loss.data.item()
            if (iter + 1) % args.record_loss_every == 0:
                print_loss_avg = print_loss_total / args.record_loss_every
                print(f'Epoch: {epoch} | Iter: {iter} | Loss: {print_loss_avg:.3f} \n Target {trg_out[0]} \n Predict {trg_indexes} type: {use_teacher_forcing}') 
                print_loss_total = 0 
       
        ### Recode loss
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        epoch_loss_avg = epoch_loss / len(training_pairs)

        wandb.log({"train/epoch": epoch, "train/epoch_loss": epoch_loss_avg})
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {epoch_loss_avg:.3f}')

wandb

In [1]:
parameters_dict = {
    'dropout': {
          'values': [0.1]
        },
    'head': {
          'values': [6, 8, 10]
        },
    'layer': {
          'values': [1, 2]},
          
    'd_feedforward': {
          'values': [128, 256, 512]}
    }

metric = {
    'name': 'loss',
    'goal': 'minimize'   
    }

sweep_config = {
    'name': "len_split_hyper_test",
    'method': 'grid'
    }

sweep_config['parameters'] = parameters_dict
sweep_config['metric'] = metric

sweep_config

{'name': 'len_split_hyper_test',
 'method': 'grid',
 'parameters': {'dropout': {'values': [0.1]},
  'head': {'values': [6, 8, 10]},
  'layer': {'values': [1, 2]},
  'd_feedforward': {'values': [128, 256, 512]}},
 'metric': {'name': 'loss', 'goal': 'minimize'}}

In [55]:
sweep_id = wandb.sweep(sweep_config, project="ATNLP")

Create sweep with ID: gulveqfe
Sweep URL: https://wandb.ai/yuqinzhou/ATNLP/sweeps/gulveqfe


In [56]:
def train_sweep(config=None):
    with wandb.init(config=config):
        config = wandb.config
        args = Lang(dropout = config.dropout,
                    nhead = config.head,
                    d_model = config.head * 9,
                    num_encoder_layers = config.layer,
                    num_decoder_layers = config.layer,
                    d_feedforward = config.d_feedforward)
                    
        train(args)

In [57]:
wandb.agent(sweep_id, train_sweep)

[34m[1mwandb[0m: Agent Starting Run: qhn8f2hf with config:
[34m[1mwandb[0m: 	d_feedforward: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 6
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.249 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 1.037 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 1, 8, 1, 8]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.916 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 3, 6, 4, 1, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.908 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 6, 6, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.887 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 4, 4, 4]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.758 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.809 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 3, 8, 3, 8, 3, 3, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.790 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.028 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.036908…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,█▇▅▃▆▅▃▅▆▄▄▂▅▃▃▃▃▃▅▃▅▄▅▅▄▆▃▃▂▅▃▂▃▂▂▁▃▃▃▃

0,1
train/epoch,1.0
train/epoch_loss,0.60135
train/iter,16989.0
train/loss,0.03674


[34m[1mwandb[0m: Agent Starting Run: uf9tk9wd with config:
[34m[1mwandb[0m: 	d_feedforward: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 6
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.084 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.872 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.784 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.765 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 5, 1, 6, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.746 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([4, 8, 8, 4, 4, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.679 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.711 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 6, 8, 3, 8, 8, 1, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.689 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▇▄▅▃█▅▃▆▅▄▄▂▄▅▂▃▃▄▃▃▆▃▃▃▃▆▃▂▆▃▄▂▂▂▃▁▃▃▂▃

0,1
train/epoch,1.0
train/epoch_loss,0.43969
train/iter,16989.0
train/loss,0.22906


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6gsq21qi with config:
[34m[1mwandb[0m: 	d_feedforward: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 8
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.183 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 1, 5, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 1.017 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 8]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.928 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 3, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.910 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 1, 6, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.912 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 1, 8, 4, 8, 4, 8, 4, 8]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.806 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 8, 4, 6, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.823 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 3, 8, 3, 8, 3, 3, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.786 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▆▇▄▃▆▆▄▆█▃▃▂▆█▃▅▃▄▅▄▇▄▆▆▅▄▅▄▂▃▃▃▄▄▄▁▃▅▂▅

0,1
train/epoch,1.0
train/epoch_loss,0.63712
train/iter,16989.0
train/loss,0.14383


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 318r0l2f with config:
[34m[1mwandb[0m: 	d_feedforward: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 8
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.024 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.842 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 7, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.761 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.756 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 5, 5, 6, 5, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.735 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.686 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.687 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 3, 6, 3, 1, 8, 3, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.643 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▅▄▃▂█▅▃▅▇▅▃▂▃▃▃▃▂▂▃▃▄▄▆▃▃▃▄▁▃▃▃▂▂▂▁▁▂▄▂▂

0,1
train/epoch,1.0
train/epoch_loss,0.42357
train/iter,16989.0
train/loss,0.00882


[34m[1mwandb[0m: Agent Starting Run: py5jbhpl with config:
[34m[1mwandb[0m: 	d_feedforward: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 10
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.132 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.997 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 7, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.928 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 1, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.896 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 6, 6, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.908 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 8]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.765 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 8, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.827 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 6, 8, 6, 8, 6, 8, 8, 3, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.803 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▇▇▄▂█▄▆▆▆▅▂▃▅▆▄▄▆▅▄▃▇▆▆▆█▄▅▅▂▃▄▂▃▃▂▁▄▃▄▃

0,1
train/epoch,1.0
train/epoch_loss,0.64466
train/iter,16989.0
train/loss,0.05762


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8fwh9c8k with config:
[34m[1mwandb[0m: 	d_feedforward: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 10
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 0.990 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.849 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.760 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.757 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 5, 6, 6, 1, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.748 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.661 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.696 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 3, 8, 6, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.637 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▄▄▂▁▅▄▂▄▆▃▂▂▃▂▂▄▂▂▂▂▂▂▃▃▃▅▄▂▃▃▂▂▂▂▁▁▂█▂▂

0,1
train/epoch,1.0
train/epoch_loss,0.41793
train/iter,16989.0
train/loss,0.01424


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: azosibd2 with config:
[34m[1mwandb[0m: 	d_feedforward: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 6
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.226 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 1.017 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.946 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 1, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.886 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 1, 6, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.859 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 4, 4, 4]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.775 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.799 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 6, 8, 6, 8, 3, 8, 3, 3, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.778 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▇█▄▂█▃▂▆▅▅▃▂▅▂▃▃▃▄▃▃▆▄▃▄▅▃▄▃▂▃▄▂▃▂▃▁▃▅▂▄

0,1
train/epoch,1.0
train/epoch_loss,0.60549
train/iter,16989.0
train/loss,0.06491


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5exks045 with config:
[34m[1mwandb[0m: 	d_feedforward: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 6
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.061 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.884 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.764 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.748 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 5, 6, 6, 6, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.722 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.658 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.694 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([6, 6, 8, 6, 8, 3, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.611 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.076790…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,█▆▂▂█▄▃▅▇▄▃▃▄▄▃▂▂▂▅▃▄▄▄▃▃▃▃▂▃▄▃▂▄▁▁▁▂▃▁▁

0,1
train/epoch,1.0
train/epoch_loss,0.39995
train/iter,16989.0
train/loss,0.02154


[34m[1mwandb[0m: Agent Starting Run: zlaowr40 with config:
[34m[1mwandb[0m: 	d_feedforward: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 8
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.158 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 3, 3, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.993 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 8]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.881 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 6]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.874 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 5, 5, 6, 6, 6, 5, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.848 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 4, 4, 8]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.744 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.792 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 6, 8, 6, 8, 6, 8, 8, 3, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.759 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.028 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.036868…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▇█▄▂█▅▂▆▆▄▂▂▅▄▃▄▃▆▅▄▄▄▄▃▄▃▅▆▄▂▃▃▃▁▃▁▂▅▃▄

0,1
train/epoch,1.0
train/epoch_loss,0.59164
train/iter,16989.0
train/loss,0.0425


[34m[1mwandb[0m: Agent Starting Run: n7hv4hop with config:
[34m[1mwandb[0m: 	d_feedforward: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 8
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.028 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.865 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.754 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.749 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 6, 6, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.719 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.658 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.686 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 6, 8, 3, 8, 6, 8, 3, 3, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.632 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▅▅▂▁█▃▂▅▇▃▂▂▅▂▃▃▂▂▂▂▇▄▄▃▄▂▂▂▂▄▂▂▂▁▁▁▃▂▃▂

0,1
train/epoch,1.0
train/epoch_loss,0.40875
train/iter,16989.0
train/loss,0.01329


[34m[1mwandb[0m: Agent Starting Run: ihqr50kr with config:
[34m[1mwandb[0m: 	d_feedforward: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 10
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.134 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 3, 3, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.983 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 8]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.919 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 6]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.888 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 5, 5, 5, 6, 6, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.856 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 1, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.768 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 6, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.804 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 6, 8, 6, 8, 3, 6, 3, 3, 3]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.754 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▃▄▂▂▄▄▂▃▃▂▁▁▃▃▂▂▂▃▃▂█▃▂▃▅▅▃▁▃▂▂▂▂▂▁▂▂▃▁▂

0,1
train/epoch,1.0
train/epoch_loss,0.61944
train/iter,16989.0
train/loss,0.02765


[34m[1mwandb[0m: Agent Starting Run: l59ws02y with config:
[34m[1mwandb[0m: 	d_feedforward: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 10
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 0.990 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.845 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.762 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.744 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 5, 5, 6, 6, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.732 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([4, 8, 8, 4, 8, 4, 8, 4, 4]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.672 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 8, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.676 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 6, 6, 8, 3, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.622 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▇█▂▃█▅▃▅▅▃▃▃▅▃▃▃▃▂▂▂▄▂▅▄▄▃▇▃▂▃▃▂▃▂▁▁▁▅▂▂

0,1
train/epoch,1.0
train/epoch_loss,0.41235
train/iter,16989.0
train/loss,0.03713


[34m[1mwandb[0m: Agent Starting Run: 83xp6gxd with config:
[34m[1mwandb[0m: 	d_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 6
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.166 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.965 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 1, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.882 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.846 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 5, 5, 6, 6, 1, 5, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.848 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.751 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 8, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.799 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 6, 6, 8, 3, 8, 3, 3, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.732 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.028 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.036870…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,█▇▂▂▆▅▂▅▆▄▂▂▄▃▃▄▂▃▅▄▃▄▄▇▄▆▃▁▂▃▃▂▂▃▁▁▂▄▁▂

0,1
train/epoch,1.0
train/epoch_loss,0.54882
train/iter,16989.0
train/loss,0.03324


[34m[1mwandb[0m: Agent Starting Run: ex782oov with config:
[34m[1mwandb[0m: 	d_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 6
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.090 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 3, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.886 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.776 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.740 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 6, 1, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.724 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.649 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 6, 4, 8, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.669 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([6, 3, 8, 3, 8, 3, 8, 3, 1, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.620 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▆█▄▁▆▅▃▂▇▄▂▁▃▁▂▃▂▄▂▃▄▂▃▅▃▅▂▃▃▄▃▄▂▂▁▁▁▃▁▃

0,1
train/epoch,1.0
train/epoch_loss,0.40167
train/iter,16989.0
train/loss,0.0122


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: sznpz97n with config:
[34m[1mwandb[0m: 	d_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 8
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.167 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 3, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.988 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 8]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.904 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 1, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.872 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 5, 6, 6, 6, 6, 6, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.875 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 1, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.740 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.808 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 3, 8, 3, 8, 3, 3, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.752 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▅▅▂▂█▄▂▃▅▃▂▂▃▃▃▂▂▅▃▃▅▃▃▆▃▃▃▂▂▂▂▂▂▂▂▁▂▃▂▃

0,1
train/epoch,1.0
train/epoch_loss,0.61002
train/iter,16989.0
train/loss,0.09851


[34m[1mwandb[0m: Agent Starting Run: liing6my with config:
[34m[1mwandb[0m: 	d_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 8
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.034 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 3, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.857 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.773 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.739 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 5, 5, 6, 5, 1]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.721 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 4, 4, 4]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.681 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.676 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([8, 3, 8, 6, 6, 3, 6, 8, 8, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.636 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▆▅▂▁▇▄▃▆▅▄▃▂▃▃▃▃▄▂▃▂█▄▅▆▃▂▃▂▂▃▃▃▃▃▂▁▁▃▃▃

0,1
train/epoch,1.0
train/epoch_loss,0.41179
train/iter,16989.0
train/loss,0.05188


[34m[1mwandb[0m: Agent Starting Run: adb3mrty with config:
[34m[1mwandb[0m: 	d_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 10
[34m[1mwandb[0m: 	layer: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.136 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 5, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.980 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 1, 8]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.887 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.817 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 5, 6, 5, 6, 6, 5, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.865 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.742 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 4, 8, 4, 6, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.792 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([6, 6, 8, 3, 8, 6, 8, 3, 3, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.773 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.028 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.036870…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,▇▅▃▁▆▄▂█▆▄▁▂▅▃▄▄▄▄▅▃▅▇▃▅▅▂▄▃▂▄▄▅▃▃▃▂▂▃▁▄

0,1
train/epoch,1.0
train/epoch_loss,0.60233
train/iter,16989.0
train/loss,0.11564


[34m[1mwandb[0m: Agent Starting Run: sj3kzxq6 with config:
[34m[1mwandb[0m: 	d_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	head: 10
[34m[1mwandb[0m: 	layer: 2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Epoch: 0 | Iter: 999 | Loss: 1.024 
 Target tensor([5, 8, 8, 3, 1]) 
 Predict tensor([8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 1999 | Loss: 0.855 
 Target tensor([7, 8, 8, 8, 8, 1]) 
 Predict tensor([8, 8, 8, 8, 8, 1]) type: False
Epoch: 0 | Iter: 2999 | Loss: 0.735 
 Target tensor([6, 3, 6, 3, 6, 4, 1]) 
 Predict tensor([6, 6, 6, 4, 6, 4, 1]) type: False
Epoch: 0 | Iter: 3999 | Loss: 0.728 
 Target tensor([6, 6, 6, 6, 5, 6, 5, 1]) 
 Predict tensor([6, 6, 6, 6, 6, 6, 5, 6]) type: False
Epoch: 0 | Iter: 4999 | Loss: 0.710 
 Target tensor([4, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([8, 8, 8, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 5999 | Loss: 0.640 
 Target tensor([6, 4, 8, 4, 8, 4, 8, 4, 1]) 
 Predict tensor([6, 4, 6, 4, 8, 4, 8, 4, 1]) type: False
Epoch: 0 | Iter: 6999 | Loss: 0.647 
 Target tensor([6, 3, 6, 3, 6, 3, 8, 8, 8, 1]) 
 Predict tensor([6, 3, 8, 3, 6, 3, 8, 7, 1, 1]) type: False
Epoch: 0 | Iter: 7999 | Loss: 0.592 
 Target tensor([8, 8, 5, 8, 8, 5, 8, 8, 5, 7,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁█
train/epoch_loss,█▁
train/iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
train/loss,█▅▄▁▇▃▂▇▅▃▂▂▄▂▂▃▂▃▂▂▂▃▄▄▃▄▄▂▁▅▂▃▃▂▃▁▂▃▃▂

0,1
train/epoch,1.0
train/epoch_loss,0.3983
train/iter,16989.0
train/loss,0.13809


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


### Evaluation

In [206]:
torch.save(model.state_dict(), 'simple-10.pt')

In [207]:
model.load_state_dict(torch.load('simple-10.pt'))

<All keys matched successfully>

In [208]:
def translate_sentence(batch, model, max_len = 50):
    model.eval()
    src = batch[0].T
    src_mask = model.make_src_mask(src)
    with torch.no_grad():
        enc_src = model.encoder(src, src_mask)

    #
    trg_indexes = [SOS_token]
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        output = output[0][i]
        pred_token = output.argmax(0).item()
        trg_indexes.append(pred_token)
        #
        if pred_token == EOS_token:
            break
    return trg_indexes[1:], attention

def test_accuracy(data, model):
    all_correct_trials = 0
    for i, batch in enumerate(data):
        trg = batch[1].T
        trg_out = trg[:,1:] ##[y_1,..., y_2, <EOS>]
        index, _  = translate_sentence(batch, model)

        correct = trg_out[0].tolist() == index
        all_correct_trials += correct
        
        if (i+1)  % 100 == 0:
            print(i, all_correct_trials/ i)
            
    return all_correct_trials / len(data)

In [210]:
test_accuracy(test_pairs, model)

99 0.7676767676767676
199 0.7386934673366834
299 0.7658862876254181
399 0.7719298245614035
499 0.7775551102204409
599 0.7779632721202003


KeyboardInterrupt: 

In [213]:
def test_teacher(data, model):
    model.eval()
    with torch.no_grad():
        all_correct_trials = [] # list of booleans indicating whether correct
        
        for batch in data:
            src = batch[0].T
            trg = batch[1].T
            out, _ = model(src, trg[:,:-1])

            preds = torch.argmax(out, dim = 2)
            correct_pred = preds == trg[:,1:]

            correct_pred = correct_pred.cpu().numpy()
            correct = correct_pred.all(0).tolist()
            all_correct_trials += correct

    accuracy = np.mean(all_correct_trials)
    model.train()
    return accuracy

test_teacher(test_pairs, model)

0.9854706749004734

### Visualization

In [None]:
def train_teacher(run, args):
    model = Seq2Seq(enc, dec, pad_idx, pad_idx, device).to(device)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr = args.learning_rate)

    criterion = nn.CrossEntropyLoss()

    for epoch in range(args.num_epochs):
        start_time = time.time()
        epoch_loss = 0

        for iter, batch in enumerate(training_pairs):
            src = batch[0].T
            trg = batch[1].T
            
            optimizer.zero_grad()
            
            output, _ = model(src, trg[:,:-1]) ##[<SOS>, y_1, y_2]
                    
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
                
            output_dim = output.shape[-1]
                
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1) ##[y_1, y_2, <EOS>]
                    
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
                
            loss = criterion(output, trg)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            epoch_loss += loss.item()
            # Record loss
            if iter % args.record_loss_every == 0:
                loss_datapoint = loss.data.item()
                print('Run:', run,
                        'Iter:', iter,
                        'Loss:', loss_datapoint)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        test_acc = test_teacher(model, test_pairs, device)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {epoch_loss / len(training_pairs):.3f} | Test accurac （teacher forcing): {test_acc}')