In [16]:
import os
# Process-wide switches; set in the first cell so they are in place before the
# heavy frameworks are imported below.
#   TF_ENABLE_ONEDNN_OPTS=0 -> turn off TensorFlow's oneDNN custom ops.
#   CUDA_LAUNCH_BLOCKING=1  -> make CUDA kernel launches synchronous (handy for
#                              debugging device errors, but slows GPU work).
os.environ.update({
    'TF_ENABLE_ONEDNN_OPTS': '0',
    'CUDA_LAUNCH_BLOCKING': '1',
})

In [17]:
from model_arch.run_train import *
from utils.step_sample import create_named_schedule_sampler
from model_arch.train import TrainLoop
from utils.data import load_data_text
from model_arch.tokenizer import load_tokenizer, load_model_emb
from model_arch.sampling import sampling

from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast, set_seed
import json, torch
from utils import dist_util
from functools import partial
import pickle
import random
from datetime import datetime

In [18]:
# Clear cached memory before any data or model is loaded.
# NOTE(review): presumably this empties the GPU allocator cache — confirm in utils.dist_util.
dist_util.clear_cache()

In [19]:
# ---- Optimisation ----------------------------------------------------------
lr = 5e-5
weight_decay = 0.0
batch_size = 30
val_batch_size = 20
microbatch = 10
epochs = 1000
eval_interval = 10
ema_rate = '0.9999'  # kept as a string, exactly as it is handed to the training loop

# ---- Diffusion process -----------------------------------------------------
schedule_sampler = 'uniform'
diffusion_steps = 1000
noise_schedule = 'sqrt'
predict_xstart = True
rescale_timesteps = True

# ---- Vocabulary / embeddings -----------------------------------------------
vocab = 'custom'
use_plm_init = 'no'  # embedding in transformer
vocab_size = 0  # placeholder; replaced once the tokenizer has been loaded
config_name = 'bert-base-uncased'
emb_scale_factor = 1.0

# ---- Model shape -----------------------------------------------------------
seq_len = 128
hidden_t_dim = 128
hidden_dim = 128
dropout = 0.1

# ---- Reproducibility -------------------------------------------------------
seed = 10275679

In [20]:
# Dataset locations available to this notebook (relative to the working directory).
regular_data_dir = 'data'
data_player_dir = 'data/with_player'
comedies_data_dir = 'data/comedies_only'

# Dataset used for the train/valid loaders below; switch to one of the
# alternatives above to train on a different corpus.
data_dir = regular_data_dir

In [21]:
# Seed the random number generators through transformers.set_seed, using the
# `seed` value from the config cell, so that runs are repeatable.
set_seed(seed)

In [22]:
# Load the tokenizer. `config_name` comes from the config cell.
# NOTE(review): 'shakespeare' presumably selects the custom vocabulary — confirm in model_arch.tokenizer.
tokenizer = load_tokenizer('shakespeare', config_name)

shakespeare


In [23]:
# Build the embedding module of width `hidden_dim` for this tokenizer. The helper
# also returns a tokenizer, so `tokenizer` is rebound to that returned object.
model_weight, tokenizer = load_model_emb(hidden_dim, tokenizer)

In [24]:
# Display the embedding module to sanity-check its size.
model_weight

Embedding(30268, 128)

In [25]:
# Important: the config cell leaves `vocab_size` at the placeholder value 0, so it
# has to be overwritten here with the tokenizer's actual vocabulary size.
vocab_size = tokenizer.vocab_size
vocab_size

30268

In [26]:
# Training loader: default split, batches of `batch_size`.
data = load_data_text(
    batch_size=batch_size,
    seq_len=seq_len,
    data_dir=data_dir,
    loaded_vocab=tokenizer,
    model_emb=model_weight,  # use model's weights as init
)

# Validation loader: same corpus and tokenizer, 'valid' split, batches of `val_batch_size`.
val = load_data_text(
    batch_size=val_batch_size,
    seq_len=seq_len,
    data_dir=data_dir,
    loaded_vocab=tokenizer,
    split='valid',
    model_emb=model_weight,  # use model's weights as init
)

############################## 
Loading text data...
############################## 
Loading dataset from data...
### Loading form the TRAIN set...
### Data samples...
 ['this is the very false gallop of verses why do you infect yourself with them?', 'and fair men call for grace. aaron will have his soul black like his face.'] ['peace, you dull fool! i found them on a tree.', 'o, here i lift this one hand up to heaven, and bow this feeble ruin to the earth if any power pities wretched tears, to that i call! what, wilt thou kneel with me? do, then, dear heart,']
RAM used: 2075.33 MB
This is raw_datasets:  Dataset({
    features: ['src', 'trg'],
    num_rows: 32531
})
RAM used: 2099.91 MB


Running tokenizer on dataset (num_proc=4):   0%|          | 0/32531 [00:00<?, ? examples/s]

### tokenized_datasets Dataset({
    features: ['input_id_x', 'input_id_y'],
    num_rows: 32531
})
### tokenized_datasets...example [2, 138, 121, 78, 476, 880, 9871, 94, 5833, 329, 144, 89, 2960, 955, 131, 266, 22, 3]
RAM used: 2108.81 MB


merge and mask:   0%|          | 0/32531 [00:00<?, ? examples/s]

RAM used: 2138.84 MB


padding:   0%|          | 0/32531 [00:00<?, ? examples/s]

Dataset({
    features: ['input_id_x', 'input_id_y', 'input_ids', 'input_mask'],
    num_rows: 32531
}) padded dataset
RAM used: 2200.36 MB
RAM used: 2200.36 MB
############################## 
Loading text data...
############################## 
Loading dataset from data...
### Loading form the VALID set...
### Data samples...
 ["kind o' thing than a fool and yet i would not be thee, nuncle, thou hast pared thy wit o' both sides, and left nothing i' the middle here comes one o' the parings.", 'yes, by saint patrick, but there is, horatio, and much offence too. touching this vision here, it is an honest ghost, that let me tell you for your desire to know what is between us,'] ["how now, daughter! what makes that frontlet on? methinks you are too much of late i' the frown.", "o'ermaster 't as you may. and now, good friends, as you are friends, scholars and soldiers, give me one poor request."]
RAM used: 2107.02 MB
This is raw_datasets:  Dataset({
    features: ['src', 'trg'],
    num_row

Running tokenizer on dataset (num_proc=4):   0%|          | 0/8123 [00:00<?, ? examples/s]

### tokenized_datasets Dataset({
    features: ['input_id_x', 'input_id_y'],
    num_rows: 8123
})
### tokenized_datasets...example [2, 714, 37, 9, 749, 243, 23, 550, 85, 317, 31, 241, 125, 100, 208, 10, 6206, 10, 136, 469, 23187, 180, 646, 37, 9, 579, 3437, 10, 85, 1044, 549, 31, 9, 78, 5789, 237, 571, 295, 37, 9, 78, 14784, 12, 3]
RAM used: 2107.82 MB


merge and mask:   0%|          | 0/8123 [00:00<?, ? examples/s]

RAM used: 2117.87 MB


padding:   0%|          | 0/8123 [00:00<?, ? examples/s]

Dataset({
    features: ['input_id_x', 'input_id_y', 'input_ids', 'input_mask'],
    num_rows: 8123
}) padded dataset
RAM used: 2127.97 MB
RAM used: 2127.97 MB


In [27]:
# Diffusion process built from the noise schedule, step count and prediction
# flags chosen in the config cell.
diffusion = SpacedDiffusion(
    betas=get_named_beta_schedule(noise_schedule, diffusion_steps),
    rescale_timesteps=rescale_timesteps,
    predict_xstart=predict_xstart,
)

# Training below continues from this previously pickled model instead of a fresh one.
# NOTE: pickle.load can execute arbitrary code — only load checkpoints you trust.
# NOTE(review): the file name says "comedies_only" while `data_dir` points at the
# regular dataset — confirm this pairing is intended.
best_model_fp = 'models/final/final_comedies_only_model_df1000.pkl'
with open(best_model_fp, 'rb') as handle:
    model = pickle.load(handle)

In [28]:
# Total number of scalar entries across every parameter tensor of the model.
pytorch_total_params = sum(map(torch.numel, model.parameters()))
pytorch_total_params

91192508

In [31]:
# Build the timestep sampler. Note that this rebinds `schedule_sampler`, which the
# config cell defined as the string 'uniform', to the sampler object.
schedule_sampler = create_named_schedule_sampler('uniform', diffusion)

# Move the model onto the device selected by dist_util before training.
model.to(dist_util.dev())

# Run the full training loop; it returns the training and validation loss histories.
# NOTE(review): use_llrd / llrd_rate presumably enable layer-wise learning-rate
# decay (the per-parameter "lr" printout below differs by layer) — confirm in TrainLoop.
train_loss, val_loss = TrainLoop(
    model=model,
    diffusion=diffusion,
    data=data,
    batch_size=batch_size,
    microbatch=microbatch,
    lr=lr,
    ema_rate=ema_rate,
    schedule_sampler=schedule_sampler,
    weight_decay=weight_decay,
    epochs=epochs,
    eval_data=val,
    eval_interval=eval_interval,
    warm_up_steps=50,
    use_llrd=True,
    llrd_rate=0.9,
).run_loop()

# Persist the final model under models/<MMDD>/. The dated directory is created
# first: without it, open() raises FileNotFoundError and the result of the whole
# training run is lost. The `with` block guarantees the file is flushed and closed.
dt = datetime.now().strftime("%m%d")
final_model_fp = f"models/{dt}/final_model_df{diffusion_steps}.pkl"
os.makedirs(os.path.dirname(final_model_fp), exist_ok=True)
with open(final_model_fp, 'wb') as handle:
    pickle.dump(model, handle)





name: word_embedding.weight, lr: 5e-05
name: lm_head.bias, lr: 5e-05
name: time_embed.0.weight, lr: 5e-05
name: time_embed.0.bias, lr: 5e-05
name: time_embed.2.weight, lr: 5e-05
name: time_embed.2.bias, lr: 5e-05
name: input_up_proj.0.weight, lr: 5e-05
name: input_up_proj.0.bias, lr: 5e-05
name: input_up_proj.2.weight, lr: 5e-05
name: input_up_proj.2.bias, lr: 5e-05
name: input_transformers.layer.0.attention.self.query.weight, lr: 5.555555555555556e-05
name: input_transformers.layer.0.attention.self.query.bias, lr: 5.555555555555556e-05
name: input_transformers.layer.0.attention.self.key.weight, lr: 5.555555555555556e-05
name: input_transformers.layer.0.attention.self.key.bias, lr: 5.555555555555556e-05
name: input_transformers.layer.0.attention.self.value.weight, lr: 5.555555555555556e-05
name: input_transformers.layer.0.attention.self.value.bias, lr: 5.555555555555556e-05
name: input_transformers.layer.0.attention.output.dense.weight, lr: 5.555555555555556e-05
name: input_transfo

In [None]:
# plot(train_loss, val_loss)

In [None]:
# param_names = []
# for i, (name, param) in enumerate(model.named_parameters()):
#     param_names.append(name)
#     print(f'{i}: {name} {param.requires_grad}')

# Generating sequences

In [40]:
# Checkpoint used for generation.
# NOTE: unpickling can execute arbitrary code — only load checkpoints you trust.
best_model_fp = 'models/0314/model_best_epoch_280_min_val_loss_0.04129999876022339.pkl'
with open(best_model_fp, mode='rb') as handle:
    best_model = pickle.load(handle)

In [41]:
# Diffusion process used for sampling. The step count is hard-coded to 1000 here
# rather than read from `diffusion_steps`.
# NOTE(review): presumably it has to match the steps the loaded checkpoint was trained with — confirm.
diffusion = SpacedDiffusion(
    betas=get_named_beta_schedule(noise_schedule, 1000),
    rescale_timesteps=rescale_timesteps,
    predict_xstart=predict_xstart,
)

In [51]:
# Generate responses for the custom test prompts with the reloaded checkpoint.
# Returns four lists: sources, recovered (generated) texts, references and
# intermediate results.
word_lst_source, word_lst_recover, word_lst_ref, inter_lst_recover = sampling(
    best_model,
    diffusion,
    data_dir=regular_data_dir,
    batch_size=10,
    split='test_custom',
    seq_len=128,
    show_intermediate_results=False,
)

############################## 
Loading text data...
############################## 
Loading dataset from data...
### Loading from the custom TEST set...
### Data samples...
 ["What dost thou have planned for the morrow's reprieve?", 'What is your will?'] ['', '']
RAM used: 4390.97 MB
This is raw_datasets:  Dataset({
    features: ['src', 'trg'],
    num_rows: 4
})
RAM used: 4390.97 MB


Running tokenizer on dataset (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

### tokenized_datasets Dataset({
    features: ['input_id_x', 'input_id_y'],
    num_rows: 4
})
### tokenized_datasets...example [2, 164, 757, 136, 150, 5402, 7020, 115, 78, 835, 9, 41, 10968, 22, 3]
RAM used: 4391.04 MB


merge and mask:   0%|          | 0/4 [00:00<?, ? examples/s]

RAM used: 4391.05 MB


padding:   0%|          | 0/4 [00:00<?, ? examples/s]

Dataset({
    features: ['input_id_x', 'input_id_y', 'input_ids', 'input_mask'],
    num_rows: 4
}) padded dataset
RAM used: 4391.09 MB
RAM used: 4391.09 MB
### End of reading iteration...


  0%|          | 0/1000 [00:00<?, ?it/s]

Generating 20 sentences takes 5 minutes

In [52]:
# Source prompts as returned by the sampling call (decoded text, special tokens included).
word_lst_source

["[CLS] what dost thou have planned for the morrow's reprieve? [SEP] [SEP]",
 '[CLS] what is your will? [SEP] [SEP]',
 "[CLS] my lord,'tis the pondering of life's meaning that doth occupy my thoughts most gravely. [SEP] [SEP]",
 '[CLS] call her forth to me [SEP] [SEP]']

In [53]:
# Generated (recovered) text for each source prompt, in the same order.
word_lst_recover

['[CLS] ay, [SEP]',
 '[CLS] the [SEP]',
 "[CLS]'tis'to the rest to. [SEP]",
 "[CLS] not, ', to say's good once me. [SEP]"]