In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda:0'

cache_dir = "/data4/yoomcache"
model_cache_dir = os.path.join(cache_dir, 'huggingface')
data_cache_dir = os.path.join(cache_dir, 'datasets')
checkpoint_dir = os.path.join(cache_dir, 'checkpoint')

import torch
from datasets import load_dataset, load_metric
import math
from itertools import groupby

import wandb
wandb.init(project="testing-wav2vec2gpt", entity="yoom-private")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myoom-private[0m (use `wandb login --relogin` to force relogin)


In [2]:
# %reload_ext autoreload
# %autoreload 2
from wav2vec2GPTwCTC import *
from configuration_wav2vec2gpt import Wav2Vec2GPTConfig

from transformers import Wav2Vec2FeatureExtractor
from transformers import GPT2Tokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [3]:
wav2vec_pretrained = "facebook/wav2vec2-base"
gpt_pretrained = "gpt2"

In [4]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec_pretrained, 
                                              cache_dir=model_cache_dir)

tokenizer = GPT2Tokenizer.from_pretrained(gpt_pretrained,
                                          cache_dir=model_cache_dir)
tokenizer.bos_token = '<|endoftext|>'
tokenizer.pad_token = 'Ġ'
tokenizer.unk_token = 'Ġ'
tokenizer.eos_token = '<|endoftext|>'

In [5]:
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", 
                       split="validation", 
                       cache_dir=data_cache_dir
                      )

dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

dataset, sampling_rate

Reusing dataset librispeech_asr (/data4/yoomcache/datasets/hf-internal-testing___librispeech_asr/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)
Loading cached sorted indices for dataset at /data4/yoomcache/datasets/hf-internal-testing___librispeech_asr/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b/cache-2f7c0cbee6ef3aa1.arrow


(Dataset({
     features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
     num_rows: 73
 }),
 16000)

In [6]:
audio_inputs = [d["audio"]["array"] for d in dataset]
len(audio_inputs)

73

In [7]:
# text_inputs = dataset["text"]
# text_inputs

In [8]:
text_inputs = [ # FROM `1272_128104.book.tsv`
'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',
"Nor is Mr. Quilter's manner less interesting than his matter.",
"He tells us that at this festive season of the year, with Christmas and roast beef looming before us, 'Similes drawn from eating and its results occur most readily to the mind.'",
"He has grave doubts whether Sir Frederick Leighton's work is really 'Greek, after all,' and can discover in it but little of 'rocky Ithaca.'",
"Linnell's pictures, are 'a sort of \"Up, Guards, and at 'em\" paintings,' and Mason's exquisite idylls are 'as national as a Jingo poem'! Mr. Birket Foster's landscapes 'smile at one much in the same way that Mr. Carker used to \"flash his teeth,\"' and Mr. John Collier gives his sitter 'a cheerful slap on the back, before he says, like a shampooer in a Turkish bath, \"Next man!\"",
"It is obviously unnecessary for us to point out how luminous these criticisms are, how delicate in expression.",
"On the general principles of art Mr. Quilter writes with equal lucidity.",
"Painting, he tells us, is 'of a different quality to mathematics,' and finish in art is 'adding more fact'!",
"As for etchings, they are of two kinds--British and foreign.",
"He laments most bitterly the divorce that has been made between decorative art and 'what we usually call \"pictures,\"' makes the customary appeal to the Last Judgment, and reminds us that in the great days of art Michael Angelo was the 'furnishing upholsterer.'",
"near the fire, and the ornaments Fred brought home from India on the mantel-board'!",
"In fact, he is quite severe on Mr. Ruskin for not recognising that 'a picture should denote the frailty of man,' and remarks with pleasing courtesy and felicitous grace that 'many phases of feeling . . . ",
"Only, unfortunately, his own work never does get good.",
"Mr. Quilter has missed his chance; for he has failed even to make himself the Tupper of Painting.",
"By Harry Quilter, M.A.",
# FROM `1272_135031.book.tsv`
"\"Because you were sleeping instead of conquering, the lovely Rose Princess has become a fiddle without a bow, while poor Shaggy sits there a cooing dove!\"",
"\"He has gone, and gone for good,\" answered Polychrome, who had managed to squeeze into the room beside the dragon and had witnessed the occurrences with much interest.",
'"I have remained a prisoner only because I wished to be one," and with this he stepped forward and burst the stout chains as easily as if they had been threads.',
"The little girl had been asleep, but she heard the raps and opened the door.",
'"The King has fled in disgrace and your friends are asking for you."',
'"I begged Ruggedo long ago to send him away, but he would not do so."',
'"I also offered to help your brother to escape, but he would not go."',
'"He eats and sleeps very steadily," replied the new King.',
'"I hope he doesn\'t work too hard," said Shaggy.',
'"He doesn\'t work at all."',
"In fact, there is nothing he can do in these dominions as well as our nomes, whose numbers are so great that it worries us to keep them all busy.",
'"Not exactly," returned Kaliko.',
'"Where is my brother now?"',
'inquired Shaggy. "In the Metal Forest."',
'"Where is that?"',
'"The Metal Forest is in the Great Domed Cavern, the largest in all our dominions," replied Kaliko.',
'Kaliko hesitated.',
'"However, if we look sharp, we may be able to discover one of these secret ways."',
'"Oh, no; I\'m quite sure he didn\'t."',
'"That\'s funny," remarked Betsy thoughtfully.',
'"I don\'t believe Ann knew any magic, or she\'d have worked it before."',
'"I do not know," confessed Shaggy.',
'"True," agreed Kaliko.',
'Kaliko went to the big gong and pounded on it just as Ruggedo used to do; but no one answered the summons.',
"Having returned to the royal cavern, Kaliko first pounded the gong and then sat in the throne, wearing Ruggedo's discarded ruby crown and holding in his hand the sceptre which Ruggedo had so often thrown at his head.",
# FROM `1272_141231.book.tsv`
'_A man said to the universe: "Sir, I exist!"',
"Sweat covered Brion's body, trickling into the tight loincloth that was the only garment he wore.",
'The cut on his chest, still dripping blood, the ache of his overstrained eyes--even the soaring arena around him with the thousands of spectators--were trivialities not worth thinking about.',
'His instant of panic was followed by a small sharp blow high on his chest.',
'"One minute," a voice said, and the time buzzer sounded.',
'A minute is not a very large measure of time and his body needed every fraction of it.',
"The buzzer's whirr triggered his muscles into complete relaxation.",
'Only his heart and lungs worked on at a strong, measured rate.',
'He was in reverie, sliding along the borders of consciousness.',
'The contestants in the Twenties needed undisturbed rest, therefore nights in the dormitories were as quiet as death.',
'Particularly so on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark, empty doors.',
'The other voice snapped with a harsh urgency, clearly used to command.',
'"I\'m here because the matter is of utmost importance, and Brandd is the one I must see. Now stand aside!"',
'"The Twenties--"',
'He must have drawn his gun, because the intruder said quickly, "Put that away. You\'re being a fool!"',
'There was silence then and, still wondering, Brion was once more asleep.',
'"Ten seconds."',
'he asked the handler who was kneading his aching muscles.',
'A red-haired mountain of a man, with an apparently inexhaustible store of energy.',
'There could be little art in this last and final round of fencing.',
'Just thrust and parry, and victory to the stronger.',
'Every man who entered the Twenties had his own training tricks.',
'There appeared to be an immediate association with the death-trauma, as if the two were inextricably linked into one.',
'The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels',
'This is physically impossible when conscious.',
'Others had died before during the Twenties, and death during the last round was in some ways easier than defeat.',
'Breathing deeply, Brion softly spoke the auto-hypnotic phrases that triggered the process.',
"When the buzzer sounded he pulled his foil from his second's startled grasp, and ran forward.",
'Irolg looked amazed at the sudden fury of the attack--then smiled.',
'He thought it was a last burst of energy, he knew how close they both were to exhaustion.',
"Brion saw something close to panic on his opponent's face when the man finally recognized his error.",
'A wave of despair rolled out from Irolg--Brion sensed it and knew the fifth point was his.',
'Then the powerful twist that thrust it aside. In and under the guard.']

In [9]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_values, tokenized_output):
        self.input_values = input_values
        self.tokenized_output = tokenized_output

    def __getitem__(self, idx):
        item = dict()
        item['input_values'] = self.input_values['input_values'][idx]
        item['labels'] = self.tokenized_output['input_ids'][idx]
        item['output_attention_mask'] = self.tokenized_output['attention_mask'][idx]
        return item

    def __len__(self):
        return len(self.input_values['input_values'])

    
input_values = feature_extractor(audio_inputs, 
                                      sampling_rate=sampling_rate,
                                      return_tensors="pt",
                                      padding='longest',
                                     )

tokenized_output = tokenizer(text_inputs,
                             return_tensors="pt",
                             padding='longest',
#                              padding='max_length',
#                              max_length=300
                         )

train_dataset = CustomDataset(input_values, tokenized_output)
# val_dataset = CustomDataset(input_values, tokenized_output)
# test_dataset = CustomDataset(input_values, tokenized_output)

In [10]:
# class CustomDataset(torch.utils.data.Dataset):
#     def __init__(self, audio_inputs, text_inputs):
#         self.audio_inputs = audio_inputs
#         self.text_inputs = text_inputs

#     def __getitem__(self, idx):
#         item = dict()
#         item['input_values'] = feature_extractor(self.audio_inputs[idx], 
#                                       sampling_rate=sampling_rate,
#                                       return_tensors="pt",
#                                       padding='longest',
#                                      )['input_values']
#         tokenized_output = tokenizer(self.text_inputs[idx], 
#                                       return_tensors="pt",
# #                                       padding='max_length',
#                                       padding='longest',
#                                      )
#         item['labels'] = tokenized_output['input_ids']
#         item['output_attention_mask'] = tokenized_output['attention_mask']
#         item['output_max_length'] = len(tokenized_output['input_ids'])
#         return item

#     def __len__(self):
#         return len(self.audio_inputs)


# train_dataset = CustomDataset(audio_inputs, text_inputs)
# # val_dataset = CustomDataset(audio_inputs, text_inputs)
# # test_dataset = CustomDataset(audio_inputs, text_inputs)

In [11]:
config = Wav2Vec2GPTConfig()

config.n_positions = tokenized_output['attention_mask'].shape[1] * 2
# config.max_position_embeddings = config.n_positions
# config.ctc_loss_reduction = 'mean'

# change configuration of adapter
config.add_adapter = True
config.adapter_kernel_size = 6
config.adapter_stride = 2
config.num_adapter_layers = 3


model = Wav2Vec2GPTModel(config=config)

model.wav2vec2.from_pretrained(wav2vec_pretrained, cache_dir=model_cache_dir)
model.gpt2lm.from_pretrained(gpt_pretrained, cache_dir=model_cache_dir)


# device_map = {
#     0: [0, 1, 2, 3, 4,],
#     2: [5, 6, 7, 8, 9, 10, 11, ],
# }
# model.gpt2lm.parallelize(device_map)


model.freeze_feature_extractor()
model.freeze_feature_projection()
# model.freeze_wav2vec_encoder() # not exists here
model.unfreeze_wav2vec_adapter()
model.unfreeze_rnn_compressor()
model.freeze_gpt_decoder()
model.unfreeze_lm_head()

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2Model2: ['wav2vec2.encoder.layers.10.final_layer_norm.bias', 'wav2vec2.encoder.layers.6.final_layer_norm.weight', 'project_q.weight', 'wav2vec2.encoder.layers.5.layer_norm.bias', 'wav2vec2.encoder.layers.0.feed_forward.output_dense.weight', 'wav2vec2.encoder.layers.4.attention.k_proj.weight', 'wav2vec2.encoder.layers.10.feed_forward.intermediate_dense.bias', 'wav2vec2.encoder.layers.2.attention.k_proj.weight', 'wav2vec2.encoder.layers.1.layer_norm.bias', 'wav2vec2.encoder.layers.3.feed_forward.intermediate_dense.bias', 'wav2vec2.encoder.layers.5.final_layer_norm.weight', 'wav2vec2.encoder.layers.8.attention.q_proj.bias', 'wav2vec2.encoder.layers.8.final_layer_norm.bias', 'wav2vec2.encoder.layers.9.feed_forward.intermediate_dense.weight', 'wav2vec2.encoder.layers.2.feed_forward.output_dense.bias', 'wav2vec2.encoder.layers.11.final_layer_norm.bias', 'wav2vec2.encoder.layers.6.feed_forwa

In [12]:
count = 0
for p in model.parameters():
    if p.requires_grad:
        count += 1
print(count)

11


In [13]:
model

Wav2Vec2GPTModel(
  (wav2vec2): Wav2Vec2Model2(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=Fal

In [14]:
# load rouge for validation
rouge = load_metric("rouge")
# rouge = load_metric("rouge", experiment_id=1)

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = decoder_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = decoder_tokenizer.eos_token_id
    label_str = decoder_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [15]:
batch_size = 3
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)


# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
#     predict_with_generate=True,
    output_dir=os.path.join(checkpoint_dir, "wav2vec2gpt/unfreeze-rnn"),
    # do_train=True,
    # do_eval=True,
#     do_predict=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*5,
    learning_rate=1e-4, 
    weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0,
    num_train_epochs=200,
    max_steps=-1,
    # lr_scheduler_type='linear', warmup_ratio=0.0, 
    
    logging_strategy='steps',
    save_strategy='steps',
    evaluation_strategy='steps',
    logging_steps=1 * steps_per_epoch,
    save_steps=2 * steps_per_epoch,
    eval_steps=1 * steps_per_epoch,
    warmup_steps=10 * steps_per_epoch,
    save_total_limit=0,
    overwrite_output_dir=True,
)

In [16]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
#     compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)


# start training
trainer.train()

***** Running training *****
  Num examples = 73
  Num Epochs = 200
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
25,6163.9125,24580.730469
50,4605.8681,13916.893555
75,3029.7309,7259.718262
100,2040.9052,3735.874023
125,1050.4802,3013.709229
150,622.6318,2380.547852
175,440.9529,2026.060547
200,392.9628,1956.558228
225,376.4111,1846.647705
250,378.5265,1878.925537


***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-50
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-50/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-50/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-100
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-100/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num example

Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-950/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-950/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1000
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1000/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1050
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1050/config.json
Mo

***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1900
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1900/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1900/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1950
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1950/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-1950/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Nu

Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2800
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2800/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2850
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2850/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2850/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-2900
Configur

Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3700/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3750
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3750/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3750/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3800
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3800/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-3800/pytorch_mode

  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-4650
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-4650/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-4650/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
Saving model checkpoint to /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-4700
Configuration saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-4700/config.json
Model weights saved in /data4/yoomcache/checkpoint/wav2vec2gpt/unfreeze-rnn/checkpoint-4700/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 73
  Batch size = 15
***** Running Evaluation *****
  Num examples = 73
  Batch size = 

TrainOutput(global_step=5000, training_loss=170.79922692871094, metrics={'train_runtime': 5258.8699, 'train_samples_per_second': 2.776, 'train_steps_per_second': 0.951, 'total_flos': 4.715741940215014e+18, 'train_loss': 170.79922692871094, 'epoch': 200.0})

In [17]:
wandb.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▆▆▇▆▆▇▆▇▇▆▅▇▇█▇▇▁▇▆▆█▇█▆▁▆▇▆▆▇▆▅▁█▆▆▇▆▆▆
eval/samples_per_second,▂▃▂▃▃▂▃▂▂▃▃▂▂▁▂▂█▂▃▃▁▂▁▃█▃▂▃▃▂▃▃█▁▃▃▁▃▃▃
eval/steps_per_second,▂▃▂▃▃▂▃▂▂▂▃▂▂▁▂▂█▂▃▃▁▂▁▃█▃▂▃▃▂▃▃█▁▃▃▁▃▂▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▅███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,64.1329
eval/runtime,6.4218
eval/samples_per_second,11.368
eval/steps_per_second,0.779
train/epoch,200.0
train/global_step,5000.0
train/learning_rate,0.0
train/loss,33.1972
train/total_flos,4.715741940215014e+18
train/train_loss,170.79923


In [18]:
# example


BATCH_SIZE = 8
i = 3


audio_batch = audio_inputs[i*BATCH_SIZE:i*BATCH_SIZE+BATCH_SIZE]
audio_feature_batch = feature_extractor(audio_batch, 
                                      sampling_rate=sampling_rate,
                                      return_tensors="pt",
                                      padding='longest',
                                     ).input_values
print(audio_feature_batch.size())


text_batch = text_inputs[i*BATCH_SIZE:i*BATCH_SIZE+BATCH_SIZE]

text_tokens_batch = tokenizer(text_batch, 
                              return_tensors="pt",
                              padding='max_length',
                              max_length=train_dataset.tokenized_output['input_ids'].shape[1]
                             )
print(text_tokens_batch['attention_mask'].size())

with torch.no_grad():
    audio_embedding = model(input_values=audio_feature_batch.to(device), 
                            labels=text_tokens_batch['input_ids'].to(device),
                            output_attention_mask=text_tokens_batch['attention_mask'].to(device),)
print(audio_embedding.logits.shape)

pred_ids = torch.argmax(audio_embedding.logits, axis=-1)
print(pred_ids.size())
print()

for idx in range(BATCH_SIZE):
    print(text_batch[idx])
    print(tokenizer.decode([key for key, _group in groupby(pred_ids[idx])]))
    print()

torch.Size([8, 143920])
torch.Size([8, 107])
torch.Size([8, 214, 50257])
torch.Size([8, 214])

"He doesn't work at all."
"He doesn't  work at all." 

In fact, there is nothing he can do in these dominions as well as our nomes, whose numbers are so great that it worries us to keep them all busy.
In fact,  there is he can  do in these domin  dominions as ouromes,  whose numbers are  so great that  it

"Not exactly," returned Kaliko.
"Not exactly," returned Kaliko. 

"Where is my brother now?"
"Where is  my brother now?"?" 

inquired Shaggy. "In the Metal Forest."
inquired  Shaggy.  "In the Metal Forest." 

"Where is that?"
"Where is  that?" 

"The Metal Forest is in the Great Domed Cavern, the largest in all our dominions," replied Kaliko.
The Metal Forest is  in  the  his Domed Cavern,  the largest in  ourions," replied Kaliko.

Kaliko hesitated.
Kaliko  hesitated. 

