In [11]:
from dataset import get_dataloader
from fairseq.models import BaseFairseqModel
from fairseq.models.wav2vec import (
    TransformerEncoder,
)
import torch
import multiprocessing as mp

from dinosr import DinoSR

In [12]:
# Force the 'spawn' start method for any worker processes created later
# (force=True overrides a start method that may already have been set).
mp.set_start_method('spawn', force=True)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

If you have a series of $n$ convolution layers, where each convolution $i$ has a kernel size $k_i$ and a stride $s_i$, then the output length of the $i$-th layer is given by:
$$ x_{i+1} = \frac{x_i - k_i}{s_i} $$
This can also be rewritten as:
$$ s_i x_{i+1} = x_i - k_i $$
where
$$ s_{i-1} x_i = x_{i-1} - k_{i-1} $$
This implies that:
$$ s_i s_{i-1} x_{i+1} = x_{i-1} - s_{i-1}k_i - k_{i-1}$$

And from here we can derive a general formula for the output of the $n$-th layer as a function of the input of the first layer:
$$
\left(\prod_{i=1}^{n-1}{s_i}\right) x_n = x_1 - \sum_{i=1}^{n-1}{\left(\prod_{j=1}^{i-1}{s_j}\right) k_i}
$$

Note that the Sigma-Pi is not input dependent and hence, it is a constant depending only on the network architecture. This is an important quality, since it makes the computation of the relevant output window an O(1) operation.

To deal with boundaries, we can write the following formula:
$$
\begin{aligned}
\alpha &= \prod_{i=1}^{n-1}{s_i} \\
\beta &= \sum_{i=1}^{n-1}{\left(\prod_{j=1}^{i-1}{s_j}\right) k_i} \\
x_n &\ge \left\lceil \frac{x_1 - \beta}{\alpha} \right\rceil
\end{aligned}
$$
where $\alpha$ and $\beta$ are constants depending only on the network architecture. Note that $x_n$ is almost equal to $\lceil \frac{x_1 - \beta}{\alpha} \rceil$.

In [13]:
from dinosr_config import DinosrAudioConfig

# Build the configuration object used to construct the model below.
# Only average_top_k_layers is overridden (set to 4); every other field is left
# to whatever DinosrAudioConfig provides by default.
# NOTE(review): presumably the number of top layers whose outputs are averaged
# to form the training targets — confirm against dinosr_config.
cfg = DinosrAudioConfig(average_top_k_layers=4)

In [14]:
def model_creator(cfg):
    """Factory passed to DinoSR: build a TransformerEncoder from ``cfg``."""
    encoder = TransformerEncoder(cfg)
    return encoder


# Construct the DinoSR model from the config and the encoder factory,
# then move it onto the selected device.
dino_model = DinoSR(cfg, model_creator)
dino_model = dino_model.to(device)

In [15]:
from model_persistant_state import ModelPersistantState

In [16]:
# Persistent state rooted at ./models/dino_transformer_model; used below to
# restore a previous checkpoint and, during training, to save new ones.
model_persistant_state = ModelPersistantState('./models/dino_transformer_model')

# Best-effort resume: if loading fails we keep the freshly initialised model.
# Catch Exception (not a bare `except:`) so KeyboardInterrupt / SystemExit still
# propagate, and report the reason so a corrupt or incompatible checkpoint is
# not silently mistaken for "nothing saved yet".
try:
    model_persistant_state.load_model(dino_model)
    print("loaded model successfully")
except Exception as load_error:
    print("no model to load")
    print(f"  reason: {type(load_error).__name__}: {load_error}")

no model to load


In [9]:
from torch import optim

# Training hyperparameters.
num_epochs = 1          # passes over the training set
batch_size = 320        # samples per optimizer update (via gradient accumulation)
mini_batch_size = 16    # samples per forward/backward pass
learning_rate = 5e-4    # peak learning rate

# Define the learning rate schedule
def lr_lambda(initial_step, step):
    """Return the learning-rate multiplier for a warmup / hold / decay schedule.

    The multiplier is relative to the optimizer's base learning rate:

    * warmup: rises linearly from 0 to 1 over the first ``warmup_steps`` steps;
    * hold:   stays at 1 for the next ``hold_steps`` steps;
    * decay:  falls exponentially from 1 to ``final_lr / initial_lr`` over
      ``decay_steps`` steps, and stays at that floor afterwards.

    Args:
        initial_step: Number of steps already taken before this run (offset
            added to ``step``, e.g. when resuming from a checkpoint).
        step: Step counter supplied by the scheduler for the current run.

    Returns:
        float: Multiplier to apply to the base learning rate.
    """
    warmup_steps = 12000
    hold_steps = 188000
    decay_steps = 200000
    initial_lr = 0.0005
    final_lr = 0.00005

    modified_step = step + initial_step

    if modified_step < warmup_steps:
        return modified_step / warmup_steps
    if modified_step < warmup_steps + hold_steps:
        return 1.0

    # Fraction of the decay window completed. Clamp to 1.0 so that, once the
    # decay window is over, the multiplier stays at final_lr / initial_lr
    # instead of continuing to shrink below the intended final learning rate.
    decay_progress = (modified_step - (warmup_steps + hold_steps)) / decay_steps
    decay_progress = min(decay_progress, 1.0)
    return (final_lr / initial_lr) ** decay_progress


# Adam over the model's parameters. The base learning rate is the
# `learning_rate` hyperparameter; the scheduler below scales it by the
# multiplier returned from lr_lambda.
optimizer = optim.Adam(dino_model.parameters(), lr=learning_rate)

# Define the learning rate scheduler
# Read the resume offset once, up front. Re-reading get_current_step() inside
# the lambda on every call would count steps twice if the persisted step
# advances while training (the scheduler's own `step` already counts them).
initial_step = model_persistant_state.get_current_step()
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda step: lr_lambda(initial_step, step))


In [10]:
# Build the training data loader, passing mini_batch_size as its only argument.
# The training loop below takes len(trainset) and iterates it, unpacking each
# item as a (waveforms, lengths) pair.
# NOTE(review): presumably mini_batch_size is the per-iteration batch size —
# confirm against dataset.get_dataloader.
trainset = get_dataloader(mini_batch_size)

In [11]:
import torch
import os
import yaml

def save_training_history(history, file_path):
    """Serialize ``history`` as YAML into ``file_path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, 'w') as yaml_file:
        yaml.dump(history, stream=yaml_file)

# Training loop with gradient accumulation.
#
# Each mini-batch contributes loss / n to the gradients; the optimizer, the
# teacher parameters and the LR scheduler are stepped once every n mini-batches
# (or on the last mini-batch of the epoch), after which a checkpoint is saved.
total_step = len(trainset)
n = batch_size // mini_batch_size  # Update parameters every n batches

# Global optimizer-step counter, continued from the persisted state.
batch_step = model_persistant_state.get_current_step()
# The inner loop stops once batch_step equals this value.
batches_to_step = 3

for epoch in range(num_epochs):
    epoch_loss = 0.0
    epoch_accuracy = 0.0
    # Mini-batches actually processed this epoch; needed because the inner
    # loop may stop early, so total_step is not a valid denominator.
    batches_seen = 0
    for i, (waveforms, lengths) in enumerate(trainset):
        batches_seen = i + 1

        # Forward pass
        results = dino_model(waveforms, lengths)
        # Scale by n so the accumulated gradient is the mean over n mini-batches.
        loss = results['loss'] / n
        accuracy = results['accuracy']

        # Accumulate loss and accuracy
        # NOTE(review): the loss accumulated here is already divided by n, so
        # the running/saved 'loss' values are 1/n of the raw per-batch loss.
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()

        # Backward pass
        loss.backward()

        # Accumulate gradients and update parameters
        if batches_seen % n == 0 or batches_seen == total_step:
            # Increment the batch step
            batch_step += 1

            optimizer.step()
            dino_model.update_teacher_params(batch_step=batch_step)
            optimizer.zero_grad()

            # Step the scheduler
            scheduler.step()

            running_loss = epoch_loss / batches_seen
            running_accuracy = epoch_accuracy / batches_seen

            # Save the model and training history
            model_persistant_state.save_model(
                step=batch_step,
                model=dino_model,
                performance={
                    'loss': running_loss,
                    'accuracy': running_accuracy
                }
            )

            print(f'\rEpoch [{epoch + 1}/{num_epochs}], Step [{batches_seen}/{total_step}], Loss: {running_loss:.4f}, Accuracy: {100 * running_accuracy:.2f}% local accuracy is {100 * accuracy:.2f}%, and local loss is: {loss:.4f}', end='', flush=True)
            if batch_step == batches_to_step:
                break

    # Calculate and print cumulative loss and accuracy for the epoch.
    # Average over the mini-batches that were actually processed (the loop may
    # have stopped early), and avoid dividing by zero on an empty loader.
    processed = max(batches_seen, 1)
    avg_loss = epoch_loss / processed
    avg_accuracy = epoch_accuracy / processed
    print(f'\nEpoch [{epoch + 1}/{num_epochs}] Summary: Avg Loss: {avg_loss:.4f}, Avg Accuracy: {100 * avg_accuracy:.2f}%')
