In [1]:
import torch
import os
from tqdm import tqdm, trange
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from vgg_dataset import *
from models import *


In [2]:
root = "/mnt/new_volume2/vgg_sound_emb"
partition = "train"
data_dir = f"{root}/{partition}"

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
root = "/mnt/new_volume2/vgg_sound_emb"
partition = "train"
data_dir = f"{root}/{partition}"


In [5]:
# vgg_sound = LargeVideoDataset(data_dir, subset_ratio = 0.2)
vgg_sound = InMemoryVideoDataset(data_dir, subset_ratio=0.7)

Loading 127837 samples into memory as torch.float16 …


Caching data: 100%|██████████| 127837/127837 [14:11<00:00, 150.14it/s]


# DataLoader

In [6]:
val_ratio = 0.1
test_ratio = 0.1

total_len = len(vgg_sound)
val_len = int(total_len * val_ratio)
test_len = int(total_len * test_ratio)
train_len = total_len - val_len - test_len
train_dataset, val_dataset, test_dataset = random_split(
    vgg_sound, [train_len, val_len, test_len], generator=torch.Generator().manual_seed(42), 
)


In [7]:
batch_size = 512
num_workers = 8

In [8]:
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=num_workers,
    pin_memory=True
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=num_workers,
    pin_memory=True
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=num_workers,
    pin_memory=True
)

In [9]:
clip_feat, clap_feat = next(iter(train_loader))
print("Clip:", clip_feat.device, clip_feat.dtype)
print("Clap:", clap_feat.device, clap_feat.dtype)

Clip: cpu torch.float16
Clap: cpu torch.float16


# Wandb

In [10]:
import wandb

# Model

In [11]:
def train_model(model, train_loader, criterion, optimizer, scaler):

    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    total_loss = 0
    for i, (clip_feat, clap_feat) in enumerate(train_loader):
        # clip_feat = clip_feat.to(device)
        # clap_feat = clap_feat.to(device)
        # clip_feat = clip_feat.to(torch.float32).to(device)
        # clap_feat = clap_feat.to(torch.float32).to(device)
        clip_feat = clip_feat.float().to(device)
        clap_feat = clap_feat.float().to(device)
        if i == 0:
            print(f"[Debug] clip_feat device = {clip_feat.device}, clap_feat device = {clap_feat.device}")
        


        # 前向传播
        optimizer.zero_grad()
        # with torch.cuda.amp.autocast():
        with torch.amp.autocast('cuda', enabled=True):
            outputs = model(clip_feat)  
            loss = criterion(outputs, clap_feat.squeeze(1)) 

        # 反向传播
        # loss.backward()
        # optimizer.step()

        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16


        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # del clip_feat, clap_feat, outputs, loss
        # torch.cuda.empty_cache()

    return total_loss / len(train_loader)

    

In [27]:

def validate_model(model, val_loader, criterion, optimizer):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    vdist = 0
    for i, (clip_feat, clap_feat) in enumerate(val_loader):
        clip_feat = clip_feat.float().to(device)
        clap_feat = clap_feat.float().to(device)

        with torch.no_grad():
            outputs = model(clip_feat)  
            loss = criterion(outputs, clap_feat.squeeze(1)) 

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar
        # del clip_feat, clap_feat, outputs, loss
        # torch.cuda.empty_cache()
    
    batch_bar.close()
    return total_loss / len(val_loader)

In [28]:
def train(model, train_loader, val_loader, criterion, optimizer,scaler, scheduler, ckpt_dir, num_epochs=10):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        curr_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train_model(model, train_loader, criterion, optimizer, scaler)
        val_loss = validate_model(model, val_loader, criterion, optimizer)
        scheduler.step()
        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        if True:
            wandb.log({
                'train_loss': train_loss,
                'val_loss': val_loss,
                'lr': curr_lr
        })

        # Save the model if validation loss has decreased
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), ckpt_dir + "best_model.pth")
            print("Model saved!")
        else:
            print("Validation loss did not improve, model not saved.")
            
        torch.save(model.state_dict(), ckpt_dir + "last_model.pth")
    print("Training complete!")
    print(f"Best validation loss: {best_val_loss:.4f}")

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CosineLoss(nn.Module):
    def __init__(self, margin=0.8):  # 建议 margin 设得稍高一点
        super(CosineLoss, self).__init__()
        self.margin = margin

    def forward(self, output, target):
        # output: (B, 512)
        # target: (B, 1, 512) or (B, 512)
        if target.ndim == 3:
            target = target.squeeze(1)
        cos_sim = F.cosine_similarity(output, target, dim=1)  # (B,)
        loss = torch.mean(torch.clamp(self.margin - cos_sim, min=0))
        return loss


In [30]:
print(device)

cuda


In [1]:
model = V2AMapperMLPImproved(input_dim=512, hidden_dim=2048, output_dim=512)

NameError: name 'V2AMapperMLPImproved' is not defined

In [2]:
# show me the mode summary 
from torchsummary import summary
# model = V2AMapperMLP(input_dim=512, hidden_dim=512, output_dim=512).to(device)
# model = V2AMapperMLPImproved().to(device)

summary(model, input_size=(64, 512))

NameError: name 'model' is not defined

In [None]:
# Use wandb? Resume Training?
USE_WANDB = True

RESUME_LOGGING = False # Set this to true if you are resuming training from a previous run

# Create your wandb run

run_name = 'mlp-vggsound-simplefied_mapper' # Give your run a name, this will be used to identify the run in wandb

# If you are resuming an old run
if USE_WANDB:

    wandb.login(key="8475199febe13b3465c7d5e4a595bba7422c14fc") #TODO

    if RESUME_LOGGING:
        run = wandb.init(
            id     = "", ### Insert specific run id here if you want to resume a previous run
            resume = "must", ### You need this to resume previous runs
            project = "v2amapper", ### Project should be created in your wandb
            settings = wandb.Settings(_service_wait=300)
        )


    else:
        run = wandb.init(
            name    = run_name, ### Wandb creates random run names if you skip this field, we recommend you give useful names
            reinit  = True, ### Allows reinitalizing runs when you re-run this cell
            project = "v2amapper", ### Project should be created in your wandb account
        )

        ### Save your model architecture as a string with str(model)
        model_arch  = str(model)
        ### Save it in a txt file
        arch_file   = open("model_arch.txt", "w")
        file_write  = arch_file.write(model_arch)
        arch_file.close()

        ### log it in your wandb run with wandb.save()
        wandb.save('model_arch.txt')



In [34]:
lr = 0.001
epochs = 120
# model = V2AMapperMLP(input_dim=512, hidden_dim=1024, output_dim=512).to(device)
criterion = nn.MSELoss()
# criterion = CosineLoss(margin= 1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# scheduler = torch.optim.CosineAnnealingLR(optimizer, T_max=10, eta_min=0.00001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0.000005)
scaler = torch.amp.GradScaler(enabled=True)

# ckpt_dir = "checkpoints"

train(model, train_loader, val_loader, criterion, optimizer,scaler, scheduler, ckpt_dir = "ckpts/", num_epochs=epochs)

Epoch 1/120


Train:   0%|          | 1/200 [00:02<08:13,  2.48s/it, loss=0.9838, lr=0.001000]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.6306, Val Loss: 0.3226
Model saved!
Epoch 2/120


Train:   2%|▏         | 4/200 [00:01<01:11,  2.76it/s, loss=0.3175, lr=0.001000]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.1674, Val Loss: 0.0758
Model saved!
Epoch 3/120


Train:   2%|▎         | 5/200 [00:01<01:09,  2.81it/s, loss=0.0745, lr=0.000999]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0291, Val Loss: 0.0024
Model saved!
Epoch 4/120


Train:   2%|▎         | 5/200 [00:01<01:07,  2.87it/s, loss=0.0023, lr=0.000998]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0018, Val Loss: 0.0014
Model saved!
Epoch 5/120


Train:   2%|▏         | 3/200 [00:01<01:31,  2.15it/s, loss=0.0014, lr=0.000997]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0013, Val Loss: 0.0012
Model saved!
Epoch 6/120


Train:   2%|▎         | 5/200 [00:01<01:08,  2.86it/s, loss=0.0012, lr=0.000996]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0012, Val Loss: 0.0012
Model saved!
Epoch 7/120


Train:   2%|▏         | 4/200 [00:01<01:11,  2.76it/s, loss=0.0012, lr=0.000994]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0012, Val Loss: 0.0012
Validation loss did not improve, model not saved.
Epoch 8/120


Train:   2%|▏         | 4/200 [00:01<01:31,  2.14it/s, loss=0.0012, lr=0.000992]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0012, Val Loss: 0.0012
Model saved!
Epoch 9/120


Train:   2%|▏         | 4/200 [00:01<01:09,  2.81it/s, loss=0.0012, lr=0.000989]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


                                                                                  

Train Loss: 0.0012, Val Loss: 0.0012
Model saved!
Epoch 10/120


Train:   2%|▏         | 4/200 [00:01<01:10,  2.79it/s, loss=0.0011, lr=0.000986]

[Debug] clip_feat device = cuda:0, clap_feat device = cuda:0


Train: 100%|██████████| 200/200 [00:08<00:00, 32.04it/s, loss=0.0011, lr=0.000986]

KeyboardInterrupt: 

Train: 100%|██████████| 200/200 [00:25<00:00, 32.04it/s, loss=0.0011, lr=0.000986]