In [None]:
import torch
import os
from tqdm import tqdm, trange
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split


In [None]:
# Where the pre-extracted VGG-Sound embeddings live and which partition to read.
partition = "train"
root = "/mnt/new_volume2/vgg_sound_emb"
data_dir = "/".join((root, partition))

In [None]:

# Run on the GPU when one is available, otherwise fall back to the CPU.
# `root`, `partition` and `data_dir` are already defined in the configuration
# cell near the top of the notebook; they are not repeated here so that there
# is a single place to change the dataset location.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LargeVideoDataset(Dataset):
    """Dataset that reads one ``.pth`` feature file per sample, on demand.

    Every ``.pth`` file found anywhere below ``data_dir`` is treated as one
    sample. Each file is expected to contain the entries ``'clip_features'``
    and ``'clap_features'``; nothing is loaded until ``__getitem__`` is called.
    """

    def __init__(self, data_dir, subset_ratio = 0.2, transform=None):
        """
        Args:
            data_dir: Directory tree holding the ``.pth`` files (one per sample).
                Sub-directories are searched as well.
            subset_ratio: Fraction of the path-sorted file list to keep, counted
                from the start of the list. Must not be negative; values above
                1 keep every file.
            transform: Optional callable that receives the tuple
                ``(clip_feat, clap_feat)`` and returns the transformed pair.

        Raises:
            ValueError: If ``subset_ratio`` is negative (a negative value would
                otherwise turn into a "drop the last k files" slice).
        """
        super().__init__()
        if subset_ratio < 0:
            raise ValueError(f"subset_ratio must be >= 0, got {subset_ratio}")

        # Collect every .pth file below data_dir (os.walk descends into sub-directories).
        file_list = [
            os.path.join(dirpath, filename)
            for dirpath, _dirnames, filenames in os.walk(data_dir)
            for filename in filenames
            if filename.endswith(".pth")
        ]

        # Keep only the leading `subset_ratio` share of the sorted paths; sorting
        # first makes the selection independent of the directory listing order.
        num_samples = int(len(file_list) * subset_ratio)

        self.file_paths = sorted(file_list)[:num_samples]
        self.transform = transform

    def __len__(self):
        """Number of samples kept after applying ``subset_ratio``."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk and return ``(clip_feat, clap_feat)``."""
        # Read lazily, one file per access, instead of loading everything up front.
        pth_path = self.file_paths[idx]
        # map_location="cpu" keeps tensors on the CPU even if they were saved from
        # a GPU: the DataLoaders in this notebook use worker processes and
        # pin_memory=True, and the training loop moves each batch to `device` itself.
        sample_data = torch.load(pth_path, map_location="cpu")
        clip_feat = sample_data['clip_features']  # (64, 512) in this dataset
        clap_feat = sample_data['clap_features']  # (1, 512) in this dataset

        if self.transform:
            clip_feat, clap_feat = self.transform((clip_feat, clap_feat))

        return clip_feat, clap_feat

In [None]:
# Build the dataset from the leading 10% of the path-sorted feature files.
vgg_sound = LargeVideoDataset(data_dir=data_dir, subset_ratio=0.1)

# DataLoader

In [None]:
# Hold out 10% of the samples for validation and 10% for testing;
# whatever remains is used for training.
val_ratio = 0.1
test_ratio = 0.1

total_len = len(vgg_sound)
val_len, test_len = (int(total_len * ratio) for ratio in (val_ratio, test_ratio))
train_len = total_len - (val_len + test_len)

# A fixed seed makes the three subsets identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    vgg_sound,
    [train_len, val_len, test_len],
    generator=split_generator,
)


In [None]:
# Settings shared by the train / validation / test DataLoaders.
num_workers = 4   # worker processes used by each DataLoader
batch_size = 64   # samples per mini-batch

In [None]:
# Options common to all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Sanity check: pull a single mini-batch from the training loader and print
# the shape of both feature tensors.
next_batch = next(iter(train_loader))
clip_feat, clap_feat = next_batch
for label, feat in (("Clip Feature: ", clip_feat), ("CLAP Feature: ", clap_feat)):
    print(label, feat.shape)  # (64, 64, 512) and (64, 1, 512) for a full batch


Clip Feature:  torch.Size([64, 64, 512])
CLAP Feature:  torch.Size([64, 1, 512])


# Model

In [None]:


class V2AMapperMLP(nn.Module):
    """
    Mean-pools a sequence of feature vectors and maps the result with an MLP.

    Input of shape (B, T, input_dim) (e.g. (B, 64, 512) CLIP features) is
    averaged over the T axis to (B, input_dim) and passed through a five-layer
    GELU MLP, giving an output of shape (B, output_dim).
    """
    def __init__(self, input_dim=512, hidden_dim=1024, output_dim=512):
        """
        Args:
            input_dim: Size of each input feature vector.
            hidden_dim: Base hidden width; the two middle layers use twice this.
            output_dim: Size of the output vector.
        """
        super().__init__()
        # Pooling is a plain mean over the sequence axis, done in forward().
        # The nn.AdaptiveAvgPool2d module that used to be created here was never
        # called; it had no parameters, so state_dict keys are unchanged and
        # existing checkpoints still load.
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.GELU(),
            nn.Linear(hidden_dim * 2, hidden_dim * 2),
            nn.GELU(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Map x of shape (B, T, input_dim) to a tensor of shape (B, output_dim)."""
        # Average over the sequence axis: (B, T, input_dim) -> (B, input_dim).
        pooled = x.mean(dim=1)

        # Project the pooled vector: (B, input_dim) -> (B, output_dim).
        out = self.mlp(pooled)
        return out


In [None]:
def train_model(model, train_loader, criterion, optimizer, scaler):
    """Run one training epoch and return the mean per-batch loss.

    Uses the notebook-level ``device`` for tensor placement. Mixed precision
    (autocast) is active only when ``device`` is a CUDA device; on CPU the
    forward pass runs in full precision.

    Args:
        model: Module called as ``model(clip_feat)``.
        train_loader: Iterable of ``(clip_feat, clap_feat)`` batches that also
            supports ``len()``.
        criterion: Loss called as ``criterion(outputs, clap_feat.squeeze(1))``.
        optimizer: Optimizer stepped through ``scaler``.
        scaler: Gradient scaler exposing ``scale``, ``step`` and ``update``.

    Returns:
        Sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    total_loss = 0
    try:
        for i, (clip_feat, clap_feat) in enumerate(train_loader):
            clip_feat = clip_feat.to(device)
            clap_feat = clap_feat.to(device)

            # Forward pass. torch.autocast replaces the deprecated
            # torch.cuda.amp.autocast(); it is disabled off-GPU, which matches
            # what the old context manager did when CUDA was unavailable.
            optimizer.zero_grad()
            with torch.autocast(device_type=device.type, enabled=(device.type == "cuda")):
                outputs = model(clip_feat)
                loss = criterion(outputs, clap_feat.squeeze(1))

            # Backward pass through the gradient scaler.
            scaler.scale(loss).backward()  # replaces loss.backward()
            scaler.step(optimizer)         # replaces optimizer.step()
            scaler.update()                # adjusts the scale factor for the next step

            total_loss += loss.item()

            # Show the running mean loss and the current learning rate.
            batch_bar.set_postfix(
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

            batch_bar.update()  # advance the progress bar by one batch

            # No per-batch torch.cuda.empty_cache(): the batch tensors are
            # released when their names are rebound on the next iteration, and
            # flushing the caching allocator every step only slows training.
    finally:
        # Always close the bar (validate_model does the same), even on error.
        batch_bar.close()

    return total_loss / len(train_loader)

    

In [None]:

def validate_model(model, val_loader, criterion, optimizer):
    """Evaluate the model on ``val_loader`` and return the mean per-batch loss.

    Uses the notebook-level ``device`` for tensor placement. No gradients are
    computed and no parameters are updated.

    Args:
        model: Module called as ``model(clip_feat)``; switched to eval mode.
        val_loader: Iterable of ``(clip_feat, clap_feat)`` batches that also
            supports ``len()``.
        criterion: Loss called as ``criterion(outputs, clap_feat.squeeze(1))``.
        optimizer: Only read to display the current learning rate in the
            progress bar; it is never stepped here.

    Returns:
        Sum of the batch losses divided by ``len(val_loader)``.
    """
    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0
    try:
        # One no_grad context around the whole loop instead of one per batch.
        with torch.no_grad():
            for i, (clip_feat, clap_feat) in enumerate(val_loader):
                clip_feat = clip_feat.to(device)
                clap_feat = clap_feat.to(device)

                outputs = model(clip_feat)
                loss = criterion(outputs, clap_feat.squeeze(1))

                total_loss += loss.item()

                # Show the running mean loss and the current learning rate.
                batch_bar.set_postfix(
                    loss="{:.04f}".format(float(total_loss / (i + 1))),
                    lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

                batch_bar.update()  # advance the progress bar by one batch

                # No per-batch torch.cuda.empty_cache(): the batch tensors are
                # released when their names are rebound on the next iteration,
                # and flushing the caching allocator every step only costs time.
    finally:
        # Close the bar even if a batch raises.
        batch_bar.close()

    return total_loss / len(val_loader)

In [None]:
def train(model, train_loader, val_loader, criterion, optimizer, scaler, ckpt_dir, num_epochs=10):
    """Train for ``num_epochs`` epochs and checkpoint the best model.

    After every epoch the model is validated; whenever the validation loss is
    strictly lower than the best seen so far, ``model.state_dict()`` is written
    to ``<ckpt_dir>/best_model.pth`` (overwriting the previous best).

    Args:
        model: Model to train.
        train_loader: Loader passed to ``train_model``.
        val_loader: Loader passed to ``validate_model``.
        criterion: Loss function passed to both helpers.
        optimizer: Optimizer passed to both helpers.
        scaler: Gradient scaler passed to ``train_model``.
        ckpt_dir: Directory for the checkpoint, with or without a trailing
            separator. It is created if it does not exist. An empty string
            means the current working directory.
        num_epochs: Number of epochs to run.
    """
    # Build the path with os.path.join so that "ckpts" and "ckpts/" both resolve
    # to ckpts/best_model.pth; plain string concatenation turned "ckpts" into
    # "ckptsbest_model.pth".
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)
    best_model_path = os.path.join(ckpt_dir, "best_model.pth")

    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train_model(model, train_loader, criterion, optimizer, scaler)
        val_loss = validate_model(model, val_loader, criterion, optimizer)

        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Save the model only when the validation loss has improved.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_path)
            print("Model saved!")
        else:
            print("Validation loss did not improve, model not saved.")
    print("Training complete!")
    print(f"Best validation loss: {best_val_loss:.4f}")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CosineLoss(nn.Module):
    """Hinge-style cosine loss: mean over the batch of max(margin - cos_sim, 0)."""

    def __init__(self, margin=0.8):  # a fairly high margin is recommended
        super().__init__()
        self.margin = margin

    def forward(self, output, target):
        """Return the scalar loss between ``output`` and ``target``.

        output: (B, 512)
        target: (B, 1, 512) or (B, 512); a 3-D target has dim 1 squeezed.
        """
        flat_target = target.squeeze(1) if target.ndim == 3 else target
        similarity = F.cosine_similarity(output, flat_target, dim=1)  # (B,)
        # Samples whose similarity already exceeds the margin contribute zero.
        shortfall = (self.margin - similarity).clamp(min=0)
        return shortfall.mean()


In [None]:
# Learning rate for Adam.
lr = 0.001

model = V2AMapperMLP(input_dim=512, hidden_dim=1024, output_dim=512).to(device)
# With margin=1 the loss is effectively 1 - cosine_similarity averaged over the
# batch, because the cosine similarity cannot meaningfully exceed the margin.
criterion = CosineLoss(margin=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# torch.amp.GradScaler replaces the deprecated torch.cuda.amp.GradScaler().
# Gradient scaling only applies to CUDA mixed precision, so it is disabled
# when running on the CPU.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

# The checkpoint directory is defined once and passed to train(); previously a
# `ckpt_dir` variable was assigned but a different literal was passed.
# The value is the directory the training call has been writing to.
ckpt_dir = "ckpts/"
os.makedirs(ckpt_dir, exist_ok=True)  # torch.save does not create directories

train(model, train_loader, val_loader, criterion, optimizer, scaler, ckpt_dir=ckpt_dir, num_epochs=30)

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/10


  with torch.cuda.amp.autocast():
                                                                                  

Train Loss: 0.5001, Val Loss: 0.4590
Model saved!
Epoch 2/10


                                                                                  

Train Loss: 0.4443, Val Loss: 0.4270
Model saved!
Epoch 3/10


                                                                                  

Train Loss: 0.4241, Val Loss: 0.4127
Model saved!
Epoch 4/10


                                                                                  

Train Loss: 0.4110, Val Loss: 0.4058
Model saved!
Epoch 5/10


                                                                                  

Train Loss: 0.3995, Val Loss: 0.3974
Model saved!
Epoch 6/10


                                                                                  

Train Loss: 0.3902, Val Loss: 0.3908
Model saved!
Epoch 7/10


                                                                                  

Train Loss: 0.3822, Val Loss: 0.3854
Model saved!
Epoch 8/10


                                                                                  

Train Loss: 0.3748, Val Loss: 0.3849
Model saved!
Epoch 9/10


                                                                                  

Train Loss: 0.3683, Val Loss: 0.3840
Model saved!
Epoch 10/10


                                                                                  

Train Loss: 0.3620, Val Loss: 0.3809
Model saved!
Training complete!
Best validation loss: 0.3809


