In [165]:
import sys
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    !git clone https://github.com/YuanGongND/ast
    sys.path.append('./ast')
%cd /content/ast/

! pip install timm==0.4.5
! pip install wget
import os, csv, argparse, wget
os.environ['TORCH_HOME'] = '/content/ast/pretrained_models'
if os.path.exists('/content/ast/pretrained_models') == False:
  os.mkdir('/content/ast/pretrained_models')
import torch, torchaudio, timm
import numpy as np
from torch.cuda.amp import autocast
import IPython
import os
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms #pytorch 做data augmentation的套件
from torchvision import datasets #pytorch的dataset
import pandas as pd
from torch.utils.data import DataLoader, Dataset #pytorch 訓練的data以Dataset和dataloader呈現，通常是將一個dataset照自己的檔案整理、標籤好以後丟到dataloader中，以dataloader的形式進行訓練 
import time

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [184]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:

from src.models import ASTModel

# Create a new class that inherits the original ASTModel class
class ASTModelVis(ASTModel):
    @autocast()
    def forward(self, x):
        """
        :param x: the input spectrogram, expected shape: (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        :return: prediction
        """
        # expect input x = (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        x = x.unsqueeze(1)
        x = x.transpose(2, 3)

        B = x.shape[0]
        x = self.v.patch_embed(x)
        cls_tokens = self.v.cls_token.expand(B, -1, -1)
        dist_token = self.v.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
        x = x + self.v.pos_embed
        x = self.v.pos_drop(x)
        for blk in self.v.blocks:
            x = blk(x)
        x = self.v.norm(x)
        x = (x[:, 0] + x[:, 1]) / 2
        feature = x

        x = self.mlp_head(x)
        return x, feature
    

    def get_att_map(self, block, x):
        qkv = block.attn.qkv
        num_heads = block.attn.num_heads
        scale = block.attn.scale
        B, N, C = x.shape
        qkv = qkv(x).reshape(B, N, 3, num_heads, C // num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
        attn = (q @ k.transpose(-2, -1)) * scale
        attn = attn.softmax(dim=-1)
        return attn

    def forward_visualization(self, x):
        # expect input x = (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        x = x.unsqueeze(1)
        x = x.transpose(2, 3)

        B = x.shape[0]
        x = self.v.patch_embed(x)
        cls_tokens = self.v.cls_token.expand(B, -1, -1)
        dist_token = self.v.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
        x = x + self.v.pos_embed
        x = self.v.pos_drop(x)
        # save the attention map of each of 12 Transformer layer
        att_list = []
        for blk in self.v.blocks:
            cur_att = self.get_att_map(blk, x)
            att_list.append(cur_att)
            x = blk(x)
        return att_list
    

def make_features(wav_name, mel_bins, target_length=1024):
    waveform, sr = torchaudio.load(wav_name)
    assert sr == 16000, 'input audio sampling rate must be 16kHz'

    fbank = torchaudio.compliance.kaldi.fbank(
        waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
        window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10)

    n_frames = fbank.shape[0]

    p = target_length - n_frames
    if p > 0:
        m = torch.nn.ZeroPad2d((0, 0, 0, p))
        fbank = m(fbank)
    elif p < 0:
        fbank = fbank[0:target_length, :]

    fbank = (fbank - (-4.2677393)) / (4.5689974 * 2)
    return fbank


def load_label(label_csv):
    with open(label_csv, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        lines = list(reader)
    labels = []
    ids = []  # Each label has a unique id such as "/m/068hy"
    for i1 in range(1, len(lines)):
        id = lines[i1][1]
        label = lines[i1][2]
        ids.append(id)
        labels.append(label)
    return labels

# Create an AST model and download the AudioSet pretrained weights
audioset_mdl_url = 'https://www.dropbox.com/s/cv4knew8mvbrnvq/audioset_0.4593.pth?dl=1'
if os.path.exists('/content/ast/pretrained_models/audio_mdl.pth') == False:
  wget.download(audioset_mdl_url, out='/content/ast/pretrained_models/audio_mdl.pth')

audioset_mdl_url = 'https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1'
if os.path.exists('/content/ast/pretrained_models/speechcommands_10_10_0.9812.pth') == False:
  wget.download(audioset_mdl_url, out='/content/ast/speechcommands_10_10_0.9812.pth')

pretrained_mdl_path = audioset_mdl_url
# get the frequency and time stride of the pretrained model from its name
fstride, tstride = int(pretrained_mdl_path.split('/')[-1].split('_')[1]), int(pretrained_mdl_path.split('/')[-1].split('_')[2].split('.')[0])

# Assume each input spectrogram has 1024 time frames
input_tdim = 1024
checkpoint_path =  '/content/ast/speechcommands_10_10_0.9812.pth'
# now load the visualization model
ast_mdl = ASTModelVis(label_dim=35, input_tdim=input_tdim, imagenet_pretrain=False, audioset_pretrain=False, fstride=fstride, tstride=tstride)
print(f'[*INFO] load checkpoint: {checkpoint_path}')


# audioset input sequence length is 1024
pretrained_mdl_path = '/content/ast/speechcommands_10_10_0.9812.pth'
# get the frequency and time stride of the pretrained model from its name
fstride, tstride = int(pretrained_mdl_path.split('/')[-1].split('_')[1]), int(pretrained_mdl_path.split('/')[-1].split('_')[2].split('.')[0])
# The input of audioset pretrained model is 1024 frames.
input_tdim = 1024

# initialize an AST model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sd = torch.load(pretrained_mdl_path, map_location=device)

new_ckpt = {}
for key in sd:
    if "v.pos_embed" in key or "mlp_head.1." in key:
      continue
    new_ckpt[key[7:]] = sd[key]

audio_model = ASTModelVis(input_tdim=input_tdim, fstride=fstride, tstride=tstride)
audio_model = torch.nn.DataParallel(audio_model)
audio_model.load_state_dict(new_ckpt, strict=False)

In [171]:
test_label_path = '/content/drive/MyDrive/LogME-CTC/different_upstream_model/n_y_test_speech.npy'
test_feature_path = '/content/drive/MyDrive/LogME-CTC/different_upstream_model/n_x_test_speech.npy'
test_features = np.load(test_feature_path)
test_labels = np.argmax(np.load(test_label_path), axis = 1)
test_features = np.squeeze(test_features, axis=3)

train_label_path = '/content/drive/MyDrive/LogME-CTC/different_upstream_model/n_y_train_speech.npy'
train_feature_path = '/content/drive/MyDrive/LogME-CTC/different_upstream_model/n_x_train_speech.npy'
train_features = np.load(train_feature_path)
train_labels = np.argmax(np.load(train_label_path),axis = 1)
train_features = np.squeeze(train_features, axis=3)

In [172]:

torch.manual_seed(0)
import random
random.seed(0)
np.random.seed(0)

In [203]:
from torch.utils.data import DataLoader
train_set = torch.utils.data.TensorDataset(torch.Tensor(train_features),torch.Tensor(train_labels))
val_num = 1000
batch_size = 1
train_set, val_set = torch.utils.data.random_split(train_set, [17049-val_num, val_num])
train_loader = DataLoader(train_set, batch_size=batch_size, sampler=None,pin_memory=True, num_workers=8,shuffle=True)
val_loader = DataLoader(train_set, batch_size=batch_size, sampler=None,pin_memory=True, num_workers=8,shuffle=True)

test_set = torch.utils.data.TensorDataset(torch.Tensor(test_features),torch.Tensor(test_labels))
test_loader = DataLoader(test_set, batch_size=batch_size, sampler=None,pin_memory=True, num_workers=8,shuffle=False)

In [178]:
data[1][1]

tensor(3.)

In [None]:
from tqdm import tqdm
model = ASTModelVis(label_dim=10, input_tdim=60, imagenet_pretrain=True, audioset_pretrain=False).cuda()
loss = nn.CrossEntropyLoss() # 因為是 classification task，所以 loss 使用 CrossEntropyLoss
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.0001) # optimizer 使用Adagrad
num_epoch = 5

#adam (momemtum)



for epoch in range(num_epoch):
    epoch_start_time = time.time()
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    model.train() # 確保 model 是在 train model (開啟 Dropout 等...)
    for i, data in enumerate(train_loader):
        optimizer.zero_grad() # 用 optimizer 將 model 參數的 gradient 
        train_pred,_ = model(data[0].cuda()) # 利用 model 得到預測的機率分佈 這邊實際上就是去呼叫 model 的 forward 函數
        batch_loss = loss(train_pred, data[1].type(torch.LongTensor).cuda()) # 計算 loss （注意 prediction 跟 label 必須同時在 CPU 或是 GPU 上）
        batch_loss.backward() # 利用 back propagation 算出每個參數的 gradient
        optimizer.step() # 以 optimizer 用 gradient 更新參數值
        print("\r",i, "/",len(train_set)/batch_size,sep="", end = "")

        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == data[1].numpy())
        train_loss += batch_loss.item()
    
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            val_pred,_ = model(data[0].cuda())
            batch_loss = loss(val_pred, data[1].cuda())

            val_acc += np.sum(np.argmax(val_pred.cpu().data.numpy(), axis=1) == data[1].numpy())
            val_loss += batch_loss.item()

        #將結果 print 出來
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % \
            (epoch + 1, num_epoch, time.time()-epoch_start_time, \
             train_acc/train_set.__len__(), train_loss/train_set.__len__(), val_acc/val_set.__len__(), val_loss/val_set.__len__()))

In [200]:
extracted_features = torch.zeros([0,768])
model.eval()
train_acc = 0
with torch.no_grad():
    for idx, (data, label) in enumerate(test_loader):
        train_pred, train_feature = model(data.cuda())
        extracted_features = torch.cat((extracted_features, train_feature.cpu()),0)
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == label.cpu().data.numpy())
        print("\r",idx,"/", len(test_loader),train_acc, end ="")
features_output = extracted_features
print(features_output.shape, train_acc/test_set.__len__())
np.save("/content/drive/MyDrive/LogME-CTC/different_upstream_model/tuned_mel_features_output.npy",features_output.detach().numpy())



 4262 / 4263 3630torch.Size([4263, 768]) 0.8515130190007038


In [201]:
from src.models import ASTModel
ast_model = ASTModelVis(label_dim=10, input_tdim=60, imagenet_pretrain=True, audioset_pretrain=False).cuda()
extracted_features = torch.zeros([0,768])
ast_model.eval()
train_acc = 0
with torch.no_grad():
    for idx, (data, label) in enumerate(test_loader):
        train_pred, train_feature = ast_model(data.cuda())
        extracted_features = torch.cat((extracted_features, train_feature.cpu()),0)
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == label.cpu().data.numpy())
        print("\r",idx,"/", len(test_loader),train_acc, end ="")
features_output = extracted_features
print(features_output.shape, train_acc/test_set.__len__())
np.save("/content/drive/MyDrive/LogME-CTC/different_upstream_model/image_pretrained_mel_features_output.npy",features_output.detach().numpy())

---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=60
 4262 / 4263 367torch.Size([4263, 768]) 0.08608960825709594


In [202]:
from src.models import ASTModel
ast_model = ASTModelVis(label_dim=10, input_tdim=60, imagenet_pretrain=True, audioset_pretrain=True).cuda()
extracted_features = torch.zeros([0,768])
ast_model.eval()
train_acc = 0
with torch.no_grad():
    for idx, (data, label) in enumerate(test_loader):
        train_pred, train_feature = ast_model(data.cuda())
        extracted_features = torch.cat((extracted_features, train_feature.cpu()),0)
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == label.cpu().data.numpy())
        print("\r",idx,"/", len(test_loader),train_acc, end ="")
features_output = extracted_features
print(features_output.shape, train_acc/test_set.__len__())
np.save("/content/drive/MyDrive/LogME-CTC/different_upstream_model/audio_pretrained_mel_features_output.npy",features_output.detach().numpy())

---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=60
 4262 / 4263 447torch.Size([4263, 768]) 0.10485573539760731


In [204]:
from src.models import ASTModel
ast_model = ASTModelVis(label_dim=10, input_tdim=60, imagenet_pretrain=True, audioset_pretrain=True).cuda()
audio_model.load_state_dict(new_ckpt, strict=False)
extracted_features = torch.zeros([0,768])
ast_model.eval()
train_acc = 0
with torch.no_grad():
    for idx, (data, label) in enumerate(test_loader):
        train_pred, train_feature = ast_model(data.cuda())
        extracted_features = torch.cat((extracted_features, train_feature.cpu()),0)
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == label.cpu().data.numpy())
        print("\r",idx,"/", len(test_loader),train_acc, end ="")
features_output = extracted_features
print(features_output.shape, train_acc/test_set.__len__())
np.save("/content/drive/MyDrive/LogME-CTC/different_upstream_model/ks_pretrained_mel_features_output.npy",features_output.detach().numpy())

---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=60




 4262 / 4263 256torch.Size([4263, 768]) 0.0600516068496364


In [153]:
from src.models import ASTModel
ast_model = ASTModelVis(label_dim=10, input_tdim=60, imagenet_pretrain=False, audioset_pretrain=False).cuda()
extracted_features = torch.zeros([0,768])
ast_model.eval()
train_acc = 0
with torch.no_grad():
    for idx, (data, label) in enumerate(train_loader):
        train_pred, train_feature = ast_model(data.cuda())
        extracted_features = torch.cat((extracted_features, train_feature.cpu()),0)
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == np.argmax(label.cpu().data.numpy(), axis=1))
        print("\r",idx,"/", len(train_loader),train_acc, end ="")
        torch.cuda.empty_cache()
features_output = extracted_features
print(features_output.shape, train_acc/train_set.__len__())
np.save("/content/drive/MyDrive/LogME-CTC/different_upstream_model/no_pretrained_mel_features_output.npy",features_output.detach().numpy())

---------------AST Model Summary---------------
ImageNet pretraining: False, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=60
 33 / 34 439torch.Size([4263, 768]) 0.10297912268355618


In [96]:
from src.models import ASTModel
ast_model = ASTModelVis(label_dim=10, input_tdim=60, imagenet_pretrain=True, audioset_pretrain=True).cuda()
extracted_features = torch.zeros([0,768])
ast_model.eval()
train_acc = 0
with torch.no_grad():
    for idx, (data, label) in enumerate(train_loader):
        train_pred, train_feature = ast_model(data.cuda())
        extracted_features = torch.cat((extracted_features, train_feature.cpu()),0)
        train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == np.argmax(label.cpu().data.numpy(), axis=1))
        print("\r",idx,"/", len(train_loader),train_acc, end ="")
        torch.cuda.empty_cache()
features_output = extracted_features
print(features_output.shape, train_acc/train_set.__len__())
np.save("/content/drive/MyDrive/LogME-CTC/different_upstream_model/speech_command_pretrained_mel_features_output.npy",features_output.detach().numpy())

---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=60
 166 / 167 2339torch.Size([21312, 768]) 0.10975037537537538


In [None]:
from src.traintest import train, validate