In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("HF_TOKEN")
# from huggingface_hub import HfApi, login
# from kaggle_secrets import UserSecretsClient
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import sys
import subprocess
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torchaudio

# AST Fine-tuning on UrbanSound8K (Kaggle version)
# - Auto-detect UrbanSound8K in /kaggle/input
# - Clone AST repo into /kaggle/working/ast
# - Download AudioSet checkpoint
# - 10-fold cross-validation training



# 0. ENV INFO & SEED

print("Listing /kaggle/input ...")


# Seed every random number generator used in this notebook process with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)



# 1. INSTALL REQUIRED LIBRARIES (if needed)
import gc # <--- Import thư viện dọn rác

def pip_install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Args:
        package: A pip requirement string, e.g. ``"timm==0.4.5"``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a non-zero status.
    """
    pip_base = [sys.executable, "-m", "pip", "install"]
    # "--no-cache-dir": do not keep pip's temporary download cache around.
    # "-q" (quiet): reduce the amount of log output printed to the screen.
    pip_flags = ["--no-cache-dir", "-q"]
    subprocess.check_call(pip_base + pip_flags + [package])

print("[*INFO] Installing libraries...")
# timm==0.4.5 is the version the AST repo uses; the packages are installed in this order.
for _requirement in ("timm==0.4.5", "wget", "librosa"):
    pip_install(_requirement)


print("[*INFO] Cleaning up memory after installation...")
gc.collect()
# Only touch the CUDA allocator cache when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# ======================================================

import timm
import wget

# 2. CLONE THE AST REPO AND IMPORT ASTModel
REPO_DIR = "/kaggle/working/ast"

# Clone only when the checkout is not already present (e.g. when the cell is re-run).
if os.path.exists(REPO_DIR):
    print("[*INFO] AST repo already exists at", REPO_DIR)
else:
    print("[*INFO] Cloning AST repo ...")
    _clone_cmd = ["git", "clone", "https://github.com/YuanGongND/ast", REPO_DIR]
    subprocess.check_call(_clone_cmd)

# Make the repo importable and make it the working directory; later cells
# resolve paths relative to the current directory.
sys.path.append(REPO_DIR)
os.chdir(REPO_DIR)

from src.models import ASTModel  # AST model class


Listing /kaggle/input ...
Device: cuda
[*INFO] Installing libraries...
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 287.4/287.4 kB 10.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 363.4/363.4 MB 323.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.8/13.8 MB 263.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 325.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 883.7/883.7 kB 194.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 325.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 211.5/211.5 MB 327.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.3/56.3 MB 332.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 127.9/127.9 MB 316.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 MB 329.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 328.1 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.


[*INFO] Cleaning up memory after installation...
[*INFO] Cloning AST repo ...


Cloning into '/kaggle/working/ast'...
  @autocast()


In [None]:

# 3. LOCATE THE UrbanSound8K METADATA CSV

# Take the first directory (in os.walk order) under /kaggle/input that holds the CSV.
US8K_META_PATH = next(
    (
        os.path.join(folder, "UrbanSound8K.csv")
        for folder, _subdirs, names in os.walk("/kaggle/input")
        if "UrbanSound8K.csv" in names
    ),
    None,
)

if US8K_META_PATH is None:
    # The message is user-facing and intentionally kept in its original language.
    _missing_csv_msg = (
        "Không tìm thấy UrbanSound8K.csv trong /kaggle/input. "
        "Kiểm tra lại dataset UrbanSound8K đã add vào notebook."
    )
    raise FileNotFoundError(_missing_csv_msg)

METADATA_CSV = US8K_META_PATH
print("[*INFO] Found UrbanSound8K.csv at:", US8K_META_PATH)


# 4. FIND THE EXACT AUDIO_ROOT BY PROBING FOR A REAL FILE

df_meta = pd.read_csv(METADATA_CSV)
# Use the first metadata row as a probe: its wav file must exist in a "fold<k>" directory.
sample_row = df_meta.iloc[0]
sample_fname, sample_fold = sample_row["slice_file_name"], sample_row["fold"]
target_fold_dir = f"fold{sample_fold}"

print(f"[*INFO] Sample row -> fold={sample_fold}, fname={sample_fname}")

AUDIO_ROOT = None
for root, dirs, files in os.walk("/kaggle/input"):
    # Skip every directory that is not the probe's fold directory containing the probe file.
    if os.path.basename(root) != target_fold_dir or sample_fname not in files:
        continue
    # root = .../audio/foldX, so its parent is the directory that holds all folds.
    AUDIO_ROOT = os.path.dirname(root)
    break

if AUDIO_ROOT is None:
    # The message is user-facing and intentionally kept in its original language.
    _missing_audio_msg = (
        f"Không tìm thấy thư mục audio chứa fold{sample_fold} và file {sample_fname} trong /kaggle/input."
    )
    raise FileNotFoundError(_missing_audio_msg)

print("[*INFO] AUDIO_ROOT detected as:", AUDIO_ROOT)

# Directory for saved checkpoints
CKPT_DIR = "/kaggle/working/ast_us8k_checkpoints"
os.makedirs(CKPT_DIR, exist_ok=True)

[*INFO] Found UrbanSound8K.csv at: /kaggle/input/urbansound8k/UrbanSound8K.csv
[*INFO] Sample row -> fold=5, fname=100032-3-0-0.wav
[*INFO] AUDIO_ROOT detected as: /kaggle/input/urbansound8k


In [3]:
# Sanity check: show the audio root directory that was detected.
print(AUDIO_ROOT)

/kaggle/input/urbansound8k


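New code: fine-tune AST on UrbanSound8K by building ESC-50-style datafiles and launching the AST repo's `src/run.py` once per fold.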

In [4]:
from pathlib import Path
# Normalise REPO_DIR to a Path; if no earlier cell defined it, use the current directory.
try:
    REPO_DIR = Path(REPO_DIR)
except NameError:
    REPO_DIR = Path('.').resolve()

# Point TORCH_HOME at <repo>/pretrained_models and make sure that directory exists.
TORCH_HOME = REPO_DIR.joinpath('pretrained_models')
os.environ['TORCH_HOME'] = str(TORCH_HOME)
TORCH_HOME.mkdir(exist_ok=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Repo root:', REPO_DIR)
print('Device:', device)

Repo root: /kaggle/working/ast
Device: cuda


In [5]:
# Build the label CSV with columns (index, mid, display_name), one row per class.
meta_df = pd.read_csv(METADATA_CSV)
label_csv = REPO_DIR / 'urban8k_data' / 'urban8k_class_labels_indices.csv'
label_csv.parent.mkdir(exist_ok=True)

# One (classID, class) pair per class, ordered by classID.
unique_classes = meta_df[['classID', 'class']].drop_duplicates().sort_values('classID')

label_lines = ['index,mid,display_name\n']
for class_id, class_name in zip(unique_classes['classID'], unique_classes['class']):
    idx = int(class_id)
    # The "mid" is a synthetic identifier derived from the zero-padded class index.
    label_lines.append(f"{idx},/m/urban{idx:02d},{class_name}\n")

with open(label_csv, 'w') as f:
    f.writelines(label_lines)

print('Label CSV written to:', label_csv)


Label CSV written to: /kaggle/working/ast/urban8k_data/urban8k_class_labels_indices.csv


In [6]:
import json, shutil

# Directory that will hold the per-fold JSON datafiles (ESC-50 style)
data_dir = REPO_DIR.joinpath('urban8k_data', 'datafiles')
data_dir.mkdir(parents=True, exist_ok=True)

def rows_for_df(df):
    """Convert metadata rows into datafile entries.

    Args:
        df: DataFrame with ``fold``, ``slice_file_name`` and ``classID`` columns.

    Returns:
        A list with one dict per row of ``df``, in row order. ``'wav'`` is the
        path ``AUDIO_ROOT/fold<fold>/<slice_file_name>`` as a string and
        ``'labels'`` is ``/m/urban<classID>`` with the class ID zero-padded to
        two digits.
    """
    return [
        {
            'wav': str(Path(AUDIO_ROOT) / f"fold{int(fold_id)}" / file_name),
            'labels': f"/m/urban{int(class_id):02d}",
        }
        for fold_id, file_name, class_id in zip(df['fold'], df['slice_file_name'], df['classID'])
    ]

# For each fold k in 1..10, write a train file (all other folds) and an eval file (fold k).
for fold in range(1, 11):
    is_held_out = meta_df['fold'] == fold
    train_df, eval_df = meta_df[~is_held_out], meta_df[is_held_out]
    outputs = (
        (data_dir / f'urban_train_data_{fold}.json', train_df),
        (data_dir / f'urban_eval_data_{fold}.json', eval_df),
    )
    for out_path, split_df in outputs:
        with open(out_path, 'w') as f:
            json.dump({'data': rows_for_df(split_df)}, f, indent=1)
    print(f'Fold {fold}: train {len(train_df)}, eval {len(eval_df)}')


Fold 1: train 7859, eval 873
Fold 2: train 7844, eval 888
Fold 3: train 7807, eval 925
Fold 4: train 7742, eval 990
Fold 5: train 7796, eval 936
Fold 6: train 7909, eval 823
Fold 7: train 7894, eval 838
Fold 8: train 7926, eval 806
Fold 9: train 7916, eval 816
Fold 10: train 7895, eval 837


In [None]:
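# Compute dataset mean/std over fbank features (optional; when this helper is not run, the hard-coded fallback values below are used).
# NOTE(review): the fallback values do not look like the AudioSet statistics published with AST — confirm where they came from.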
import torchaudio

def compute_norm_stats(df, target_length=1024, mel_bins=128, target_sr=16000, max_files=None):
    """Estimate the global mean and standard deviation of fbank features.

    For every selected row the file ``AUDIO_ROOT/fold<fold>/<slice_file_name>``
    is loaded, mixed down to one channel, resampled to ``target_sr`` when
    needed and converted to Kaldi-compatible filterbank features. The feature
    matrix is zero-padded or truncated to ``target_length`` frames, and the sum
    and sum of squares of every element (padding included) are accumulated.

    NOTE(review): the training data loader lives in the AST repo, outside this
    file; confirm it applies the same preprocessing, otherwise these statistics
    will not match what the model sees.

    Args:
        df: DataFrame with ``fold`` and ``slice_file_name`` columns.
        target_length: Number of frames each feature matrix is padded/cut to.
        mel_bins: Number of mel bins passed to the fbank computation.
        target_sr: Sample rate (Hz) that audio is resampled to.
        max_files: If given, use a random sample of at most this many rows
            (seeded with ``SEED``); ``None`` uses every row.

    Returns:
        ``(mean, std)`` as Python floats.

    Raises:
        ZeroDivisionError: If no rows are processed.
    """
    if max_files is None:
        rows = df
    else:
        # DataFrame.sample raises when n exceeds the number of rows, so clamp
        # the request instead of failing on small frames.
        rows = df.sample(n=min(max_files, len(df)), random_state=SEED)

    resamplers = {}  # source sample rate -> Resample transform, built once per rate
    total = 0.0
    total_sq = 0.0
    count = 0
    for fold_id, file_name in zip(rows['fold'], rows['slice_file_name']):
        wav_path = Path(AUDIO_ROOT) / f"fold{int(fold_id)}" / file_name
        wav, sr = torchaudio.load(str(wav_path))
        if wav.shape[0] > 1:
            # Average the channels into a single one.
            wav = wav.mean(dim=0, keepdim=True)
        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(sr, target_sr)
            wav = resamplers[sr](wav)
            sr = target_sr
        fb = torchaudio.compliance.kaldi.fbank(
            wav, htk_compat=True, sample_frequency=sr, use_energy=False,
            window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10
        )
        # Pad with zero frames at the end, or drop trailing frames, to reach target_length.
        p = target_length - fb.shape[0]
        if p > 0:
            fb = torch.nn.functional.pad(fb, (0, 0, 0, p))
        elif p < 0:
            fb = fb[:target_length, :]
        total += fb.sum().item()
        total_sq += (fb ** 2).sum().item()
        count += fb.numel()

    mean = total / count
    # E[x^2] - E[x]^2 can come out slightly negative through rounding; a negative
    # float raised to 0.5 would yield a complex number, so clamp at zero.
    var = max(total_sq / count - mean ** 2, 0.0)
    std = var ** 0.5
    return mean, std


# Use the hard-coded normalisation statistics unless an earlier cell has
# already defined both DATASET_MEAN and DATASET_STD.
if not {'DATASET_MEAN', 'DATASET_STD'} <= set(globals()):
    DATASET_MEAN = -1.7362523965764032
    DATASET_STD = 3.2758855893221015
print('Using mean/std:', DATASET_MEAN, DATASET_STD)


Using mean/std: -1.7362523965764032 3.2758855893221015


In [8]:
# The repo root is taken to be the current working directory.
REPO_DIR = Path('.').resolve()
# Pretrained weights go two levels above the repo root.
PRETRAIN_DIR = (REPO_DIR.parent.parent / 'pretrained_models').resolve()
PRETRAIN_DIR.mkdir(parents=True, exist_ok=True)
os.environ['TORCH_HOME'] = str(PRETRAIN_DIR)
print('TORCH_HOME:', PRETRAIN_DIR)
# Helper to launch training via src/run.py with ESC-50-like hyperparameters
def run_fold(fold, epochs=3, batch_size=12, lr=1e-5):
    """Train and evaluate one fold by running ``src/run.py`` in a subprocess.

    The experiment directory ``REPO_DIR/urban8k_exp/fold<fold>`` is deleted if
    it already exists and then recreated, so each call starts from a clean
    directory. The train/eval datafiles for the fold are read from
    ``data_dir``.

    Args:
        fold: Fold number; selects ``urban_train_data_<fold>.json`` and
            ``urban_eval_data_<fold>.json``.
        epochs: Value passed as ``--n-epochs``.
        batch_size: Value passed as ``--batch-size``.
        lr: Value passed as ``--lr``.

    Raises:
        subprocess.CalledProcessError: If the training process exits non-zero.
    """
    exp_dir = REPO_DIR / 'urban8k_exp' / f'fold{fold}'
    if exp_dir.exists():
        shutil.rmtree(exp_dir)
    exp_dir.mkdir(parents=True, exist_ok=True)
    train_json = data_dir / f'urban_train_data_{fold}.json'
    eval_json = data_dir / f'urban_eval_data_{fold}.json'

    cmd = [
        sys.executable, 'src/run.py',
        '--model', 'ast',
        '--dataset', 'urban8k',
        '--data-train', str(train_json),
        '--data-val', str(eval_json),
        '--exp-dir', str(exp_dir),
        '--label-csv', str(label_csv),
        '--n_class', '10',
        '--lr', str(lr),
        '--n-epochs', str(epochs),
        '--batch-size', str(batch_size),
        '--save_model', 'False',
        '--freqm', '24',
        '--timem', '96',
        '--mixup', '0',
        '--bal', 'none',
        '--tstride', '10', '--fstride', '10',
        '--imagenet_pretrain', 'True',
        '--audioset_pretrain', 'True',
        '--metrics', 'acc',
        '--loss', 'CE',
        '--warmup', 'False',
        '--lrscheduler_start', '5',
        '--lrscheduler_step', '1',
        '--lrscheduler_decay', '0.85',
        '--dataset_mean', str(DATASET_MEAN),
        '--dataset_std', str(DATASET_STD),
        '--audio_length', '1024',
        '--noise', 'False',
        '--num-workers', '4',
        '--n-print-steps', '50'
    ]
    env = os.environ.copy()
    # Hand the child the same TORCH_HOME this cell exported and printed.
    # The previous code used the stale TORCH_HOME variable from an earlier cell,
    # which points at a different directory.
    # NOTE(review): the upstream AST code may set TORCH_HOME itself — confirm.
    env['TORCH_HOME'] = str(PRETRAIN_DIR)
    print(f'Running fold {fold}:', ' '.join(cmd))
    # 'src/run.py' is a relative path, so the child must run from the repo root.
    subprocess.check_call(cmd, env=env, cwd=REPO_DIR)

print('Helper ready: run_fold(fold) to launch training')



TORCH_HOME: /kaggle/pretrained_models
Helper ready: run_fold(fold) to launch training


In [9]:
# Train and evaluate the selected folds; each one is held out as the test split once.
# NOTE: only folds 1-5 are run here. Use range(1, 11) for the full 10-fold protocol.
folds_to_run = list(range(1, 6))
for f in folds_to_run:
    run_fold(f)

# Aggregate best validation accuracy per fold (col 0 of result.csv)
fold_acc = []
for f in folds_to_run:
    res_path = REPO_DIR / 'urban8k_exp' / f'fold{f}' / 'result.csv'
    # A result file with a single row loads as a 1-D array; force 2-D so the
    # column slice below always works.
    arr = np.atleast_2d(np.loadtxt(res_path, delimiter=','))
    best_acc = float(arr[:, 0].max())
    fold_acc.append(best_acc)
    print(f'Fold {f}: best val acc {best_acc:.4f}')

mean_acc = float(np.mean(fold_acc))
std_acc = float(np.std(fold_acc))
# Report the number of folds actually run rather than a hard-coded "10-fold".
print(f'{len(folds_to_run)}-fold mean acc: {mean_acc:.4f} +/- {std_acc:.4f}')


Running fold 1: /usr/bin/python3 src/run.py --model ast --dataset urban8k --data-train /kaggle/working/ast/urban8k_data/datafiles/urban_train_data_1.json --data-val /kaggle/working/ast/urban8k_data/datafiles/urban_eval_data_1.json --exp-dir /kaggle/working/ast/urban8k_exp/fold1 --label-csv /kaggle/working/ast/urban8k_data/urban8k_class_labels_indices.csv --n_class 10 --lr 1e-05 --n-epochs 3 --batch-size 12 --save_model False --freqm 24 --timem 96 --mixup 0 --bal none --tstride 10 --fstride 10 --imagenet_pretrain True --audioset_pretrain True --metrics acc --loss CE --warmup False --lrscheduler_start 5 --lrscheduler_step 1 --lrscheduler_decay 0.85 --dataset_mean -1.7362523965764032 --dataset_std 3.2758855893221015 --audio_length 1024 --noise False --num-workers 4 --n-print-steps 50


  @autocast()
  scaler = GradScaler()


I am process 88, running on 22c3b1e9ac95: starting (Fri Dec 12 11:09:37 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold1
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.734

  with autocast():


Epoch: [1][50/655]	Per Sample Total Time 0.06713	Per Sample Data Time 0.00125	Per Sample DNN Time 0.06587	Train Loss 1.2545	
Epoch: [1][100/655]	Per Sample Total Time 0.06590	Per Sample Data Time 0.00065	Per Sample DNN Time 0.06525	Train Loss 0.9013	
Epoch: [1][150/655]	Per Sample Total Time 0.06645	Per Sample Data Time 0.00044	Per Sample DNN Time 0.06601	Train Loss 0.7116	
Epoch: [1][200/655]	Per Sample Total Time 0.06709	Per Sample Data Time 0.00034	Per Sample DNN Time 0.06675	Train Loss 0.6116	
Epoch: [1][250/655]	Per Sample Total Time 0.06733	Per Sample Data Time 0.00028	Per Sample DNN Time 0.06706	Train Loss 0.5353	
Epoch: [1][300/655]	Per Sample Total Time 0.06757	Per Sample Data Time 0.00024	Per Sample DNN Time 0.06733	Train Loss 0.4872	
Epoch: [1][350/655]	Per Sample Total Time 0.06768	Per Sample Data Time 0.00021	Per Sample DNN Time 0.06747	Train Loss 0.4447	
Epoch: [1][400/655]	Per Sample Total Time 0.06775	Per Sample Data Time 0.00018	Per Sample DNN Time 0.06757	Train Loss 0

  with autocast():


Epoch: [2][45/655]	Per Sample Total Time 0.06993	Per Sample Data Time 0.00142	Per Sample DNN Time 0.06851	Train Loss 0.1016	
Epoch: [2][95/655]	Per Sample Total Time 0.06893	Per Sample Data Time 0.00070	Per Sample DNN Time 0.06823	Train Loss 0.0848	
Epoch: [2][145/655]	Per Sample Total Time 0.06865	Per Sample Data Time 0.00047	Per Sample DNN Time 0.06818	Train Loss 0.0773	
Epoch: [2][195/655]	Per Sample Total Time 0.06851	Per Sample Data Time 0.00036	Per Sample DNN Time 0.06816	Train Loss 0.0772	
Epoch: [2][245/655]	Per Sample Total Time 0.06840	Per Sample Data Time 0.00029	Per Sample DNN Time 0.06811	Train Loss 0.0813	
Epoch: [2][295/655]	Per Sample Total Time 0.06831	Per Sample Data Time 0.00025	Per Sample DNN Time 0.06807	Train Loss 0.0816	
Epoch: [2][345/655]	Per Sample Total Time 0.06827	Per Sample Data Time 0.00021	Per Sample DNN Time 0.06806	Train Loss 0.0775	
Epoch: [2][395/655]	Per Sample Total Time 0.06822	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06803	Train Loss 0.

  with autocast():


Epoch: [3][40/655]	Per Sample Total Time 0.06937	Per Sample Data Time 0.00112	Per Sample DNN Time 0.06825	Train Loss 0.0522	
Epoch: [3][90/655]	Per Sample Total Time 0.06847	Per Sample Data Time 0.00052	Per Sample DNN Time 0.06795	Train Loss 0.0460	
Epoch: [3][140/655]	Per Sample Total Time 0.06816	Per Sample Data Time 0.00035	Per Sample DNN Time 0.06782	Train Loss 0.0431	
Epoch: [3][190/655]	Per Sample Total Time 0.06806	Per Sample Data Time 0.00026	Per Sample DNN Time 0.06779	Train Loss 0.0434	
Epoch: [3][240/655]	Per Sample Total Time 0.06800	Per Sample Data Time 0.00021	Per Sample DNN Time 0.06779	Train Loss 0.0428	
Epoch: [3][290/655]	Per Sample Total Time 0.06796	Per Sample Data Time 0.00018	Per Sample DNN Time 0.06778	Train Loss 0.0473	
Epoch: [3][340/655]	Per Sample Total Time 0.06793	Per Sample Data Time 0.00016	Per Sample DNN Time 0.06777	Train Loss 0.0466	
Epoch: [3][390/655]	Per Sample Total Time 0.06792	Per Sample Data Time 0.00014	Per Sample DNN Time 0.06777	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 4370, running on 22c3b1e9ac95: starting (Fri Dec 12 11:37:48 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold2
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.7

  with autocast():


Epoch: [1][50/654]	Per Sample Total Time 0.07348	Per Sample Data Time 0.00104	Per Sample DNN Time 0.07244	Train Loss 1.3355	
Epoch: [1][100/654]	Per Sample Total Time 0.07093	Per Sample Data Time 0.00054	Per Sample DNN Time 0.07039	Train Loss 0.9613	
Epoch: [1][150/654]	Per Sample Total Time 0.07027	Per Sample Data Time 0.00037	Per Sample DNN Time 0.06990	Train Loss 0.7775	
Epoch: [1][200/654]	Per Sample Total Time 0.06984	Per Sample Data Time 0.00029	Per Sample DNN Time 0.06955	Train Loss 0.6593	
Epoch: [1][250/654]	Per Sample Total Time 0.06960	Per Sample Data Time 0.00023	Per Sample DNN Time 0.06937	Train Loss 0.5756	
Epoch: [1][300/654]	Per Sample Total Time 0.06944	Per Sample Data Time 0.00020	Per Sample DNN Time 0.06924	Train Loss 0.5145	
Epoch: [1][350/654]	Per Sample Total Time 0.06928	Per Sample Data Time 0.00018	Per Sample DNN Time 0.06910	Train Loss 0.4700	
Epoch: [1][400/654]	Per Sample Total Time 0.06914	Per Sample Data Time 0.00016	Per Sample DNN Time 0.06898	Train Loss 0

  with autocast():


Epoch: [2][46/654]	Per Sample Total Time 0.06965	Per Sample Data Time 0.00122	Per Sample DNN Time 0.06843	Train Loss 0.0740	
Epoch: [2][96/654]	Per Sample Total Time 0.06887	Per Sample Data Time 0.00061	Per Sample DNN Time 0.06826	Train Loss 0.0648	
Epoch: [2][146/654]	Per Sample Total Time 0.06857	Per Sample Data Time 0.00041	Per Sample DNN Time 0.06816	Train Loss 0.0703	
Epoch: [2][196/654]	Per Sample Total Time 0.06852	Per Sample Data Time 0.00031	Per Sample DNN Time 0.06820	Train Loss 0.0703	
Epoch: [2][246/654]	Per Sample Total Time 0.06841	Per Sample Data Time 0.00026	Per Sample DNN Time 0.06816	Train Loss 0.0681	
Epoch: [2][296/654]	Per Sample Total Time 0.06833	Per Sample Data Time 0.00022	Per Sample DNN Time 0.06812	Train Loss 0.0720	
Epoch: [2][346/654]	Per Sample Total Time 0.06829	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06810	Train Loss 0.0715	
Epoch: [2][396/654]	Per Sample Total Time 0.06825	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06808	Train Loss 0.

  with autocast():


Epoch: [3][42/654]	Per Sample Total Time 0.06961	Per Sample Data Time 0.00146	Per Sample DNN Time 0.06815	Train Loss 0.0645	
Epoch: [3][92/654]	Per Sample Total Time 0.06864	Per Sample Data Time 0.00069	Per Sample DNN Time 0.06795	Train Loss 0.0564	
Epoch: [3][142/654]	Per Sample Total Time 0.06841	Per Sample Data Time 0.00046	Per Sample DNN Time 0.06795	Train Loss 0.0525	
Epoch: [3][192/654]	Per Sample Total Time 0.06826	Per Sample Data Time 0.00035	Per Sample DNN Time 0.06791	Train Loss 0.0536	
Epoch: [3][242/654]	Per Sample Total Time 0.06817	Per Sample Data Time 0.00028	Per Sample DNN Time 0.06789	Train Loss 0.0483	
Epoch: [3][292/654]	Per Sample Total Time 0.06810	Per Sample Data Time 0.00024	Per Sample DNN Time 0.06786	Train Loss 0.0473	
Epoch: [3][342/654]	Per Sample Total Time 0.06807	Per Sample Data Time 0.00021	Per Sample DNN Time 0.06786	Train Loss 0.0455	
Epoch: [3][392/654]	Per Sample Total Time 0.06805	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06787	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 8646, running on 22c3b1e9ac95: starting (Fri Dec 12 12:05:58 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold3
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.7

  with autocast():


Epoch: [1][50/651]	Per Sample Total Time 0.07346	Per Sample Data Time 0.00108	Per Sample DNN Time 0.07238	Train Loss 1.4164	
Epoch: [1][100/651]	Per Sample Total Time 0.07100	Per Sample Data Time 0.00056	Per Sample DNN Time 0.07044	Train Loss 0.9922	
Epoch: [1][150/651]	Per Sample Total Time 0.07037	Per Sample Data Time 0.00039	Per Sample DNN Time 0.06998	Train Loss 0.7733	
Epoch: [1][200/651]	Per Sample Total Time 0.06997	Per Sample Data Time 0.00030	Per Sample DNN Time 0.06967	Train Loss 0.6520	
Epoch: [1][250/651]	Per Sample Total Time 0.06975	Per Sample Data Time 0.00024	Per Sample DNN Time 0.06951	Train Loss 0.5817	
Epoch: [1][300/651]	Per Sample Total Time 0.06961	Per Sample Data Time 0.00021	Per Sample DNN Time 0.06940	Train Loss 0.5259	
Epoch: [1][350/651]	Per Sample Total Time 0.06950	Per Sample Data Time 0.00018	Per Sample DNN Time 0.06931	Train Loss 0.4773	
Epoch: [1][400/651]	Per Sample Total Time 0.06940	Per Sample Data Time 0.00016	Per Sample DNN Time 0.06923	Train Loss 0

  with autocast():


Epoch: [2][49/651]	Per Sample Total Time 0.06986	Per Sample Data Time 0.00102	Per Sample DNN Time 0.06883	Train Loss 0.0901	
Epoch: [2][99/651]	Per Sample Total Time 0.06914	Per Sample Data Time 0.00053	Per Sample DNN Time 0.06861	Train Loss 0.0968	
Epoch: [2][149/651]	Per Sample Total Time 0.06900	Per Sample Data Time 0.00036	Per Sample DNN Time 0.06864	Train Loss 0.0824	
Epoch: [2][199/651]	Per Sample Total Time 0.06886	Per Sample Data Time 0.00028	Per Sample DNN Time 0.06858	Train Loss 0.0762	
Epoch: [2][249/651]	Per Sample Total Time 0.06872	Per Sample Data Time 0.00023	Per Sample DNN Time 0.06849	Train Loss 0.0749	
Epoch: [2][299/651]	Per Sample Total Time 0.06867	Per Sample Data Time 0.00020	Per Sample DNN Time 0.06847	Train Loss 0.0676	
Epoch: [2][349/651]	Per Sample Total Time 0.06860	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06843	Train Loss 0.0702	
Epoch: [2][399/651]	Per Sample Total Time 0.06861	Per Sample Data Time 0.00015	Per Sample DNN Time 0.06846	Train Loss 0.

  with autocast():


Epoch: [3][48/651]	Per Sample Total Time 0.06966	Per Sample Data Time 0.00101	Per Sample DNN Time 0.06865	Train Loss 0.0562	
Epoch: [3][98/651]	Per Sample Total Time 0.06905	Per Sample Data Time 0.00052	Per Sample DNN Time 0.06853	Train Loss 0.0503	
Epoch: [3][148/651]	Per Sample Total Time 0.06873	Per Sample Data Time 0.00035	Per Sample DNN Time 0.06838	Train Loss 0.0448	
Epoch: [3][198/651]	Per Sample Total Time 0.06870	Per Sample Data Time 0.00027	Per Sample DNN Time 0.06842	Train Loss 0.0398	
Epoch: [3][248/651]	Per Sample Total Time 0.06859	Per Sample Data Time 0.00022	Per Sample DNN Time 0.06837	Train Loss 0.0423	
Epoch: [3][298/651]	Per Sample Total Time 0.06850	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06831	Train Loss 0.0443	
Epoch: [3][348/651]	Per Sample Total Time 0.06846	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06830	Train Loss 0.0478	
Epoch: [3][398/651]	Per Sample Total Time 0.06845	Per Sample Data Time 0.00015	Per Sample DNN Time 0.06830	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 12916, running on 22c3b1e9ac95: starting (Fri Dec 12 12:34:16 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold4
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.

  with autocast():


Epoch: [1][50/646]	Per Sample Total Time 0.07451	Per Sample Data Time 0.00099	Per Sample DNN Time 0.07351	Train Loss 1.3749	
Epoch: [1][100/646]	Per Sample Total Time 0.07168	Per Sample Data Time 0.00052	Per Sample DNN Time 0.07116	Train Loss 0.9473	
Epoch: [1][150/646]	Per Sample Total Time 0.07096	Per Sample Data Time 0.00036	Per Sample DNN Time 0.07061	Train Loss 0.7587	
Epoch: [1][200/646]	Per Sample Total Time 0.07051	Per Sample Data Time 0.00028	Per Sample DNN Time 0.07024	Train Loss 0.6484	
Epoch: [1][250/646]	Per Sample Total Time 0.07024	Per Sample Data Time 0.00023	Per Sample DNN Time 0.07002	Train Loss 0.5723	
Epoch: [1][300/646]	Per Sample Total Time 0.07007	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06987	Train Loss 0.5147	
Epoch: [1][350/646]	Per Sample Total Time 0.06993	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06976	Train Loss 0.4736	
Epoch: [1][400/646]	Per Sample Total Time 0.06982	Per Sample Data Time 0.00015	Per Sample DNN Time 0.06967	Train Loss 0

  with autocast():


Epoch: [2][4/646]	Per Sample Total Time 0.07943	Per Sample Data Time 0.00955	Per Sample DNN Time 0.06988	Train Loss 0.1141	
Epoch: [2][54/646]	Per Sample Total Time 0.06984	Per Sample Data Time 0.00090	Per Sample DNN Time 0.06894	Train Loss 0.1119	
Epoch: [2][104/646]	Per Sample Total Time 0.06930	Per Sample Data Time 0.00049	Per Sample DNN Time 0.06881	Train Loss 0.0970	
Epoch: [2][154/646]	Per Sample Total Time 0.06914	Per Sample Data Time 0.00034	Per Sample DNN Time 0.06880	Train Loss 0.0827	
Epoch: [2][204/646]	Per Sample Total Time 0.06902	Per Sample Data Time 0.00027	Per Sample DNN Time 0.06875	Train Loss 0.0872	
Epoch: [2][254/646]	Per Sample Total Time 0.06898	Per Sample Data Time 0.00022	Per Sample DNN Time 0.06875	Train Loss 0.0923	
Epoch: [2][304/646]	Per Sample Total Time 0.06891	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06872	Train Loss 0.0894	
Epoch: [2][354/646]	Per Sample Total Time 0.06885	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06868	Train Loss 0.0

  with autocast():


Epoch: [3][8/646]	Per Sample Total Time 0.07432	Per Sample Data Time 0.00553	Per Sample DNN Time 0.06878	Train Loss 0.0602	
Epoch: [3][58/646]	Per Sample Total Time 0.06959	Per Sample Data Time 0.00087	Per Sample DNN Time 0.06872	Train Loss 0.0571	
Epoch: [3][108/646]	Per Sample Total Time 0.06907	Per Sample Data Time 0.00048	Per Sample DNN Time 0.06859	Train Loss 0.0436	
Epoch: [3][158/646]	Per Sample Total Time 0.06880	Per Sample Data Time 0.00034	Per Sample DNN Time 0.06846	Train Loss 0.0412	
Epoch: [3][208/646]	Per Sample Total Time 0.06868	Per Sample Data Time 0.00027	Per Sample DNN Time 0.06841	Train Loss 0.0467	
Epoch: [3][258/646]	Per Sample Total Time 0.06862	Per Sample Data Time 0.00022	Per Sample DNN Time 0.06840	Train Loss 0.0472	
Epoch: [3][308/646]	Per Sample Total Time 0.06854	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06835	Train Loss 0.0538	
Epoch: [3][358/646]	Per Sample Total Time 0.06849	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06832	Train Loss 0.0

  @autocast()
  scaler = GradScaler()


I am process 17174, running on 22c3b1e9ac95: starting (Fri Dec 12 13:02:31 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold5
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.

  with autocast():


Epoch: [1][50/650]	Per Sample Total Time 0.07456	Per Sample Data Time 0.00099	Per Sample DNN Time 0.07357	Train Loss 1.3771	
Epoch: [1][100/650]	Per Sample Total Time 0.07179	Per Sample Data Time 0.00051	Per Sample DNN Time 0.07128	Train Loss 0.9386	
Epoch: [1][150/650]	Per Sample Total Time 0.07100	Per Sample Data Time 0.00035	Per Sample DNN Time 0.07065	Train Loss 0.7561	
Epoch: [1][200/650]	Per Sample Total Time 0.07054	Per Sample Data Time 0.00027	Per Sample DNN Time 0.07026	Train Loss 0.6493	
Epoch: [1][250/650]	Per Sample Total Time 0.07031	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07008	Train Loss 0.5625	
Epoch: [1][300/650]	Per Sample Total Time 0.07012	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06993	Train Loss 0.5053	
Epoch: [1][350/650]	Per Sample Total Time 0.06998	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06981	Train Loss 0.4618	
Epoch: [1][400/650]	Per Sample Total Time 0.06987	Per Sample Data Time 0.00015	Per Sample DNN Time 0.06972	Train Loss 0

  with autocast():


Epoch: [2][0/650]	Per Sample Total Time 0.12411	Per Sample Data Time 0.04890	Per Sample DNN Time 0.07521	Train Loss 0.0274	
Epoch: [2][50/650]	Per Sample Total Time 0.06988	Per Sample Data Time 0.00099	Per Sample DNN Time 0.06889	Train Loss 0.0760	
Epoch: [2][100/650]	Per Sample Total Time 0.06924	Per Sample Data Time 0.00051	Per Sample DNN Time 0.06872	Train Loss 0.0695	
Epoch: [2][150/650]	Per Sample Total Time 0.06910	Per Sample Data Time 0.00035	Per Sample DNN Time 0.06875	Train Loss 0.0798	
Epoch: [2][200/650]	Per Sample Total Time 0.06894	Per Sample Data Time 0.00027	Per Sample DNN Time 0.06867	Train Loss 0.0817	
Epoch: [2][250/650]	Per Sample Total Time 0.06887	Per Sample Data Time 0.00023	Per Sample DNN Time 0.06864	Train Loss 0.0805	
Epoch: [2][300/650]	Per Sample Total Time 0.06882	Per Sample Data Time 0.00019	Per Sample DNN Time 0.06863	Train Loss 0.0828	
Epoch: [2][350/650]	Per Sample Total Time 0.06875	Per Sample Data Time 0.00017	Per Sample DNN Time 0.06858	Train Loss 0.0

  with autocast():


Epoch: [3][0/650]	Per Sample Total Time 0.13092	Per Sample Data Time 0.05805	Per Sample DNN Time 0.07288	Train Loss 0.1105	
Epoch: [3][50/650]	Per Sample Total Time 0.06980	Per Sample Data Time 0.00117	Per Sample DNN Time 0.06863	Train Loss 0.0613	
Epoch: [3][100/650]	Per Sample Total Time 0.06882	Per Sample Data Time 0.00061	Per Sample DNN Time 0.06822	Train Loss 0.0603	
Epoch: [3][150/650]	Per Sample Total Time 0.06861	Per Sample Data Time 0.00041	Per Sample DNN Time 0.06819	Train Loss 0.0698	
Epoch: [3][200/650]	Per Sample Total Time 0.06847	Per Sample Data Time 0.00032	Per Sample DNN Time 0.06815	Train Loss 0.0622	
Epoch: [3][250/650]	Per Sample Total Time 0.06838	Per Sample Data Time 0.00026	Per Sample DNN Time 0.06812	Train Loss 0.0589	
Epoch: [3][300/650]	Per Sample Total Time 0.06828	Per Sample Data Time 0.00022	Per Sample DNN Time 0.06805	Train Loss 0.0545	
Epoch: [3][350/650]	Per Sample Total Time 0.06824	Per Sample Data Time 0.00020	Per Sample DNN Time 0.06804	Train Loss 0.0