In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("HF_TOKEN")
# from huggingface_hub import HfApi, login
# from kaggle_secrets import UserSecretsClient
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import sys
import subprocess
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torchaudio


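# AST fine-tuning on UrbanSound8K (Kaggle version)
# - Auto-detect UrbanSound8K in /kaggle/input
# - Clone the AST repo into /kaggle/working/ast
# - Start from the ImageNet-pretrained checkpoint (AudioSet pretraining is disabled in run_fold)
# - Fold-wise cross-validation over the dataset's 10 predefined folds (folds_to_run selects the folds actually trained)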



# 0. ENV INFO & SEED
print("Listing /kaggle/input ...")

# Seed every RNG the notebook touches with the same value so runs are repeatable.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


# 1. INSTALL REQUIRED LIBRARIES (if needed)
import gc # <--- Import thư viện dọn rác

def pip_install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    ``--no-cache-dir`` stops pip from keeping downloaded files in its cache and
    ``-q`` (quiet) cuts down the log output. ``subprocess.check_call`` raises
    ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    pip_cmd = [sys.executable, "-m", "pip", "install"]
    pip_cmd += ["--no-cache-dir", "-q", package]
    subprocess.check_call(pip_cmd)

print("[*INFO] Installing libraries...")
# timm==0.4.5 is the version the AST repo uses; the others are installed unpinned.
for _requirement in ("timm==0.4.5", "wget", "librosa"):
    pip_install(_requirement)


print("[*INFO] Cleaning up memory after installation...")
# Run a garbage-collection pass, then release cached CUDA memory if a GPU is present.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# ======================================================

import timm
import wget

# 2. CLONE THE AST REPO AND MAKE IT IMPORTABLE
REPO_DIR = "/kaggle/working/ast"

# Clone only when the target directory is missing, so re-running the cell is cheap.
if os.path.exists(REPO_DIR):
    print("[*INFO] AST repo already exists at", REPO_DIR)
else:
    print("[*INFO] Cloning AST repo ...")
    clone_cmd = ["git", "clone", "https://github.com/YuanGongND/ast", REPO_DIR]
    subprocess.check_call(clone_cmd)

# Put the repo on the import path and make it the working directory, so that
# `src.*` imports and repo-relative paths resolve.
sys.path.append(REPO_DIR)
os.chdir(REPO_DIR)

from src.models import ASTModel  


Listing /kaggle/input ...
Device: cuda
[*INFO] Installing libraries...
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 287.4/287.4 kB 9.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 363.4/363.4 MB 252.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.8/13.8 MB 171.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 102.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 883.7/883.7 kB 274.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 247.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 211.5/211.5 MB 255.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.3/56.3 MB 198.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 127.9/127.9 MB 156.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 MB 216.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 226.2 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.


[*INFO] Cleaning up memory after installation...
[*INFO] Cloning AST repo ...


Cloning into '/kaggle/working/ast'...
  @autocast()


In [None]:

# 3. LOCATE THE UrbanSound8K METADATA CSV
# Walk /kaggle/input and keep the first directory that contains UrbanSound8K.csv.
US8K_META_PATH = next(
    (
        os.path.join(dirpath, "UrbanSound8K.csv")
        for dirpath, _subdirs, filenames in os.walk("/kaggle/input")
        if "UrbanSound8K.csv" in filenames
    ),
    None,
)

# Fail early with a clear message when the dataset is not attached to the notebook.
if US8K_META_PATH is None:
    raise FileNotFoundError(
        "Không tìm thấy UrbanSound8K.csv trong /kaggle/input. "
        "Kiểm tra lại dataset UrbanSound8K đã add vào notebook."
    )

print("[*INFO] Found UrbanSound8K.csv at:", US8K_META_PATH)
METADATA_CSV = US8K_META_PATH

# 4. LOCATE AUDIO_ROOT BY PROBING FOR A REAL FILE

# Use the first metadata row as a probe: its file must sit in a folder named fold<N>.
df_meta = pd.read_csv(METADATA_CSV)
sample_row = df_meta.iloc[0]
sample_fname, sample_fold = sample_row["slice_file_name"], sample_row["fold"]
target_fold_dir = f"fold{sample_fold}"

print(f"[*INFO] Sample row -> fold={sample_fold}, fname={sample_fname}")

# A matching directory looks like <AUDIO_ROOT>/foldX, so AUDIO_ROOT is its parent.
AUDIO_ROOT = next(
    (
        os.path.dirname(dirpath)
        for dirpath, _subdirs, filenames in os.walk("/kaggle/input")
        if os.path.basename(dirpath) == target_fold_dir and sample_fname in filenames
    ),
    None,
)

if AUDIO_ROOT is None:
    raise FileNotFoundError(
        f"Không tìm thấy thư mục audio chứa fold{sample_fold} và file {sample_fname} trong /kaggle/input."
    )

print("[*INFO] AUDIO_ROOT detected as:", AUDIO_ROOT)

# Directory for saved checkpoints
CKPT_DIR = "/kaggle/working/ast_us8k_checkpoints"
os.makedirs(CKPT_DIR, exist_ok=True)

[*INFO] Found UrbanSound8K.csv at: /kaggle/input/urbansound8k/UrbanSound8K.csv
[*INFO] Sample row -> fold=5, fname=100032-3-0-0.wav
[*INFO] AUDIO_ROOT detected as: /kaggle/input/urbansound8k


In [3]:
# Quick sanity check: show the audio root directory detected by the previous cell.
print(AUDIO_ROOT)

/kaggle/input/urbansound8k


## New code: train AST on UrbanSound8K through the repo's `src/run.py`

In [4]:
from pathlib import Path
# Reuse REPO_DIR when an earlier cell defined it (converted to a Path);
# otherwise treat the current directory as the repo root.
if 'REPO_DIR' in globals():
    REPO_DIR = Path(REPO_DIR)
else:
    REPO_DIR = Path('.').resolve()

# Point TORCH_HOME at <repo>/pretrained_models and make sure the directory exists.
TORCH_HOME = REPO_DIR.joinpath('pretrained_models')
os.environ['TORCH_HOME'] = str(TORCH_HOME)
TORCH_HOME.mkdir(exist_ok=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Repo root:', REPO_DIR)
print('Device:', device)

Repo root: /kaggle/working/ast
Device: cuda


In [5]:
# Build the label CSV (index, mid, display_name): one line per class, ordered by classID
meta_df = pd.read_csv(METADATA_CSV)
label_csv = REPO_DIR / 'urban8k_data' / 'urban8k_class_labels_indices.csv'
label_csv.parent.mkdir(exist_ok=True)

unique_classes = meta_df[['classID', 'class']].drop_duplicates().sort_values('classID')
with open(label_csv, 'w') as f:
    f.write('index,mid,display_name\n')
    # Walk the two columns in parallel; "mid" is a synthetic code derived from the class ID.
    for class_id, class_name in zip(unique_classes['classID'], unique_classes['class']):
        idx = int(class_id)
        mid = f"/m/urban{idx:02d}"
        f.write(f"{idx},{mid},{class_name}\n")

print('Label CSV written to:', label_csv)


Label CSV written to: /kaggle/working/ast/urban8k_data/urban8k_class_labels_indices.csv


In [6]:
import json, shutil

# Directory that will hold the per-fold JSON datafiles (ESC-50 style)
data_dir = REPO_DIR.joinpath('urban8k_data', 'datafiles')
data_dir.mkdir(exist_ok=True, parents=True)

def rows_for_df(df):
    """Turn metadata rows into datafile entries.

    For every row of ``df`` (columns ``fold``, ``slice_file_name`` and
    ``classID`` are read) this produces a dict whose ``'wav'`` is the clip
    path ``AUDIO_ROOT/fold<fold>/<slice_file_name>`` and whose ``'labels'`` is
    the code ``/m/urbanXX`` built from the class ID. Entries keep the row
    order of ``df``; an empty frame gives an empty list.
    """
    columns = zip(df['fold'], df['slice_file_name'], df['classID'])
    return [
        {
            'wav': str(Path(AUDIO_ROOT) / f"fold{int(fold_id)}" / file_name),
            'labels': f"/m/urban{int(class_id):02d}",
        }
        for fold_id, file_name, class_id in columns
    ]

# For each fold: hold it out for evaluation, train on the remaining folds,
# and dump both splits as JSON datafiles.
for fold in range(1, 11):
    is_held_out = meta_df['fold'] == fold
    train_df, eval_df = meta_df[~is_held_out], meta_df[is_held_out]
    payloads = (
        (f'urban_train_data_{fold}.json', {'data': rows_for_df(train_df)}),
        (f'urban_eval_data_{fold}.json', {'data': rows_for_df(eval_df)}),
    )
    for json_name, payload in payloads:
        with open(data_dir / json_name, 'w') as f:
            json.dump(payload, f, indent=1)
    print(f'Fold {fold}: train {len(train_df)}, eval {len(eval_df)}')


Fold 1: train 7859, eval 873
Fold 2: train 7844, eval 888
Fold 3: train 7807, eval 925
Fold 4: train 7742, eval 990
Fold 5: train 7796, eval 936
Fold 6: train 7909, eval 823
Fold 7: train 7894, eval 838
Fold 8: train 7926, eval 806
Fold 9: train 7916, eval 816
Fold 10: train 7895, eval 837


In [None]:
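# Compute dataset mean/std over fbank features (optional; if DATASET_MEAN/DATASET_STD are not already defined, the hard-coded fallback values below are used — their provenance is not recorded here, TODO confirm)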
import torchaudio

def compute_norm_stats(df, target_length=1024, mel_bins=128, target_sr=16000, max_files=None):
    """Estimate the mean and std of filterbank values over a set of clips.

    Each clip listed in ``df`` is loaded from
    ``AUDIO_ROOT/fold<fold>/<slice_file_name>``, averaged over its first
    dimension when that has more than one entry (down-mix to mono), resampled
    to ``target_sr`` when needed, converted with
    ``torchaudio.compliance.kaldi.fbank`` and zero-padded or cropped to
    ``target_length`` frames. Statistics cover every resulting value, padding
    included.

    Args:
        df: Metadata frame with ``fold`` and ``slice_file_name`` columns.
        target_length: Number of frames each fbank is padded/cropped to.
        mel_bins: Number of mel bins requested from the fbank.
        target_sr: Sample rate clips are resampled to.
        max_files: If given, use a random sample of at most this many rows
            (seeded with ``SEED``) instead of the whole frame.

    Returns:
        ``(mean, std)`` as Python floats.

    Raises:
        ValueError: If no values were accumulated (e.g. ``df`` is empty), so
            no statistics can be computed.
    """
    if max_files is None:
        rows = df
    else:
        # Cap at len(df): DataFrame.sample raises when asked for more rows than exist.
        rows = df.sample(n=min(max_files, len(df)), random_state=SEED)

    resamplers = {}  # source sample rate -> Resample transform, built once per rate
    total = 0.0
    total_sq = 0.0
    count = 0
    for fold_id, file_name in zip(rows['fold'], rows['slice_file_name']):
        wav_path = Path(AUDIO_ROOT) / f"fold{int(fold_id)}" / file_name
        wav, sr = torchaudio.load(str(wav_path))
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(sr, target_sr)
            wav = resamplers[sr](wav)
            sr = target_sr
        fb = torchaudio.compliance.kaldi.fbank(
            wav, htk_compat=True, sample_frequency=sr, use_energy=False,
            window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10
        )
        # Bring every clip to exactly target_length frames.
        pad_frames = target_length - fb.shape[0]
        if pad_frames > 0:
            fb = torch.nn.functional.pad(fb, (0, 0, 0, pad_frames))
        elif pad_frames < 0:
            fb = fb[:target_length, :]
        total += fb.sum().item()
        total_sq += (fb ** 2).sum().item()
        count += fb.numel()

    if count == 0:
        raise ValueError("compute_norm_stats: no audio frames to compute statistics from")

    mean = total / count
    # Rounding can push E[x^2] - mean^2 slightly below zero; a negative float
    # raised to 0.5 yields a complex number, so clamp before the square root.
    var = max(total_sq / count - mean ** 2, 0.0)
    std = var ** 0.5
    return mean, std


# Keep DATASET_MEAN/DATASET_STD when both already exist in the session;
# otherwise fall back to the hard-coded normalization values.
if 'DATASET_MEAN' not in globals() or 'DATASET_STD' not in globals():
    DATASET_MEAN = -1.7362523965764032
    DATASET_STD = 3.2758855893221015
print('Using mean/std:', DATASET_MEAN, DATASET_STD)


Using mean/std: -1.7362523965764032 3.2758855893221015


In [8]:
# Reuse a REPO_DIR defined by an earlier cell instead of rebinding it to whatever
# the current working directory happens to be; fall back to the cwd only when
# no REPO_DIR exists yet.
REPO_DIR = Path(globals().get('REPO_DIR', '.')).resolve()
# NOTE(review): '../../pretrained_models' appears to mirror the relative path the
# training script downloads to (see the "Downloading ... to ../../pretrained_models"
# log line in the training output) — confirm.
PRETRAIN_DIR = (REPO_DIR / '../../pretrained_models').resolve()
PRETRAIN_DIR.mkdir(parents=True, exist_ok=True)
os.environ['TORCH_HOME'] = str(PRETRAIN_DIR)
print('TORCH_HOME:', PRETRAIN_DIR)
# Helper to launch training via src/run.py with ESC-50-like hyperparameters
def run_fold(fold, epochs=3, batch_size=12, lr=1e-5):
    """Train and evaluate on one fold by running ``src/run.py`` in a subprocess.

    The fold's experiment directory (``REPO_DIR/urban8k_exp/fold<fold>``) is
    deleted and recreated first, so results from a previous run are discarded.
    The script is started with ``REPO_DIR`` as its working directory.

    Args:
        fold: Fold number; selects ``urban_train_data_<fold>.json`` and
            ``urban_eval_data_<fold>.json`` inside ``data_dir``.
        epochs: Passed to the script as ``--n-epochs``.
        batch_size: Passed to the script as ``--batch-size``.
        lr: Passed to the script as ``--lr``.

    Raises:
        subprocess.CalledProcessError: If the training script exits non-zero.
    """
    exp_dir = REPO_DIR / 'urban8k_exp' / f'fold{fold}'
    if exp_dir.exists():
        shutil.rmtree(exp_dir)
    exp_dir.mkdir(parents=True, exist_ok=True)
    train_json = data_dir / f'urban_train_data_{fold}.json'
    eval_json = data_dir / f'urban_eval_data_{fold}.json'

    cmd = [
        sys.executable, 'src/run.py',
        '--model', 'ast',
        '--dataset', 'urban8k',
        '--data-train', str(train_json),
        '--data-val', str(eval_json),
        '--exp-dir', str(exp_dir),
        '--label-csv', str(label_csv),
        '--n_class', '10',
        '--lr', str(lr),
        '--n-epochs', str(epochs),
        '--batch-size', str(batch_size),
        '--save_model', 'False',
        '--freqm', '24',
        '--timem', '96',
        '--mixup', '0',
        '--bal', 'none',
        '--tstride', '10', '--fstride', '10',
        '--imagenet_pretrain', 'True',
        '--audioset_pretrain', 'False',
        '--metrics', 'acc',
        '--loss', 'CE',
        '--warmup', 'False',
        # NOTE(review): presumably the epoch at which LR decay begins; with the
        # default epochs=3 it would never be reached — confirm.
        '--lrscheduler_start', '5',
        '--lrscheduler_step', '1',
        '--lrscheduler_decay', '0.85',
        '--dataset_mean', str(DATASET_MEAN),
        '--dataset_std', str(DATASET_STD),
        '--audio_length', '1024',
        '--noise', 'False',
        '--num-workers', '4',
        '--n-print-steps', '50'
    ]
    env = os.environ.copy()
    # Export the same cache directory this cell configured and printed
    # (PRETRAIN_DIR), not the TORCH_HOME variable left over from an earlier cell.
    env['TORCH_HOME'] = str(PRETRAIN_DIR)
    print(f'Running fold {fold}:', ' '.join(cmd))
    subprocess.check_call(cmd, env=env, cwd=REPO_DIR)

# Only announces that run_fold is defined; no training is started by this line.
print('Helper ready: run_fold(fold) to launch training')



TORCH_HOME: /kaggle/pretrained_models
Helper ready: run_fold(fold) to launch training


In [9]:
# Train and evaluate the selected folds; each selected fold is held out as the test fold once.
# Only folds 1-5 are run here; use list(range(1, 11)) for the full 10-fold protocol.
folds_to_run = list(range(1, 6))
for f in folds_to_run:
    run_fold(f)

# Aggregate best validation accuracy per fold (col 0 of result.csv)
fold_acc = []
for f in folds_to_run:
    res_path = REPO_DIR / 'urban8k_exp' / f'fold{f}' / 'result.csv'
    arr = np.loadtxt(res_path, delimiter=',')
    # A file with a single row loads as 1-D; promote it so column indexing works.
    if arr.ndim == 1:
        arr = arr[None, :]
    best_acc = float(arr[:, 0].max())
    fold_acc.append(best_acc)
    print(f'Fold {f}: best val acc {best_acc:.4f}')

mean_acc = float(np.mean(fold_acc))
std_acc = float(np.std(fold_acc))
# Report the number of folds actually evaluated instead of a hard-coded "10".
print(f'{len(folds_to_run)}-fold mean acc: {mean_acc:.4f} +/- {std_acc:.4f}')


Running fold 1: /usr/bin/python3 src/run.py --model ast --dataset urban8k --data-train /kaggle/working/ast/urban8k_data/datafiles/urban_train_data_1.json --data-val /kaggle/working/ast/urban8k_data/datafiles/urban_eval_data_1.json --exp-dir /kaggle/working/ast/urban8k_exp/fold1 --label-csv /kaggle/working/ast/urban8k_data/urban8k_class_labels_indices.csv --n_class 10 --lr 1e-05 --n-epochs 3 --batch-size 12 --save_model False --freqm 24 --timem 96 --mixup 0 --bal none --tstride 10 --fstride 10 --imagenet_pretrain True --audioset_pretrain False --metrics acc --loss CE --warmup False --lrscheduler_start 5 --lrscheduler_step 1 --lrscheduler_decay 0.85 --dataset_mean -1.7362523965764032 --dataset_std 3.2758855893221015 --audio_length 1024 --noise False --num-workers 4 --n-print-steps 50


  @autocast()
Downloading: "https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth" to ../../pretrained_models/hub/checkpoints/deit_base_distilled_patch16_384-d0272ac0.pth
  scaler = GradScaler()


I am process 88, running on b3680011ca52: starting (Fri Dec 12 12:02:04 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold1
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.73

  with autocast():


Epoch: [1][50/655]	Per Sample Total Time 0.06889	Per Sample Data Time 0.00141	Per Sample DNN Time 0.06748	Train Loss 1.8805	
Epoch: [1][100/655]	Per Sample Total Time 0.06818	Per Sample Data Time 0.00073	Per Sample DNN Time 0.06746	Train Loss 1.5086	
Epoch: [1][150/655]	Per Sample Total Time 0.06980	Per Sample Data Time 0.00050	Per Sample DNN Time 0.06930	Train Loss 1.2702	
Epoch: [1][200/655]	Per Sample Total Time 0.07152	Per Sample Data Time 0.00038	Per Sample DNN Time 0.07114	Train Loss 1.1106	
Epoch: [1][250/655]	Per Sample Total Time 0.07217	Per Sample Data Time 0.00031	Per Sample DNN Time 0.07186	Train Loss 1.0002	
Epoch: [1][300/655]	Per Sample Total Time 0.07269	Per Sample Data Time 0.00027	Per Sample DNN Time 0.07242	Train Loss 0.9039	
Epoch: [1][350/655]	Per Sample Total Time 0.07295	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07272	Train Loss 0.8427	
Epoch: [1][400/655]	Per Sample Total Time 0.07318	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07297	Train Loss 0

  with autocast():


Epoch: [2][45/655]	Per Sample Total Time 0.07562	Per Sample Data Time 0.00115	Per Sample DNN Time 0.07447	Train Loss 0.2153	
Epoch: [2][95/655]	Per Sample Total Time 0.07487	Per Sample Data Time 0.00057	Per Sample DNN Time 0.07430	Train Loss 0.2227	
Epoch: [2][145/655]	Per Sample Total Time 0.07478	Per Sample Data Time 0.00039	Per Sample DNN Time 0.07439	Train Loss 0.2120	
Epoch: [2][195/655]	Per Sample Total Time 0.07465	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07435	Train Loss 0.1982	
Epoch: [2][245/655]	Per Sample Total Time 0.07457	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07433	Train Loss 0.1901	
Epoch: [2][295/655]	Per Sample Total Time 0.07454	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07433	Train Loss 0.1910	
Epoch: [2][345/655]	Per Sample Total Time 0.07451	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07433	Train Loss 0.1851	
Epoch: [2][395/655]	Per Sample Total Time 0.07448	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07431	Train Loss 0.

  with autocast():


Epoch: [3][40/655]	Per Sample Total Time 0.07588	Per Sample Data Time 0.00127	Per Sample DNN Time 0.07461	Train Loss 0.0932	
Epoch: [3][90/655]	Per Sample Total Time 0.07469	Per Sample Data Time 0.00059	Per Sample DNN Time 0.07410	Train Loss 0.0901	
Epoch: [3][140/655]	Per Sample Total Time 0.07439	Per Sample Data Time 0.00039	Per Sample DNN Time 0.07399	Train Loss 0.0953	
Epoch: [3][190/655]	Per Sample Total Time 0.07433	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07403	Train Loss 0.0938	
Epoch: [3][240/655]	Per Sample Total Time 0.07430	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07406	Train Loss 0.0907	
Epoch: [3][290/655]	Per Sample Total Time 0.07428	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07407	Train Loss 0.0942	
Epoch: [3][340/655]	Per Sample Total Time 0.07426	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07408	Train Loss 0.0961	
Epoch: [3][390/655]	Per Sample Total Time 0.07425	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07409	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 4371, running on b3680011ca52: starting (Fri Dec 12 12:32:41 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold2
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.

  with autocast():


Epoch: [1][50/654]	Per Sample Total Time 0.07961	Per Sample Data Time 0.00112	Per Sample DNN Time 0.07850	Train Loss 1.8145	
Epoch: [1][100/654]	Per Sample Total Time 0.07705	Per Sample Data Time 0.00058	Per Sample DNN Time 0.07647	Train Loss 1.4486	
Epoch: [1][150/654]	Per Sample Total Time 0.07652	Per Sample Data Time 0.00040	Per Sample DNN Time 0.07612	Train Loss 1.2559	
Epoch: [1][200/654]	Per Sample Total Time 0.07604	Per Sample Data Time 0.00031	Per Sample DNN Time 0.07573	Train Loss 1.1082	
Epoch: [1][250/654]	Per Sample Total Time 0.07586	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07560	Train Loss 1.0011	
Epoch: [1][300/654]	Per Sample Total Time 0.07568	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07547	Train Loss 0.9066	
Epoch: [1][350/654]	Per Sample Total Time 0.07557	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07538	Train Loss 0.8389	
Epoch: [1][400/654]	Per Sample Total Time 0.07549	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07532	Train Loss 0

  with autocast():


Epoch: [2][46/654]	Per Sample Total Time 0.07592	Per Sample Data Time 0.00106	Per Sample DNN Time 0.07486	Train Loss 0.2260	
Epoch: [2][96/654]	Per Sample Total Time 0.07503	Per Sample Data Time 0.00053	Per Sample DNN Time 0.07449	Train Loss 0.2051	
Epoch: [2][146/654]	Per Sample Total Time 0.07494	Per Sample Data Time 0.00037	Per Sample DNN Time 0.07457	Train Loss 0.2089	
Epoch: [2][196/654]	Per Sample Total Time 0.07484	Per Sample Data Time 0.00028	Per Sample DNN Time 0.07456	Train Loss 0.2052	
Epoch: [2][246/654]	Per Sample Total Time 0.07476	Per Sample Data Time 0.00023	Per Sample DNN Time 0.07453	Train Loss 0.1956	
Epoch: [2][296/654]	Per Sample Total Time 0.07471	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07452	Train Loss 0.1924	
Epoch: [2][346/654]	Per Sample Total Time 0.07471	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07454	Train Loss 0.1942	
Epoch: [2][396/654]	Per Sample Total Time 0.07468	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07452	Train Loss 0.

  with autocast():


Epoch: [3][42/654]	Per Sample Total Time 0.07552	Per Sample Data Time 0.00125	Per Sample DNN Time 0.07427	Train Loss 0.1028	
Epoch: [3][92/654]	Per Sample Total Time 0.07485	Per Sample Data Time 0.00060	Per Sample DNN Time 0.07425	Train Loss 0.0960	
Epoch: [3][142/654]	Per Sample Total Time 0.07458	Per Sample Data Time 0.00040	Per Sample DNN Time 0.07418	Train Loss 0.0936	
Epoch: [3][192/654]	Per Sample Total Time 0.07458	Per Sample Data Time 0.00031	Per Sample DNN Time 0.07428	Train Loss 0.1041	
Epoch: [3][242/654]	Per Sample Total Time 0.07453	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07428	Train Loss 0.1119	
Epoch: [3][292/654]	Per Sample Total Time 0.07450	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07429	Train Loss 0.1076	
Epoch: [3][342/654]	Per Sample Total Time 0.07449	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07430	Train Loss 0.1040	
Epoch: [3][392/654]	Per Sample Total Time 0.07446	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07429	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 8647, running on b3680011ca52: starting (Fri Dec 12 13:03:32 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold3
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.

  with autocast():


Epoch: [1][50/651]	Per Sample Total Time 0.07978	Per Sample Data Time 0.00098	Per Sample DNN Time 0.07880	Train Loss 1.7638	
Epoch: [1][100/651]	Per Sample Total Time 0.07692	Per Sample Data Time 0.00051	Per Sample DNN Time 0.07641	Train Loss 1.4255	
Epoch: [1][150/651]	Per Sample Total Time 0.07647	Per Sample Data Time 0.00035	Per Sample DNN Time 0.07612	Train Loss 1.2187	
Epoch: [1][200/651]	Per Sample Total Time 0.07596	Per Sample Data Time 0.00027	Per Sample DNN Time 0.07569	Train Loss 1.0632	
Epoch: [1][250/651]	Per Sample Total Time 0.07578	Per Sample Data Time 0.00023	Per Sample DNN Time 0.07555	Train Loss 0.9546	
Epoch: [1][300/651]	Per Sample Total Time 0.07559	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07540	Train Loss 0.8681	
Epoch: [1][350/651]	Per Sample Total Time 0.07549	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07532	Train Loss 0.8142	
Epoch: [1][400/651]	Per Sample Total Time 0.07539	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07523	Train Loss 0

  with autocast():


Epoch: [2][49/651]	Per Sample Total Time 0.07559	Per Sample Data Time 0.00101	Per Sample DNN Time 0.07458	Train Loss 0.1568	
Epoch: [2][99/651]	Per Sample Total Time 0.07512	Per Sample Data Time 0.00052	Per Sample DNN Time 0.07460	Train Loss 0.1780	
Epoch: [2][149/651]	Per Sample Total Time 0.07496	Per Sample Data Time 0.00036	Per Sample DNN Time 0.07460	Train Loss 0.1887	
Epoch: [2][199/651]	Per Sample Total Time 0.07480	Per Sample Data Time 0.00028	Per Sample DNN Time 0.07452	Train Loss 0.2003	
Epoch: [2][249/651]	Per Sample Total Time 0.07474	Per Sample Data Time 0.00023	Per Sample DNN Time 0.07451	Train Loss 0.1896	
Epoch: [2][299/651]	Per Sample Total Time 0.07468	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07448	Train Loss 0.1955	
Epoch: [2][349/651]	Per Sample Total Time 0.07465	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07447	Train Loss 0.1969	
Epoch: [2][399/651]	Per Sample Total Time 0.07462	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07446	Train Loss 0.

  with autocast():


Epoch: [3][48/651]	Per Sample Total Time 0.07524	Per Sample Data Time 0.00113	Per Sample DNN Time 0.07411	Train Loss 0.0705	
Epoch: [3][98/651]	Per Sample Total Time 0.07456	Per Sample Data Time 0.00058	Per Sample DNN Time 0.07399	Train Loss 0.0872	
Epoch: [3][148/651]	Per Sample Total Time 0.07437	Per Sample Data Time 0.00040	Per Sample DNN Time 0.07398	Train Loss 0.0874	
Epoch: [3][198/651]	Per Sample Total Time 0.07438	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07408	Train Loss 0.0823	
Epoch: [3][248/651]	Per Sample Total Time 0.07435	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07410	Train Loss 0.0857	
Epoch: [3][298/651]	Per Sample Total Time 0.07434	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07412	Train Loss 0.0947	
Epoch: [3][348/651]	Per Sample Total Time 0.07430	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07412	Train Loss 0.0992	
Epoch: [3][398/651]	Per Sample Total Time 0.07427	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07410	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 12917, running on b3680011ca52: starting (Fri Dec 12 13:34:15 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold4
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87

  with autocast():


Epoch: [1][50/646]	Per Sample Total Time 0.08019	Per Sample Data Time 0.00097	Per Sample DNN Time 0.07922	Train Loss 1.8138	
Epoch: [1][100/646]	Per Sample Total Time 0.07711	Per Sample Data Time 0.00051	Per Sample DNN Time 0.07661	Train Loss 1.4509	
Epoch: [1][150/646]	Per Sample Total Time 0.07655	Per Sample Data Time 0.00035	Per Sample DNN Time 0.07620	Train Loss 1.2276	
Epoch: [1][200/646]	Per Sample Total Time 0.07606	Per Sample Data Time 0.00027	Per Sample DNN Time 0.07578	Train Loss 1.0903	
Epoch: [1][250/646]	Per Sample Total Time 0.07587	Per Sample Data Time 0.00023	Per Sample DNN Time 0.07564	Train Loss 0.9795	
Epoch: [1][300/646]	Per Sample Total Time 0.07570	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07550	Train Loss 0.8967	
Epoch: [1][350/646]	Per Sample Total Time 0.07555	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07538	Train Loss 0.8265	
Epoch: [1][400/646]	Per Sample Total Time 0.07544	Per Sample Data Time 0.00015	Per Sample DNN Time 0.07529	Train Loss 0

  with autocast():


Epoch: [2][4/646]	Per Sample Total Time 0.08559	Per Sample Data Time 0.01106	Per Sample DNN Time 0.07453	Train Loss 0.1916	
Epoch: [2][54/646]	Per Sample Total Time 0.07546	Per Sample Data Time 0.00104	Per Sample DNN Time 0.07442	Train Loss 0.1824	
Epoch: [2][104/646]	Per Sample Total Time 0.07479	Per Sample Data Time 0.00056	Per Sample DNN Time 0.07423	Train Loss 0.1903	
Epoch: [2][154/646]	Per Sample Total Time 0.07476	Per Sample Data Time 0.00039	Per Sample DNN Time 0.07436	Train Loss 0.1884	
Epoch: [2][204/646]	Per Sample Total Time 0.07464	Per Sample Data Time 0.00031	Per Sample DNN Time 0.07433	Train Loss 0.1875	
Epoch: [2][254/646]	Per Sample Total Time 0.07459	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07433	Train Loss 0.1908	
Epoch: [2][304/646]	Per Sample Total Time 0.07457	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07435	Train Loss 0.1850	
Epoch: [2][354/646]	Per Sample Total Time 0.07455	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07435	Train Loss 0.1

  with autocast():


Epoch: [3][8/646]	Per Sample Total Time 0.08134	Per Sample Data Time 0.00729	Per Sample DNN Time 0.07405	Train Loss 0.1347	
Epoch: [3][58/646]	Per Sample Total Time 0.07526	Per Sample Data Time 0.00114	Per Sample DNN Time 0.07412	Train Loss 0.1061	
Epoch: [3][108/646]	Per Sample Total Time 0.07469	Per Sample Data Time 0.00064	Per Sample DNN Time 0.07405	Train Loss 0.1006	
Epoch: [3][158/646]	Per Sample Total Time 0.07451	Per Sample Data Time 0.00045	Per Sample DNN Time 0.07406	Train Loss 0.1008	
Epoch: [3][208/646]	Per Sample Total Time 0.07438	Per Sample Data Time 0.00035	Per Sample DNN Time 0.07403	Train Loss 0.1027	
Epoch: [3][258/646]	Per Sample Total Time 0.07436	Per Sample Data Time 0.00029	Per Sample DNN Time 0.07407	Train Loss 0.0960	
Epoch: [3][308/646]	Per Sample Total Time 0.07432	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07408	Train Loss 0.0965	
Epoch: [3][358/646]	Per Sample Total Time 0.07428	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07407	Train Loss 0.0

  @autocast()
  scaler = GradScaler()


I am process 17175, running on b3680011ca52: starting (Fri Dec 12 14:04:49 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold5
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87

  with autocast():


Epoch: [1][50/650]	Per Sample Total Time 0.08000	Per Sample Data Time 0.00087	Per Sample DNN Time 0.07913	Train Loss 1.9684	
Epoch: [1][100/650]	Per Sample Total Time 0.07703	Per Sample Data Time 0.00046	Per Sample DNN Time 0.07657	Train Loss 1.6101	
Epoch: [1][150/650]	Per Sample Total Time 0.07653	Per Sample Data Time 0.00032	Per Sample DNN Time 0.07622	Train Loss 1.3421	
Epoch: [1][200/650]	Per Sample Total Time 0.07604	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07579	Train Loss 1.2039	
Epoch: [1][250/650]	Per Sample Total Time 0.07587	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07566	Train Loss 1.0760	
Epoch: [1][300/650]	Per Sample Total Time 0.07567	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07549	Train Loss 0.9864	
Epoch: [1][350/650]	Per Sample Total Time 0.07555	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07540	Train Loss 0.9217	
Epoch: [1][400/650]	Per Sample Total Time 0.07544	Per Sample Data Time 0.00014	Per Sample DNN Time 0.07530	Train Loss 0

  with autocast():


Epoch: [2][0/650]	Per Sample Total Time 0.13208	Per Sample Data Time 0.05388	Per Sample DNN Time 0.07820	Train Loss 0.1619	
Epoch: [2][50/650]	Per Sample Total Time 0.07542	Per Sample Data Time 0.00109	Per Sample DNN Time 0.07432	Train Loss 0.2406	
Epoch: [2][100/650]	Per Sample Total Time 0.07497	Per Sample Data Time 0.00057	Per Sample DNN Time 0.07440	Train Loss 0.2342	
Epoch: [2][150/650]	Per Sample Total Time 0.07476	Per Sample Data Time 0.00039	Per Sample DNN Time 0.07437	Train Loss 0.2343	
Epoch: [2][200/650]	Per Sample Total Time 0.07466	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07435	Train Loss 0.2391	
Epoch: [2][250/650]	Per Sample Total Time 0.07460	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07435	Train Loss 0.2378	
Epoch: [2][300/650]	Per Sample Total Time 0.07456	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07435	Train Loss 0.2368	
Epoch: [2][350/650]	Per Sample Total Time 0.07452	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07433	Train Loss 0.2

  with autocast():


Epoch: [3][0/650]	Per Sample Total Time 0.13107	Per Sample Data Time 0.05024	Per Sample DNN Time 0.08083	Train Loss 0.1716	
Epoch: [3][50/650]	Per Sample Total Time 0.07549	Per Sample Data Time 0.00102	Per Sample DNN Time 0.07447	Train Loss 0.1416	
Epoch: [3][100/650]	Per Sample Total Time 0.07455	Per Sample Data Time 0.00054	Per Sample DNN Time 0.07402	Train Loss 0.1398	
Epoch: [3][150/650]	Per Sample Total Time 0.07446	Per Sample Data Time 0.00037	Per Sample DNN Time 0.07409	Train Loss 0.1342	
Epoch: [3][200/650]	Per Sample Total Time 0.07438	Per Sample Data Time 0.00029	Per Sample DNN Time 0.07409	Train Loss 0.1275	
Epoch: [3][250/650]	Per Sample Total Time 0.07440	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07417	Train Loss 0.1212	
Epoch: [3][300/650]	Per Sample Total Time 0.07439	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07418	Train Loss 0.1221	
Epoch: [3][350/650]	Per Sample Total Time 0.07436	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07418	Train Loss 0.1