In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("HF_TOKEN")
# from huggingface_hub import HfApi, login
# from kaggle_secrets import UserSecretsClient
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import sys
import subprocess
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torchaudio


# AST Fine-tuning on UrbanSound8K (Kaggle version)
# - Auto-detect UrbanSound8K in /kaggle/input
# - Clone AST repo into /kaggle/working/ast
# - Download AudioSet checkpoint
# - 10-fold cross-validation training



# 0. Environment banner, RNG seeding and device selection
print("Listing /kaggle/input ...")

# A single seed is fed to every random number generator used in the notebook:
# Python's `random`, NumPy, and PyTorch (CPU generator and all CUDA devices).
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(SEED)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)



# 1. INSTALL REQUIRED LIBRARIES (if needed)
import gc # <--- Import thư viện dọn rác

def pip_install(package):
    """Install `package` with pip, using the interpreter that runs this notebook.

    The install happens in a child process (`<python> -m pip install ...`);
    `subprocess.check_call` raises `CalledProcessError` if pip exits non-zero.
    """
    pip_command = [sys.executable, "-m", "pip", "install"]
    # --no-cache-dir: do not keep downloaded files in pip's cache.
    # -q (quiet): print less log output.
    pip_command += ["--no-cache-dir", "-q"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

print("[*INFO] Installing libraries...")
# timm is pinned to 0.4.5, the version the AST repo uses.
for required_package in ("timm==0.4.5", "wget", "librosa"):
    pip_install(required_package)

print("[*INFO] Cleaning up memory after installation...")
# Run the garbage collector, then release PyTorch's cached CUDA memory (GPU only).
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# ======================================================

import timm
import wget

# 2. Clone the AST repo and import ASTModel
REPO_DIR = "/kaggle/working/ast"

if not os.path.exists(REPO_DIR):
    print("[*INFO] Cloning AST repo ...")
    subprocess.check_call(["git", "clone", "https://github.com/YuanGongND/ast", REPO_DIR])
else:
    print("[*INFO] AST repo already exists at", REPO_DIR)

# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)
# The working directory becomes the repo root; later cells rely on it for
# relative paths such as Path('.') and 'src/run.py'.
os.chdir(REPO_DIR)

from src.models import ASTModel  # importable only once REPO_DIR is on sys.path


Listing /kaggle/input ...
Device: cuda
[*INFO] Installing libraries...
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 287.4/287.4 kB 11.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 363.4/363.4 MB 277.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.8/13.8 MB 276.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 270.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 883.7/883.7 kB 392.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 300.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 211.5/211.5 MB 308.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.3/56.3 MB 299.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 127.9/127.9 MB 299.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 MB 242.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 301.7 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.


[*INFO] Cleaning up memory after installation...
[*INFO] Cloning AST repo ...


Cloning into '/kaggle/working/ast'...
  @autocast()


In [None]:

# 3. Locate the UrbanSound8K metadata CSV
US8K_META_PATH = None
for root, dirs, files in os.walk("/kaggle/input"):
    if "UrbanSound8K.csv" not in files:
        continue
    # The first directory that contains the CSV wins.
    US8K_META_PATH = os.path.join(root, "UrbanSound8K.csv")
    break
else:
    # The whole tree was walked without a hit.
    raise FileNotFoundError(
        "Không tìm thấy UrbanSound8K.csv trong /kaggle/input. "
        "Kiểm tra lại dataset UrbanSound8K đã add vào notebook."
    )

METADATA_CSV = US8K_META_PATH
print("[*INFO] Found UrbanSound8K.csv at:", US8K_META_PATH)

# 4. Pin down AUDIO_ROOT by locating a real audio file on disk
df_meta = pd.read_csv(METADATA_CSV)
# The first metadata row is the probe: its file is searched for inside a
# directory named fold<N>.
sample_row = df_meta.iloc[0]
sample_fold = sample_row["fold"]
sample_fname = sample_row["slice_file_name"]
target_fold_dir = f"fold{sample_fold}"

print(f"[*INFO] Sample row -> fold={sample_fold}, fname={sample_fname}")

AUDIO_ROOT = None
for root, dirs, files in os.walk("/kaggle/input"):
    if os.path.basename(root) != target_fold_dir or sample_fname not in files:
        continue
    # root is the fold directory itself; AUDIO_ROOT is its parent (drops /foldX).
    AUDIO_ROOT = os.path.dirname(root)
    break
else:
    # No fold directory containing the probe file was found anywhere.
    raise FileNotFoundError(
        f"Không tìm thấy thư mục audio chứa fold{sample_fold} và file {sample_fname} trong /kaggle/input."
    )

print("[*INFO] AUDIO_ROOT detected as:", AUDIO_ROOT)

# Directory for saving checkpoints; created here if it does not exist yet.
CKPT_DIR = "/kaggle/working/ast_us8k_checkpoints"
os.makedirs(CKPT_DIR, exist_ok=True)

[*INFO] Found UrbanSound8K.csv at: /kaggle/input/urbansound8k/UrbanSound8K.csv
[*INFO] Sample row -> fold=5, fname=100032-3-0-0.wav
[*INFO] AUDIO_ROOT detected as: /kaggle/input/urbansound8k


In [3]:
# Sanity check: show the audio root that the dataset-discovery cell detected.
print(AUDIO_ROOT)

/kaggle/input/urbansound8k


New code: build the UrbanSound8K label and data files, then fine-tune through the AST repo's `src/run.py`.

In [4]:
from pathlib import Path
# Normalise REPO_DIR to a Path. If an earlier cell already defined it, wrap that
# value; otherwise use the absolute current working directory.
if 'REPO_DIR' in globals():
    REPO_DIR = Path(REPO_DIR)
else:
    REPO_DIR = Path('.').resolve()

# TORCH_HOME is pointed at <REPO_DIR>/pretrained_models, created if missing.
TORCH_HOME = REPO_DIR / 'pretrained_models'
os.environ['TORCH_HOME'] = str(TORCH_HOME)
TORCH_HOME.mkdir(exist_ok=True)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Repo root:', REPO_DIR)
print('Device:', device)

Repo root: /kaggle/working/ast
Device: cuda


In [5]:
# Write the label CSV (columns: index, mid, display_name), one line per class.
meta_df = pd.read_csv(METADATA_CSV)
label_csv = REPO_DIR / 'urban8k_data' / 'urban8k_class_labels_indices.csv'
label_csv.parent.mkdir(exist_ok=True)

# One (classID, class) pair per distinct class, ordered by classID.
class_pairs = meta_df[['classID', 'class']].drop_duplicates()
unique_classes = class_pairs.sort_values('classID')

label_lines = ['index,mid,display_name\n']
for class_id, class_name in zip(unique_classes['classID'], unique_classes['class']):
    class_id = int(class_id)
    # mid is a synthetic identifier: "/m/urban" followed by the zero-padded class id.
    label_lines.append(f"{class_id},/m/urban{class_id:02d},{class_name}\n")

with open(label_csv, 'w') as f:
    f.writelines(label_lines)

print('Label CSV written to:', label_csv)


Label CSV written to: /kaggle/working/ast/urban8k_data/urban8k_class_labels_indices.csv


In [6]:
import json, shutil

# Folder that receives the per-fold JSON datafiles (ESC-50 style); missing
# parent directories are created as well.
data_dir = REPO_DIR.joinpath('urban8k_data', 'datafiles')
data_dir.mkdir(parents=True, exist_ok=True)

def rows_for_df(df):
    """Turn metadata rows into datafile entries.

    Every row of `df` (columns `fold`, `slice_file_name`, `classID`) becomes a
    dict with two keys: `wav`, the string path
    <AUDIO_ROOT>/fold<fold>/<slice_file_name>, and `labels`, the class id
    rendered as "/m/urbanNN". The dicts are returned as a list in row order.
    """
    return [
        {
            'wav': str(Path(AUDIO_ROOT) / f"fold{int(fold_no)}" / file_name),
            'labels': f"/m/urban{int(class_id):02d}",
        }
        for fold_no, file_name, class_id in zip(df['fold'], df['slice_file_name'], df['classID'])
    ]

# For each fold 1..10 write two datafiles: the fold itself is the evaluation
# split, every other row of the metadata is the training split.
for fold in range(1, 11):
    held_out = meta_df['fold'] == fold
    train_df, eval_df = meta_df[~held_out], meta_df[held_out]
    fold_outputs = (
        (f'urban_train_data_{fold}.json', train_df),
        (f'urban_eval_data_{fold}.json', eval_df),
    )
    for json_name, split_df in fold_outputs:
        with open(data_dir / json_name, 'w') as f:
            json.dump({'data': rows_for_df(split_df)}, f, indent=1)
    print(f'Fold {fold}: train {len(train_df)}, eval {len(eval_df)}')


Fold 1: train 7859, eval 873
Fold 2: train 7844, eval 888
Fold 3: train 7807, eval 925
Fold 4: train 7742, eval 990
Fold 5: train 7796, eval 936
Fold 6: train 7909, eval 823
Fold 7: train 7894, eval 838
Fold 8: train 7926, eval 806
Fold 9: train 7916, eval 816
Fold 10: train 7895, eval 837


In [None]:
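# Compute dataset mean/std over fbank features (optional; when no stats are defined, the hard-coded fallback below is used).
# NOTE(review): the fallback values do not look like the AudioSet statistics used by the AST recipes - confirm their origin.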
import torchaudio

def compute_norm_stats(df, target_length=1024, mel_bins=128, target_sr=16000, max_files=None):
    """Estimate the mean and standard deviation of fbank feature values.

    For every selected row of `df`, the clip at
    <AUDIO_ROOT>/fold<fold>/<slice_file_name> is loaded, averaged over its
    first axis when that axis is longer than one (assumes torchaudio's
    channels-first layout), resampled to `target_sr` when its rate differs,
    and converted with `torchaudio.compliance.kaldi.fbank`. Each feature
    matrix is zero-padded or truncated along its first axis to exactly
    `target_length` rows. The statistics are taken over all resulting values,
    padding included.

    Args:
        df: metadata frame with `fold` and `slice_file_name` columns.
        target_length: number of fbank rows kept per clip.
        mel_bins: number of mel bins requested from fbank.
        target_sr: sample rate every clip is brought to.
        max_files: if given, a random sample of at most this many rows
            (seeded with SEED) is used; None uses every row.

    Returns:
        (mean, std) as Python floats.

    Raises:
        ValueError: if no feature values were accumulated (e.g. `df` is empty).
    """
    if max_files is None:
        rows = df
    else:
        # Asking for more files than exist means "use them all"; without the
        # cap, DataFrame.sample would raise.
        rows = df.sample(n=min(max_files, len(df)), random_state=SEED)

    resamplers = {}  # one Resample transform per source sample rate
    total = 0.0
    total_sq = 0.0
    count = 0
    for _, r in rows.iterrows():
        wav_path = Path(AUDIO_ROOT) / f"fold{int(r['fold'])}" / r['slice_file_name']
        wav, sr = torchaudio.load(str(wav_path))
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(sr, target_sr)
            wav = resamplers[sr](wav)
            sr = target_sr
        fb = torchaudio.compliance.kaldi.fbank(
            wav, htk_compat=True, sample_frequency=sr, use_energy=False,
            window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10
        )
        p = target_length - fb.shape[0]
        if p > 0:
            fb = torch.nn.functional.pad(fb, (0, 0, 0, p))
        elif p < 0:
            fb = fb[:target_length, :]
        # Accumulate in float64: E[x^2] - E[x]^2 loses precision to
        # cancellation when the sums are formed in float32.
        fb = fb.to(torch.float64)
        total += fb.sum().item()
        total_sq += (fb ** 2).sum().item()
        count += fb.numel()

    if count == 0:
        raise ValueError('compute_norm_stats needs at least one audio file with features')

    mean = total / count
    # Rounding can push the variance a hair below zero, and a negative float
    # raised to 0.5 yields a complex number in Python.
    var = max(total_sq / count - mean ** 2, 0.0)
    std = var ** 0.5
    return mean, std


# Keep DATASET_MEAN / DATASET_STD when both are already defined (e.g. by an
# earlier run); if either is missing, both fall back to the hard-coded values.
# NOTE(review): presumably these were produced by compute_norm_stats on this dataset - confirm.
if 'DATASET_MEAN' not in globals() or 'DATASET_STD' not in globals():
    DATASET_MEAN = -1.7362523965764032
    DATASET_STD = 3.2758855893221015
print('Using mean/std:', DATASET_MEAN, DATASET_STD)


Using mean/std: -1.7362523965764032 3.2758855893221015


In [8]:
# The current working directory is taken as the repo root.
REPO_DIR = Path('.').resolve()
# TORCH_HOME: a `pretrained_models` folder two levels above the repo root,
# created (with parents) if it does not exist.
PRETRAIN_DIR = REPO_DIR.joinpath('..', '..', 'pretrained_models').resolve()
PRETRAIN_DIR.mkdir(parents=True, exist_ok=True)
os.environ['TORCH_HOME'] = str(PRETRAIN_DIR)
print('TORCH_HOME:', PRETRAIN_DIR)
# Helper to launch training via src/run.py with ESC-50-like hyperparameters
def run_fold(fold, epochs=3, batch_size=12, lr=1e-5):
    """Train and evaluate one fold by running `src/run.py` in a child process.

    The fold's train/eval JSON datafiles are read from `data_dir`. The
    experiment directory <REPO_DIR>/urban8k_exp/fold<fold> is deleted and
    recreated, so results of an earlier run of the same fold are lost.

    Args:
        fold: fold number; selects the datafiles and the experiment directory.
        epochs: value passed as --n-epochs.
        batch_size: value passed as --batch-size.
        lr: value passed as --lr.

    Raises:
        FileNotFoundError: if a datafile for `fold` does not exist.
        subprocess.CalledProcessError: if the training script exits non-zero.
    """
    train_json = data_dir / f'urban_train_data_{fold}.json'
    eval_json = data_dir / f'urban_eval_data_{fold}.json'
    # Validate the inputs before deleting anything: otherwise a wrong `fold`
    # would wipe the previous results and only then fail inside the child.
    for datafile in (train_json, eval_json):
        if not datafile.is_file():
            raise FileNotFoundError(f'Missing datafile for fold {fold}: {datafile}')

    exp_dir = REPO_DIR / 'urban8k_exp' / f'fold{fold}'
    if exp_dir.exists():
        shutil.rmtree(exp_dir)
    exp_dir.mkdir(parents=True, exist_ok=True)

    cmd = [
        sys.executable, 'src/run.py',
        '--model', 'ast',
        '--dataset', 'urban8k',
        '--data-train', str(train_json),
        '--data-val', str(eval_json),
        '--exp-dir', str(exp_dir),
        '--label-csv', str(label_csv),
        '--n_class', '10',
        '--lr', str(lr),
        '--n-epochs', str(epochs),
        '--batch-size', str(batch_size),
        '--save_model', 'False',
        '--freqm', '24',
        '--timem', '96',
        '--mixup', '0',
        '--bal', 'none',
        '--tstride', '10', '--fstride', '10',
        '--imagenet_pretrain', 'True',
        '--audioset_pretrain', 'True',
        '--metrics', 'acc',
        '--loss', 'CE',
        '--warmup', 'False',
        '--lrscheduler_start', '5',
        '--lrscheduler_step', '1',
        '--lrscheduler_decay', '0.85',
        '--dataset_mean', str(DATASET_MEAN),
        '--dataset_std', str(DATASET_STD),
        '--audio_length', '1024',
        '--noise', 'False',
        '--num-workers', '4',
        '--n-print-steps', '50'
    ]
    env = os.environ.copy()
    # Give the child the directory this cell created and printed as TORCH_HOME.
    # The TORCH_HOME variable left over from an earlier cell points elsewhere.
    env['TORCH_HOME'] = str(PRETRAIN_DIR)
    print(f'Running fold {fold}:', ' '.join(cmd))
    # 'src/run.py' is relative, so the child must start in the repo root.
    subprocess.check_call(cmd, env=env, cwd=REPO_DIR)

# Defining run_fold starts nothing; training begins only when run_fold(fold) is called.
print('Helper ready: run_fold(fold) to launch training')



TORCH_HOME: /kaggle/pretrained_models
Helper ready: run_fold(fold) to launch training


In [9]:
# Train the folds handled in this session; each listed fold is held out once.
# Only folds 6-10 are listed, so the summary below covers those folds, not all ten.
folds_to_run = list(range(6, 11))
for f in folds_to_run:
    run_fold(f)

# Aggregate best validation accuracy per fold (col 0 of result.csv).
# NOTE(review): presumably result.csv has one row per epoch; the maximum is then
# picked on the held-out fold itself, which makes it an optimistic estimate.
fold_acc = []
for f in folds_to_run:
    res_path = REPO_DIR / 'urban8k_exp' / f'fold{f}' / 'result.csv'
    arr = np.loadtxt(res_path, delimiter=',')
    if arr.ndim == 1:
        # A single-row file loads as a 1-D array; restore the row axis.
        arr = arr[None, :]
    best_acc = float(arr[:, 0].max())
    fold_acc.append(best_acc)
    print(f'Fold {f}: best val acc {best_acc:.4f}')

mean_acc = float(np.mean(fold_acc))
std_acc = float(np.std(fold_acc))
# The label is derived from the folds actually run instead of claiming "10-fold".
print(f'Mean acc over {len(fold_acc)} folds {folds_to_run}: {mean_acc:.4f} +/- {std_acc:.4f}')


Running fold 6: /usr/bin/python3 src/run.py --model ast --dataset urban8k --data-train /kaggle/working/ast/urban8k_data/datafiles/urban_train_data_6.json --data-val /kaggle/working/ast/urban8k_data/datafiles/urban_eval_data_6.json --exp-dir /kaggle/working/ast/urban8k_exp/fold6 --label-csv /kaggle/working/ast/urban8k_data/urban8k_class_labels_indices.csv --n_class 10 --lr 1e-05 --n-epochs 3 --batch-size 12 --save_model False --freqm 24 --timem 96 --mixup 0 --bal none --tstride 10 --fstride 10 --imagenet_pretrain True --audioset_pretrain True --metrics acc --loss CE --warmup False --lrscheduler_start 5 --lrscheduler_step 1 --lrscheduler_decay 0.85 --dataset_mean -1.7362523965764032 --dataset_std 3.2758855893221015 --audio_length 1024 --noise False --num-workers 4 --n-print-steps 50


  @autocast()
  scaler = GradScaler()


I am process 88, running on f48cdd07b02b: starting (Fri Dec 12 13:10:43 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold6
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.734

  with autocast():


Epoch: [1][50/660]	Per Sample Total Time 0.06789	Per Sample Data Time 0.00156	Per Sample DNN Time 0.06633	Train Loss 1.4389	
Epoch: [1][100/660]	Per Sample Total Time 0.06652	Per Sample Data Time 0.00080	Per Sample DNN Time 0.06572	Train Loss 1.0084	
Epoch: [1][150/660]	Per Sample Total Time 0.06725	Per Sample Data Time 0.00055	Per Sample DNN Time 0.06670	Train Loss 0.8044	
Epoch: [1][200/660]	Per Sample Total Time 0.06868	Per Sample Data Time 0.00042	Per Sample DNN Time 0.06826	Train Loss 0.6665	
Epoch: [1][250/660]	Per Sample Total Time 0.06913	Per Sample Data Time 0.00034	Per Sample DNN Time 0.06878	Train Loss 0.5911	
Epoch: [1][300/660]	Per Sample Total Time 0.06960	Per Sample Data Time 0.00029	Per Sample DNN Time 0.06931	Train Loss 0.5315	
Epoch: [1][350/660]	Per Sample Total Time 0.06982	Per Sample Data Time 0.00025	Per Sample DNN Time 0.06957	Train Loss 0.4796	
Epoch: [1][400/660]	Per Sample Total Time 0.07006	Per Sample Data Time 0.00023	Per Sample DNN Time 0.06983	Train Loss 0

  with autocast():


Epoch: [2][40/660]	Per Sample Total Time 0.07283	Per Sample Data Time 0.00138	Per Sample DNN Time 0.07145	Train Loss 0.0678	
Epoch: [2][90/660]	Per Sample Total Time 0.07175	Per Sample Data Time 0.00064	Per Sample DNN Time 0.07112	Train Loss 0.0656	
Epoch: [2][140/660]	Per Sample Total Time 0.07165	Per Sample Data Time 0.00042	Per Sample DNN Time 0.07122	Train Loss 0.0707	
Epoch: [2][190/660]	Per Sample Total Time 0.07156	Per Sample Data Time 0.00032	Per Sample DNN Time 0.07124	Train Loss 0.0821	
Epoch: [2][240/660]	Per Sample Total Time 0.07143	Per Sample Data Time 0.00026	Per Sample DNN Time 0.07117	Train Loss 0.0802	
Epoch: [2][290/660]	Per Sample Total Time 0.07138	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07116	Train Loss 0.0778	
Epoch: [2][340/660]	Per Sample Total Time 0.07134	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07115	Train Loss 0.0753	
Epoch: [2][390/660]	Per Sample Total Time 0.07131	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07114	Train Loss 0.

  with autocast():


Epoch: [3][30/660]	Per Sample Total Time 0.07343	Per Sample Data Time 0.00175	Per Sample DNN Time 0.07168	Train Loss 0.0442	
Epoch: [3][80/660]	Per Sample Total Time 0.07184	Per Sample Data Time 0.00069	Per Sample DNN Time 0.07115	Train Loss 0.0409	
Epoch: [3][130/660]	Per Sample Total Time 0.07136	Per Sample Data Time 0.00044	Per Sample DNN Time 0.07092	Train Loss 0.0442	
Epoch: [3][180/660]	Per Sample Total Time 0.07121	Per Sample Data Time 0.00033	Per Sample DNN Time 0.07088	Train Loss 0.0455	
Epoch: [3][230/660]	Per Sample Total Time 0.07119	Per Sample Data Time 0.00026	Per Sample DNN Time 0.07093	Train Loss 0.0446	
Epoch: [3][280/660]	Per Sample Total Time 0.07113	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07090	Train Loss 0.0464	
Epoch: [3][330/660]	Per Sample Total Time 0.07109	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07090	Train Loss 0.0475	
Epoch: [3][380/660]	Per Sample Total Time 0.07106	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07089	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 4387, running on f48cdd07b02b: starting (Fri Dec 12 13:40:16 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold7
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.7

  with autocast():


Epoch: [1][50/658]	Per Sample Total Time 0.07709	Per Sample Data Time 0.00125	Per Sample DNN Time 0.07584	Train Loss 1.2991	
Epoch: [1][100/658]	Per Sample Total Time 0.07430	Per Sample Data Time 0.00065	Per Sample DNN Time 0.07365	Train Loss 0.9047	
Epoch: [1][150/658]	Per Sample Total Time 0.07359	Per Sample Data Time 0.00044	Per Sample DNN Time 0.07315	Train Loss 0.7162	
Epoch: [1][200/658]	Per Sample Total Time 0.07307	Per Sample Data Time 0.00034	Per Sample DNN Time 0.07273	Train Loss 0.6075	
Epoch: [1][250/658]	Per Sample Total Time 0.07287	Per Sample Data Time 0.00028	Per Sample DNN Time 0.07259	Train Loss 0.5339	
Epoch: [1][300/658]	Per Sample Total Time 0.07270	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07246	Train Loss 0.4847	
Epoch: [1][350/658]	Per Sample Total Time 0.07258	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07237	Train Loss 0.4431	
Epoch: [1][400/658]	Per Sample Total Time 0.07250	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07231	Train Loss 0

  with autocast():


Epoch: [2][42/658]	Per Sample Total Time 0.07215	Per Sample Data Time 0.00124	Per Sample DNN Time 0.07091	Train Loss 0.0738	
Epoch: [2][92/658]	Per Sample Total Time 0.07207	Per Sample Data Time 0.00059	Per Sample DNN Time 0.07148	Train Loss 0.0687	
Epoch: [2][142/658]	Per Sample Total Time 0.07164	Per Sample Data Time 0.00040	Per Sample DNN Time 0.07125	Train Loss 0.0800	
Epoch: [2][192/658]	Per Sample Total Time 0.07152	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07122	Train Loss 0.0773	
Epoch: [2][242/658]	Per Sample Total Time 0.07139	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07115	Train Loss 0.0713	
Epoch: [2][292/658]	Per Sample Total Time 0.07131	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07110	Train Loss 0.0725	
Epoch: [2][342/658]	Per Sample Total Time 0.07129	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07110	Train Loss 0.0710	
Epoch: [2][392/658]	Per Sample Total Time 0.07125	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07109	Train Loss 0.

  with autocast():


Epoch: [3][34/658]	Per Sample Total Time 0.07296	Per Sample Data Time 0.00168	Per Sample DNN Time 0.07128	Train Loss 0.0411	
Epoch: [3][84/658]	Per Sample Total Time 0.07150	Per Sample Data Time 0.00071	Per Sample DNN Time 0.07078	Train Loss 0.0363	
Epoch: [3][134/658]	Per Sample Total Time 0.07103	Per Sample Data Time 0.00046	Per Sample DNN Time 0.07057	Train Loss 0.0302	
Epoch: [3][184/658]	Per Sample Total Time 0.07092	Per Sample Data Time 0.00034	Per Sample DNN Time 0.07058	Train Loss 0.0310	
Epoch: [3][234/658]	Per Sample Total Time 0.07092	Per Sample Data Time 0.00028	Per Sample DNN Time 0.07064	Train Loss 0.0370	
Epoch: [3][284/658]	Per Sample Total Time 0.07090	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07066	Train Loss 0.0373	
Epoch: [3][334/658]	Per Sample Total Time 0.07093	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07072	Train Loss 0.0470	
Epoch: [3][384/658]	Per Sample Total Time 0.07091	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07073	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 8675, running on f48cdd07b02b: starting (Fri Dec 12 14:09:50 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold8
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.7

  with autocast():


Epoch: [1][50/661]	Per Sample Total Time 0.07654	Per Sample Data Time 0.00111	Per Sample DNN Time 0.07542	Train Loss 1.3134	
Epoch: [1][100/661]	Per Sample Total Time 0.07405	Per Sample Data Time 0.00058	Per Sample DNN Time 0.07347	Train Loss 0.9137	
Epoch: [1][150/661]	Per Sample Total Time 0.07340	Per Sample Data Time 0.00040	Per Sample DNN Time 0.07300	Train Loss 0.7190	
Epoch: [1][200/661]	Per Sample Total Time 0.07293	Per Sample Data Time 0.00031	Per Sample DNN Time 0.07262	Train Loss 0.6068	
Epoch: [1][250/661]	Per Sample Total Time 0.07277	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07251	Train Loss 0.5381	
Epoch: [1][300/661]	Per Sample Total Time 0.07262	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07240	Train Loss 0.4783	
Epoch: [1][350/661]	Per Sample Total Time 0.07252	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07233	Train Loss 0.4356	
Epoch: [1][400/661]	Per Sample Total Time 0.07244	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07227	Train Loss 0

  with autocast():


Epoch: [2][39/661]	Per Sample Total Time 0.07276	Per Sample Data Time 0.00142	Per Sample DNN Time 0.07135	Train Loss 0.1116	
Epoch: [2][89/661]	Per Sample Total Time 0.07174	Per Sample Data Time 0.00065	Per Sample DNN Time 0.07109	Train Loss 0.1009	
Epoch: [2][139/661]	Per Sample Total Time 0.07145	Per Sample Data Time 0.00043	Per Sample DNN Time 0.07102	Train Loss 0.0941	
Epoch: [2][189/661]	Per Sample Total Time 0.07141	Per Sample Data Time 0.00032	Per Sample DNN Time 0.07109	Train Loss 0.0886	
Epoch: [2][239/661]	Per Sample Total Time 0.07133	Per Sample Data Time 0.00026	Per Sample DNN Time 0.07106	Train Loss 0.0936	
Epoch: [2][289/661]	Per Sample Total Time 0.07127	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07104	Train Loss 0.0916	
Epoch: [2][339/661]	Per Sample Total Time 0.07121	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07102	Train Loss 0.0880	
Epoch: [2][389/661]	Per Sample Total Time 0.07122	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07104	Train Loss 0.

  with autocast():


Epoch: [3][28/661]	Per Sample Total Time 0.07298	Per Sample Data Time 0.00210	Per Sample DNN Time 0.07088	Train Loss 0.0729	
Epoch: [3][78/661]	Per Sample Total Time 0.07147	Per Sample Data Time 0.00079	Per Sample DNN Time 0.07068	Train Loss 0.0651	
Epoch: [3][128/661]	Per Sample Total Time 0.07122	Per Sample Data Time 0.00050	Per Sample DNN Time 0.07072	Train Loss 0.0664	
Epoch: [3][178/661]	Per Sample Total Time 0.07112	Per Sample Data Time 0.00037	Per Sample DNN Time 0.07075	Train Loss 0.0615	
Epoch: [3][228/661]	Per Sample Total Time 0.07104	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07074	Train Loss 0.0582	
Epoch: [3][278/661]	Per Sample Total Time 0.07100	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07075	Train Loss 0.0555	
Epoch: [3][328/661]	Per Sample Total Time 0.07099	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07077	Train Loss 0.0532	
Epoch: [3][378/661]	Per Sample Total Time 0.07096	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07077	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 12975, running on f48cdd07b02b: starting (Fri Dec 12 14:39:29 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold9
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87.

  with autocast():


Epoch: [1][50/660]	Per Sample Total Time 0.07653	Per Sample Data Time 0.00106	Per Sample DNN Time 0.07547	Train Loss 1.2994	
Epoch: [1][100/660]	Per Sample Total Time 0.07393	Per Sample Data Time 0.00055	Per Sample DNN Time 0.07338	Train Loss 0.9321	
Epoch: [1][150/660]	Per Sample Total Time 0.07330	Per Sample Data Time 0.00038	Per Sample DNN Time 0.07292	Train Loss 0.7524	
Epoch: [1][200/660]	Per Sample Total Time 0.07281	Per Sample Data Time 0.00029	Per Sample DNN Time 0.07251	Train Loss 0.6388	
Epoch: [1][250/660]	Per Sample Total Time 0.07256	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07232	Train Loss 0.5674	
Epoch: [1][300/660]	Per Sample Total Time 0.07240	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07219	Train Loss 0.5121	
Epoch: [1][350/660]	Per Sample Total Time 0.07229	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07211	Train Loss 0.4736	
Epoch: [1][400/660]	Per Sample Total Time 0.07218	Per Sample Data Time 0.00016	Per Sample DNN Time 0.07202	Train Loss 0

  with autocast():


Epoch: [2][40/660]	Per Sample Total Time 0.07295	Per Sample Data Time 0.00141	Per Sample DNN Time 0.07155	Train Loss 0.0925	
Epoch: [2][90/660]	Per Sample Total Time 0.07152	Per Sample Data Time 0.00065	Per Sample DNN Time 0.07087	Train Loss 0.0836	
Epoch: [2][140/660]	Per Sample Total Time 0.07146	Per Sample Data Time 0.00043	Per Sample DNN Time 0.07103	Train Loss 0.0748	
Epoch: [2][190/660]	Per Sample Total Time 0.07130	Per Sample Data Time 0.00033	Per Sample DNN Time 0.07097	Train Loss 0.0742	
Epoch: [2][240/660]	Per Sample Total Time 0.07129	Per Sample Data Time 0.00027	Per Sample DNN Time 0.07103	Train Loss 0.0776	
Epoch: [2][290/660]	Per Sample Total Time 0.07121	Per Sample Data Time 0.00023	Per Sample DNN Time 0.07098	Train Loss 0.0810	
Epoch: [2][340/660]	Per Sample Total Time 0.07116	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07097	Train Loss 0.0805	
Epoch: [2][390/660]	Per Sample Total Time 0.07116	Per Sample Data Time 0.00018	Per Sample DNN Time 0.07098	Train Loss 0.

  with autocast():


Epoch: [3][30/660]	Per Sample Total Time 0.07279	Per Sample Data Time 0.00172	Per Sample DNN Time 0.07107	Train Loss 0.0659	
Epoch: [3][80/660]	Per Sample Total Time 0.07179	Per Sample Data Time 0.00068	Per Sample DNN Time 0.07111	Train Loss 0.0529	
Epoch: [3][130/660]	Per Sample Total Time 0.07124	Per Sample Data Time 0.00043	Per Sample DNN Time 0.07081	Train Loss 0.0419	
Epoch: [3][180/660]	Per Sample Total Time 0.07112	Per Sample Data Time 0.00032	Per Sample DNN Time 0.07080	Train Loss 0.0397	
Epoch: [3][230/660]	Per Sample Total Time 0.07099	Per Sample Data Time 0.00026	Per Sample DNN Time 0.07073	Train Loss 0.0358	
Epoch: [3][280/660]	Per Sample Total Time 0.07100	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07078	Train Loss 0.0416	
Epoch: [3][330/660]	Per Sample Total Time 0.07098	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07079	Train Loss 0.0461	
Epoch: [3][380/660]	Per Sample Total Time 0.07096	Per Sample Data Time 0.00017	Per Sample DNN Time 0.07079	Train Loss 0.

  @autocast()
  scaler = GradScaler()


I am process 17269, running on f48cdd07b02b: starting (Fri Dec 12 15:09:04 2025)
now train a audio spectrogram transformer model
balanced sampler is not used
---------------the train dataloader---------------
now using following mask: 24 freq, 96 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process urban8k
use dataset mean -1.736 and std 3.276 to normalize the input.
number of classes is 10
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=1212

Creating experiment directory: /kaggle/working/ast/urban8k_exp/fold10
Now starting training for 3 epochs
running on cuda
Total parameter number is : 87.734 million
Total trainable parameter number is : 87

  with autocast():


Epoch: [1][50/658]	Per Sample Total Time 0.07674	Per Sample Data Time 0.00123	Per Sample DNN Time 0.07551	Train Loss 1.3386	
Epoch: [1][100/658]	Per Sample Total Time 0.07406	Per Sample Data Time 0.00064	Per Sample DNN Time 0.07342	Train Loss 0.9493	
Epoch: [1][150/658]	Per Sample Total Time 0.07342	Per Sample Data Time 0.00044	Per Sample DNN Time 0.07298	Train Loss 0.7352	
Epoch: [1][200/658]	Per Sample Total Time 0.07299	Per Sample Data Time 0.00034	Per Sample DNN Time 0.07265	Train Loss 0.6336	
Epoch: [1][250/658]	Per Sample Total Time 0.07277	Per Sample Data Time 0.00028	Per Sample DNN Time 0.07250	Train Loss 0.5685	
Epoch: [1][300/658]	Per Sample Total Time 0.07262	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07238	Train Loss 0.5083	
Epoch: [1][350/658]	Per Sample Total Time 0.07247	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07226	Train Loss 0.4626	
Epoch: [1][400/658]	Per Sample Total Time 0.07232	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07214	Train Loss 0

  with autocast():


Epoch: [2][42/658]	Per Sample Total Time 0.07278	Per Sample Data Time 0.00147	Per Sample DNN Time 0.07131	Train Loss 0.0842	
Epoch: [2][92/658]	Per Sample Total Time 0.07172	Per Sample Data Time 0.00070	Per Sample DNN Time 0.07102	Train Loss 0.0799	
Epoch: [2][142/658]	Per Sample Total Time 0.07158	Per Sample Data Time 0.00047	Per Sample DNN Time 0.07111	Train Loss 0.0879	
Epoch: [2][192/658]	Per Sample Total Time 0.07145	Per Sample Data Time 0.00035	Per Sample DNN Time 0.07109	Train Loss 0.0878	
Epoch: [2][242/658]	Per Sample Total Time 0.07137	Per Sample Data Time 0.00029	Per Sample DNN Time 0.07108	Train Loss 0.0872	
Epoch: [2][292/658]	Per Sample Total Time 0.07128	Per Sample Data Time 0.00024	Per Sample DNN Time 0.07104	Train Loss 0.0823	
Epoch: [2][342/658]	Per Sample Total Time 0.07126	Per Sample Data Time 0.00021	Per Sample DNN Time 0.07105	Train Loss 0.0765	
Epoch: [2][392/658]	Per Sample Total Time 0.07121	Per Sample Data Time 0.00019	Per Sample DNN Time 0.07102	Train Loss 0.

  with autocast():


Epoch: [3][34/658]	Per Sample Total Time 0.07311	Per Sample Data Time 0.00184	Per Sample DNN Time 0.07128	Train Loss 0.0389	
Epoch: [3][84/658]	Per Sample Total Time 0.07157	Per Sample Data Time 0.00078	Per Sample DNN Time 0.07079	Train Loss 0.0461	
Epoch: [3][134/658]	Per Sample Total Time 0.07112	Per Sample Data Time 0.00050	Per Sample DNN Time 0.07062	Train Loss 0.0469	
Epoch: [3][184/658]	Per Sample Total Time 0.07095	Per Sample Data Time 0.00037	Per Sample DNN Time 0.07058	Train Loss 0.0475	
Epoch: [3][234/658]	Per Sample Total Time 0.07086	Per Sample Data Time 0.00030	Per Sample DNN Time 0.07056	Train Loss 0.0499	
Epoch: [3][284/658]	Per Sample Total Time 0.07083	Per Sample Data Time 0.00025	Per Sample DNN Time 0.07058	Train Loss 0.0514	
Epoch: [3][334/658]	Per Sample Total Time 0.07080	Per Sample Data Time 0.00022	Per Sample DNN Time 0.07058	Train Loss 0.0535	
Epoch: [3][384/658]	Per Sample Total Time 0.07076	Per Sample Data Time 0.00020	Per Sample DNN Time 0.07057	Train Loss 0.