In [173]:
import pandas as pd
import gc
import re
import numpy as np
import torch
from imblearn.over_sampling import RandomOverSampler
import datasets
import transformers
print(transformers.__version__)
from pathlib import Path
import torchaudio

import warnings 
warnings.filterwarnings("ignore")

from tqdm import tqdm
tqdm.pandas()

4.46.3


In [174]:
RATE_HZ = 16000 # resampling rate in Hz
MAX_LENGTH = 160000 # maximum audio interval length to consider (= RATE_HZ * SECONDS)
AUDIO_FOLDER = Path("fma_test") 

In [175]:
import json
lableid_file = 'fma_metadata\mappings.json'

# Read mappings back from the JSON file
with open(lableid_file, 'r') as f:
    mappings = json.load(f)
    label2id = mappings['label2id']
    id2label = mappings['id2label']

# Print loaded mappings to confirm
print("Loaded label2id:", label2id)
print("Loaded id2label:", id2label)
print(len(label2id))

Loaded label2id: {'Avant-Garde': 1, 'International': 2, 'Blues': 3, 'Jazz': 4, 'Classical': 5, 'Novelty': 6, 'Comedy': 7, 'Old-Time / Historic': 8, 'Country': 9, 'Pop': 10, 'Disco': 11, 'Rock': 12, 'Easy Listening': 13, 'Soul-RnB': 14, 'Electronic': 15, 'Sound Effects': 16, 'Folk': 17, 'Soundtrack': 18, 'Funk': 19, 'Spoken': 20, 'Hip-Hop': 21, 'Audio Collage': 22, 'Punk': 25, 'Post-Rock': 26, 'Lo-Fi': 27, 'Field Recordings': 30, 'Metal': 31, 'Noise': 32, 'Psych-Folk': 33, 'Krautrock': 36, 'Jazz: Vocal': 37, 'Experimental': 38, 'Electroacoustic': 41, 'Ambient Electronic': 42, 'Radio Art': 43, 'Loud-Rock': 45, 'Latin America': 46, 'Drone': 47, 'Free-Folk': 49, 'Noise-Rock': 53, 'Psych-Rock': 58, 'Bluegrass': 63, 'Electro-Punk': 64, 'Radio': 65, 'Indie-Rock': 66, 'Industrial': 70, 'No Wave': 71, 'Free-Jazz': 74, 'Experimental Pop': 76, 'French': 77, 'Reggae - Dub': 79, 'Afrobeat': 81, 'Nerdcore': 83, 'Garage': 85, 'Indian': 86, 'New Wave': 88, 'Post-Punk': 89, 'Sludge': 90, 'African': 92,

In [176]:
from pathlib import Path
import torchaudio
with open('fma_metadata\\track_genres.json', 'r') as f:
    data = json.load(f)
df = [{"file": track_id, "label": genre_id} for track_id, genre_id in data.items()]
dd = pd.DataFrame(df)
print(dd.head())
print(dd.shape)
print(dd['label'].value_counts())

  file  label
0    2   21.0
1    3   21.0
2    5   21.0
3   10   10.0
4   20   76.0
(106574, 2)
label
15.0     20325
1.0       8693
38.0      6697
12.0      6639
10.0      5910
         ...  
444.0        2
810.0        2
502.0        2
173.0        1
170.0        1
Name: count, Length: 148, dtype: int64


In [177]:
def get_transform_audio(file):
    audio,rate = torchaudio.load(str(file))
    transform = torchaudio.transforms.Resample(rate,RATE_HZ)
    if audio.size(0) > 1:  # 如果是多通道
        audio = audio.mean(dim=0)
    audio = transform(audio).numpy()
    audio = audio[:MAX_LENGTH]
    return audio # truncate to first part of audio to save RAM

In [178]:
audio_data = []
labels = []

for _, row in dd.iterrows():
    # 补齐文件名到6位
    track_id = row["file"].zfill(6)
    label = row["label"]
    
    # 拼接文件路径
    audio_path = AUDIO_FOLDER / track_id[:3] / f"{track_id}.mp3"
    
    if audio_path.exists():
        try:
            # 读取并处理音频
            audio = get_transform_audio(audio_path)
            audio_data.append(audio)
            labels.append(label)
        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
    else:
        print(f"Audio file not found: {audio_path}")

# 检查结果
print("Processed audio files:", len(audio_data))
print("Processed labels:", len(labels))

Audio file not found: fma_test\000\000003.mp3
Audio file not found: fma_test\000\000020.mp3
Audio file not found: fma_test\000\000026.mp3
Audio file not found: fma_test\000\000030.mp3
Audio file not found: fma_test\000\000046.mp3
Audio file not found: fma_test\000\000048.mp3
Audio file not found: fma_test\000\000134.mp3
Audio file not found: fma_test\000\000135.mp3
Audio file not found: fma_test\000\000136.mp3
Audio file not found: fma_test\000\000137.mp3
Audio file not found: fma_test\000\000138.mp3
Audio file not found: fma_test\000\000139.mp3
Audio file not found: fma_test\000\000142.mp3
Audio file not found: fma_test\000\000144.mp3
Audio file not found: fma_test\000\000145.mp3
Audio file not found: fma_test\000\000146.mp3
Audio file not found: fma_test\000\000147.mp3
Audio file not found: fma_test\000\000149.mp3
Audio file not found: fma_test\000\000150.mp3
Audio file not found: fma_test\000\000151.mp3
Audio file not found: fma_test\000\000152.mp3
Audio file not found: fma_test\000

In [179]:
print(type(labels))
CLASS_NUM=len(np.unique(labels))
print(CLASS_NUM)
labels = torch.tensor(labels, dtype=torch.long)
print("Tensor dtype:", labels.dtype)

totdd = pd.DataFrame({
    "label": labels,
    "audio": audio_data
})

# 检查结果
print(totdd.sample(5)) 
totdd["audio"] = totdd["audio"].apply(lambda x: np.array(x, dtype=np.float32))
print(totdd.sample(5)) 
print(totdd['label'].dtype)

<class 'list'>
33
Tensor dtype: torch.int64
     label                                              audio
155     12  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
6       27  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
164     12  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
60      32  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
113     38  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
     label                                              audio
97       2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
73      17  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
80      22  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
155     12  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
104      2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
int64


In [180]:
lengths = totdd["audio"].apply(len)
print(lengths.describe())

# 检查是否存在极端长的音频
print(lengths.nlargest(5))

count       251.000000
mean     158087.661355
std       17421.954332
min           1.000000
25%      160000.000000
50%      160000.000000
75%      160000.000000
max      160000.000000
Name: audio, dtype: float64
0    160000
1    160000
2    160000
3    160000
4    160000
Name: audio, dtype: int64


In [181]:
print(totdd["audio"].apply(len).nunique()) 
totdd = totdd[totdd["audio"].apply(len) >= 16000]
print(totdd["audio"].apply(len).nunique()) 

2
1


In [182]:
from datasets import Dataset, ClassLabel
totdd = Dataset.from_pandas(totdd)

from collections import Counter
Counter(totdd['label']).items()


dict_items([(21, 9), (10, 10), (17, 65), (1, 4), (27, 1), (12, 38), (31, 1), (89, 1), (36, 1), (25, 2), (41, 1), (46, 22), (2, 12), (49, 2), (32, 8), (53, 7), (22, 7), (47, 1), (15, 6), (33, 5), (30, 1), (58, 1), (38, 5), (98, 1), (76, 7), (118, 2), (103, 1), (117, 13), (92, 6), (77, 5), (102, 3)])

In [183]:
print(type(totdd['label'][0]))
totdd = totdd.map(lambda x: {"label": torch.tensor(x["label"], dtype=torch.long)}, batched=True)
print(type(totdd['label'][0]))

<class 'int'>


Map:   0%|          | 0/248 [00:00<?, ? examples/s]

<class 'int'>


In [184]:
totdd = totdd.train_test_split(test_size=0.2)

In [185]:
print(totdd)
totdd['train'] = totdd['train'].remove_columns(['__index_level_0__'])
totdd['test'] = totdd['test'].remove_columns(['__index_level_0__'])
totdd

DatasetDict({
    train: Dataset({
        features: ['label', 'audio', '__index_level_0__'],
        num_rows: 198
    })
    test: Dataset({
        features: ['label', 'audio', '__index_level_0__'],
        num_rows: 50
    })
})


DatasetDict({
    train: Dataset({
        features: ['label', 'audio'],
        num_rows: 198
    })
    test: Dataset({
        features: ['label', 'audio'],
        num_rows: 50
    })
})

In [186]:
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

model_str = "MIT/ast-finetuned-audioset-10-10-0.4593" 
feature_extractor = AutoFeatureExtractor.from_pretrained(model_str)
model = AutoModelForAudioClassification.from_pretrained(model_str,num_labels=len(label2id),ignore_mismatched_sizes=True)
model.config.id2label = id2label
# number of trainable parameters
print(model.num_parameters(only_trainable=True)/1e6)
print(len(label2id))
print(len(np.unique(labels)))

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([163]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([163, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


86.314147
163
33


In [187]:
def preprocess_function(batch):    
    inputs = feature_extractor(
        batch['audio'], 
        sampling_rate=RATE_HZ, 
        max_length=MAX_LENGTH, 
        truncation=True
    )
    inputs['input_values'] = inputs['input_values'][0]
    
    # Convert label to torch.long
    if 'label' in batch:
        inputs['labels'] = torch.tensor(batch['label'], dtype=torch.long)
    
    return inputs

totdd['test'] = totdd['test'].map(
    preprocess_function, 
    remove_columns=totdd['test'].column_names,
    batched=False
)

totdd['train'] = totdd['train'].map(
    preprocess_function, 
    remove_columns=totdd['train'].column_names,
    batched=False
)
totdd['train'].set_format(type='torch', columns=['input_values', 'labels'])
totdd['test'].set_format(type='torch', columns=['input_values', 'labels'])

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

In [188]:
print("Training data label type:", type(totdd['train'][0]['labels']))
print("Training data label dtype:", totdd['train'][0]['labels'].dtype)
print("Test data label type:", type(totdd['test'][0]['labels']))
print("Test data label dtype:", totdd['test'][0]['labels'].dtype)

Training data label type: <class 'torch.Tensor'>
Training data label dtype: torch.int64
Test data label type: <class 'torch.Tensor'>
Test data label dtype: torch.int64


In [189]:
gc.collect()

5969

In [190]:
import evaluate
from sklearn.preprocessing import label_binarize

accuracy = evaluate.load("accuracy")

from sklearn.metrics import roc_auc_score
def compute_metrics(eval_pred):
    predictions = eval_pred.predictions  # shape: (n_samples, 163)
    label_ids = eval_pred.label_ids     # shape: (n_samples,)
    
    # 获取当前数据集中实际出现的类别
    present_classes = np.unique(label_ids)
    print(f"Present classes in current dataset: {present_classes}")
    print(f"Number of present classes: {len(present_classes)}")
    print(f"Shape of predictions: {predictions.shape}")
    
    # 应用softmax得到概率
    predictions = np.exp(predictions)/np.exp(predictions).sum(axis=1, keepdims=True)
    
    # 计算准确率（这个不需要修改，因为argmax会自动找到最大概率的类别）
    acc_score = accuracy.compute(
        predictions=predictions.argmax(axis=1),
        references=label_ids
    )['accuracy']
    
    # 对于ROC AUC，我们只考虑实际出现的类别
    # 将标签转换为二值化形式，但只针对出现的类别
    y_true_bin = label_binarize(label_ids, classes=present_classes)
    
    # 只取出对应出现类别的预测概率
    predictions_subset = predictions[:, present_classes]
    
    # 计算ROC AUC
    try:
        roc_auc = roc_auc_score(
            y_true=y_true_bin,
            y_score=predictions_subset,
            multi_class='ovr',
            average='macro'
        )
    except ValueError as e:
        print(f"ROC AUC calculation error: {e}")
        roc_auc = 0.0
    
    return {
        "roc_auc": roc_auc,
        "accuracy": acc_score,
        "present_classes": len(present_classes)
    }

In [193]:
from transformers import TrainingArguments, Trainer
batch_size=4
warmup_steps=50
weight_decay=0.02
num_train_epochs=10
model_name = "test_train_model"
training_args = TrainingArguments(
    output_dir=model_name,
    logging_dir='./logs',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=5e-5, # 5e-6
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=10,#1
    evaluation_strategy='epoch',
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    eval_steps=20,#1
    gradient_accumulation_steps=1, 
    gradient_checkpointing=True,
    save_strategy='epoch',
    save_total_limit=1, # save fewer checkpoints to limit used space
    report_to="mlflow",  # log to mlflow

    optim="adamw_hf",  # Optimizer, in this case Adam
    adam_beta1=0.9,  # Adam optimizer beta1
    adam_beta2=0.999,  # Adam optimizer beta2
    adam_epsilon=1e-8,  # Adam optimizer epsilon
    fp16=True,
    lr_scheduler_type="linear"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=totdd["train"],
    eval_dataset=totdd["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [195]:
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

Present classes in current dataset: [  1   2  10  12  15  17  21  22  32  38  46  53  76  77 102 103 117]
Number of present classes: 17
Shape of predictions: (50, 163)


{'eval_loss': 5.024621486663818,
 'eval_model_preparation_time': 0.002,
 'eval_roc_auc': 0.5950987877994387,
 'eval_accuracy': 0.02,
 'eval_present_classes': 17,
 'eval_runtime': 42.6015,
 'eval_samples_per_second': 1.174,
 'eval_steps_per_second': 0.305}

In [None]:
trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]

{'loss': 4.9373, 'grad_norm': 28.655921936035156, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.02}
{'loss': 4.8465, 'grad_norm': 28.647785186767578, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.04}


KeyboardInterrupt: 

2024/11/20 18:30:52 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 813f458534ad475ba176a1d0d13d8c9b: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_steps' was already logged with value='1' for run ID='813f458534ad475ba176a1d0d13d8c9b'. Attempted logging new value '10'.


  0%|          | 0/500 [00:00<?, ?it/s]