# 安裝必要依賴

In [1]:
! pip install pydub librosa numpy scikit-learn xgboost




[notice] A new release of pip is available: 23.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# MP3 轉 WAV

In [2]:
import os
from pydub import AudioSegment

MP3_PATH = "CAT_DB"         # folder holding the original mp3 files
WAV_PATH = "CAT_DB_WAV"     # folder receiving the converted wav files

os.makedirs(WAV_PATH, exist_ok=True)

# Walk every sub-folder of MP3_PATH and mirror it under WAV_PATH.
# Sorting makes the processing order deterministic across platforms.
for emotion in sorted(os.listdir(MP3_PATH)):
    emotion_mp3_dir = os.path.join(MP3_PATH, emotion)

    # Skip stray files *before* creating anything, so that a non-directory
    # entry in MP3_PATH does not leave an empty folder behind in WAV_PATH.
    if not os.path.isdir(emotion_mp3_dir):
        continue

    emotion_wav_dir = os.path.join(WAV_PATH, emotion)
    os.makedirs(emotion_wav_dir, exist_ok=True)

    for filename in sorted(os.listdir(emotion_mp3_dir)):
        # Split off the real extension instead of str.replace(), which would
        # also rewrite a ".mp3" occurring in the middle of the file name.
        stem, ext = os.path.splitext(filename)
        if ext.lower() != ".mp3":
            continue

        mp3_file = os.path.join(emotion_mp3_dir, filename)
        wav_file = os.path.join(emotion_wav_dir, stem + ".wav")
        print(f"轉檔中: {mp3_file} → {wav_file}")

        audio = AudioSegment.from_mp3(mp3_file)
        # export() returns the output file handle; close it so handles do not
        # accumulate while converting many files.
        audio.export(wav_file, format="wav").close()

轉檔中: CAT_DB\Angry\car_extcoll0156.mp3 → CAT_DB_WAV\Angry\car_extcoll0156.wav
轉檔中: CAT_DB\Angry\car_extcoll0162.mp3 → CAT_DB_WAV\Angry\car_extcoll0162.wav
轉檔中: CAT_DB\Angry\car_extcoll0169.mp3 → CAT_DB_WAV\Angry\car_extcoll0169.wav
轉檔中: CAT_DB\Angry\car_extcoll0171.mp3 → CAT_DB_WAV\Angry\car_extcoll0171.wav
轉檔中: CAT_DB\Angry\car_extcoll0172.mp3 → CAT_DB_WAV\Angry\car_extcoll0172.wav
轉檔中: CAT_DB\Angry\car_extcoll0174.mp3 → CAT_DB_WAV\Angry\car_extcoll0174.wav
轉檔中: CAT_DB\Angry\car_extcoll0175.mp3 → CAT_DB_WAV\Angry\car_extcoll0175.wav
轉檔中: CAT_DB\Angry\car_extcoll0177.mp3 → CAT_DB_WAV\Angry\car_extcoll0177.wav
轉檔中: CAT_DB\Angry\car_extcoll0191.mp3 → CAT_DB_WAV\Angry\car_extcoll0191.wav
轉檔中: CAT_DB\Angry\car_extcoll0212.mp3 → CAT_DB_WAV\Angry\car_extcoll0212.wav
轉檔中: CAT_DB\Defense\car_extcoll0168.mp3 → CAT_DB_WAV\Defense\car_extcoll0168.wav
轉檔中: CAT_DB\Defense\car_extcoll0180.mp3 → CAT_DB_WAV\Defense\car_extcoll0180.wav
轉檔中: CAT_DB\Defense\car_extcoll0181.mp3 → CAT_DB_WAV\Defense\car_ext

# 資料集切割和特徵擷取

In [3]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Entries found directly under WAV_PATH (the loop further down skips anything
# that is not a directory). os.listdir() returns names in arbitrary order, so
# sort them to keep the processing order reproducible.
EMOTIONS = sorted(os.listdir(WAV_PATH))

def extract_features(y, sr):
    """Summarise an audio signal as a single 1-D feature vector.

    Six frame-level descriptors are computed with librosa (13 MFCCs, chroma,
    spectral contrast, zero-crossing rate, RMS energy, and tonnetz of the
    harmonic component). Each descriptor is reduced along axis 1 to its mean
    and its standard deviation, and the results are concatenated in the order
    mean, std for each descriptor in turn.

    Args:
        y: Audio samples as accepted by librosa.
        sr: Sampling rate of ``y``.

    Returns:
        A 1-D numpy array with the concatenated statistics.
    """
    descriptors = (
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
        librosa.feature.chroma_stft(y=y, sr=sr),
        librosa.feature.spectral_contrast(y=y, sr=sr),
        librosa.feature.zero_crossing_rate(y),
        librosa.feature.rms(y=y),
        # Tonnetz is computed on the harmonic part of the signal only.
        librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr),
    )

    stats = []
    for matrix in descriptors:
        stats.append(np.mean(matrix, axis=1))
        stats.append(np.std(matrix, axis=1))

    return np.hstack(stats)

def add_white_noise(y, noise_factor=0.005, rng=None):
    """Return ``y`` with scaled standard-normal noise added.

    Args:
        y: Audio samples (array-like). The noise is drawn with the same shape
            as ``y``, so multi-dimensional input is supported as well.
        noise_factor: Multiplier applied to the standard-normal noise.
        rng: Optional random generator exposing ``standard_normal(shape)``
            (e.g. ``np.random.default_rng(seed)``). When omitted, numpy's
            global random state is used, so ``np.random.seed`` still controls
            the result exactly as before.

    Returns:
        ``y + noise_factor * noise`` as a numpy array.
    """
    shape = np.shape(y)
    if rng is None:
        # Same stream as np.random.randn(len(y)) for 1-D input.
        noise = np.random.standard_normal(shape)
    else:
        noise = rng.standard_normal(shape)
    return y + noise_factor * noise

def pitch_shift(y, sr, n_steps):
    """Shift the pitch of ``y`` by ``n_steps`` using librosa.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``.
        n_steps: Number of steps to shift, forwarded to
            ``librosa.effects.pitch_shift``.

    Returns:
        The pitch-shifted signal returned by librosa.
    """
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted

def time_stretch(y, rate):
    """Time-stretch ``y`` by ``rate`` using librosa.

    Args:
        y: Audio samples.
        rate: Stretch factor, forwarded to ``librosa.effects.time_stretch``.

    Returns:
        The stretched signal returned by librosa.
    """
    stretched = librosa.effects.time_stretch(y, rate=rate)
    return stretched

def change_volume(y, db):
    """Scale ``y`` by a gain expressed in decibels.

    Args:
        y: Audio samples (scalar or array).
        db: Gain in dB; positive values amplify, negative values attenuate.

    Returns:
        ``y`` multiplied by the linear gain ``10 ** (db / 20)``.
    """
    linear_gain = 10.0 ** (db / 20.0)
    return y * linear_gain

def augment_and_extract(y_raw, sr, label):
    """Augment one audio signal and extract features from every variant.

    Eight variants are processed in a fixed order: the raw signal, added white
    noise, pitch shift by +2 and -2 steps, time stretch at rates 1.2 and 0.8,
    and volume change by +5 dB and -5 dB.

    Each variant is built lazily inside the ``try`` block, so a failure in the
    augmentation step is reported and skipped just like a failure in feature
    extraction, instead of aborting the whole run. Building lazily also keeps
    only one augmented signal in memory at a time.

    Args:
        y_raw: Raw audio samples.
        sr: Sampling rate of ``y_raw``.
        label: Label attached to every successfully extracted feature vector.

    Returns:
        A tuple ``(X_aug, y_aug)``: the list of feature vectors and the list
        of labels of equal length. Variants that failed are omitted.
    """
    variant_builders = [
        lambda: y_raw,
        lambda: add_white_noise(y_raw),
        lambda: pitch_shift(y_raw, sr, 2),
        lambda: pitch_shift(y_raw, sr, -2),
        lambda: time_stretch(y_raw, 1.2),
        lambda: time_stretch(y_raw, 0.8),
        lambda: change_volume(y_raw, 5),
        lambda: change_volume(y_raw, -5),
    ]

    X_aug, y_aug = [], []
    for build_variant in variant_builders:
        try:
            feat = extract_features(build_variant(), sr)
        except Exception as e:
            # Best effort: report the failed variant and carry on with the rest.
            print(f"[!] 增強或特徵擷取失敗（{label}）: {e}")
            continue
        X_aug.append(feat)
        y_aug.append(label)

    return X_aug, y_aug


# Collect features and labels for the train and test splits.
X_train, y_train, X_test, y_test = [], [], [], []


def _collect_split(emotion_dir, filenames, label, X_out, y_out):
    """Load, augment and featurise ``filenames``, appending into the given lists.

    Args:
        emotion_dir: Directory containing the files.
        filenames: File names (relative to ``emotion_dir``) to process.
        label: Label passed to ``augment_and_extract`` for every file.
        X_out: List extended in place with the extracted feature vectors.
        y_out: List extended in place with the matching labels.
    """
    for filename in filenames:
        path = os.path.join(emotion_dir, filename)
        # sr=None keeps each file's native sampling rate.
        y_raw, sr = librosa.load(path, sr=None)
        X_aug, y_aug = augment_and_extract(y_raw, sr, label)
        X_out.extend(X_aug)
        y_out.extend(y_aug)


for emotion in EMOTIONS:
    emotion_dir = os.path.join(WAV_PATH, emotion)
    if not os.path.isdir(emotion_dir):
        continue

    # os.listdir() returns names in arbitrary order; sort them so that the
    # fixed random_state actually reproduces the same split on every machine.
    files = sorted(os.listdir(emotion_dir))
    # The split is done per file *before* augmentation, so variants of one
    # recording never end up on both sides of the split.
    train_files, test_files = train_test_split(
        files, test_size=0.2, random_state=42)

    _collect_split(emotion_dir, train_files, emotion, X_train, y_train)
    # NOTE(review): the test files are augmented as well, so the test metrics
    # are computed on augmented clips rather than on the raw recordings only.
    _collect_split(emotion_dir, test_files, emotion, X_test, y_test)


# Encode the string labels as integers; the encoder is fitted on the training
# labels only and then applied to the test labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)



In [4]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Build DMatrix objects (XGBoost's own data container).
dtrain = xgb.DMatrix(X_train, label=y_train_enc)
dtest = xgb.DMatrix(X_test, label=y_test_enc)

params = {
    'max_depth': 8,
    'eta': 0.05,
    'objective': 'multi:softmax',   # predict() returns the class index directly
    'num_class': len(set(y_train_enc)),
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'mlogloss',
    'seed': 42,
}

# The last entry of `evals` drives early stopping.
# NOTE(review): that entry is the test set, so the stopping point is tuned on
# the same data the report below is computed from; a separate validation split
# would give a less optimistic estimate.
evals = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=300, evals=evals, early_stopping_rounds=20)

# xgb.train() returns the booster from the *last* round, not the best one, and
# the native Booster.predict() uses every tree unless told otherwise. Restrict
# prediction to the best iteration found by early stopping when it is available.
best_iteration = getattr(bst, 'best_iteration', None)
if best_iteration is None:
    y_pred = bst.predict(dtest).astype(int)
else:
    y_pred = bst.predict(dtest, iteration_range=(0, best_iteration + 1)).astype(int)
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

[0]	train-mlogloss:2.13207	eval-mlogloss:2.22355
[1]	train-mlogloss:1.98544	eval-mlogloss:2.16293
[2]	train-mlogloss:1.85467	eval-mlogloss:2.09619
[3]	train-mlogloss:1.73770	eval-mlogloss:2.04361
[4]	train-mlogloss:1.63175	eval-mlogloss:1.98491
[5]	train-mlogloss:1.53666	eval-mlogloss:1.94131
[6]	train-mlogloss:1.45092	eval-mlogloss:1.90309
[7]	train-mlogloss:1.37122	eval-mlogloss:1.86404
[8]	train-mlogloss:1.29755	eval-mlogloss:1.82823
[9]	train-mlogloss:1.22828	eval-mlogloss:1.78827
[10]	train-mlogloss:1.16671	eval-mlogloss:1.75876
[11]	train-mlogloss:1.10856	eval-mlogloss:1.72184
[12]	train-mlogloss:1.05321	eval-mlogloss:1.69479
[13]	train-mlogloss:1.00186	eval-mlogloss:1.67023
[14]	train-mlogloss:0.95356	eval-mlogloss:1.64658
[15]	train-mlogloss:0.90919	eval-mlogloss:1.62792
[16]	train-mlogloss:0.86753	eval-mlogloss:1.60337
[17]	train-mlogloss:0.82779	eval-mlogloss:1.57889
[18]	train-mlogloss:0.79085	eval-mlogloss:1.55439
[19]	train-mlogloss:0.75495	eval-mlogloss:1.53747
[20]	train

# 模型保存

In [5]:
# Persist the trained booster to 'model.json'. Only the booster is written by
# this cell; the fitted LabelEncoder (`le`) is not saved here, so decoding
# predictions later still relies on `le` being present in the kernel.
bst.save_model('model.json')

# 測試 youtube 抓的貓叫聲

In [6]:
import warnings
# Silence UserWarning messages from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# Reload the booster that was saved to disk.
bst = xgb.Booster()
bst.load_model('model.json')

# Clips to classify; every file is read from the "test_wav" folder.
test_files = [
    "happy_cat_1.wav", "happy_cat_2.wav",
    "angry_cat_1.wav", "angry_cat_2.wav",
    "fighting_cat_1.wav", "fighting_cat_2.wav",
    "motherCall_cat_1.wav", "motherCall_cat_2.wav",
]

for file in test_files:
    path = os.path.join("test_wav", file)
    # sr=None keeps the clip's native sampling rate.
    y_raw, sr = librosa.load(path, sr=None)
    # Same feature extractor as used for training; no augmentation here.
    x = extract_features(y_raw, sr)
    # Wrap the single feature vector in a list to form a one-row DMatrix.
    dtest = xgb.DMatrix([x])
    pred = bst.predict(dtest)
    # Map the predicted class index back to its label with the encoder `le`
    # fitted in the feature-collection cell.
    predicted_emotion = le.inverse_transform([int(pred[0])])[0]
    print(f"檔案 {file} 的預測情緒: {predicted_emotion}")


檔案 happy_cat_1.wav 的預測情緒: Paining
檔案 happy_cat_2.wav 的預測情緒: Paining
檔案 angry_cat_1.wav 的預測情緒: Mating
檔案 angry_cat_2.wav 的預測情緒: Paining
檔案 fighting_cat_1.wav 的預測情緒: Fighting
檔案 fighting_cat_2.wav 的預測情緒: Fighting
檔案 motherCall_cat_1.wav 的預測情緒: Happy
檔案 motherCall_cat_2.wav 的預測情緒: Happy
