In [1]:
import librosa
import numpy as np
import parselmouth

def extract_rms(y):
    """Compute the frame-wise RMS (loudness) envelope of an audio signal.

    Args:
        y: Audio samples, forwarded unchanged to ``librosa.feature.rms``.

    Returns:
        Whatever ``librosa.feature.rms`` returns for ``y`` using librosa's
        default frame and hop lengths.
    """
    frame_rms = librosa.feature.rms(y=y)
    return frame_rms

def extract_pitch(y, sr):
    """Estimate the fundamental frequency (pitch) track with pYIN.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz. It must be forwarded to
            ``librosa.pyin``; otherwise librosa assumes its default rate and
            the estimated frequencies are wrong for audio loaded at its
            native rate (``analyze_audio`` loads with ``sr=None``).

    Returns:
        The ``f0`` array returned by ``librosa.pyin`` (first element of its
        result). The voiced flag and voiced probabilities are discarded.
    """
    # Search range: C2 to C7, converted from note names to Hz.
    f0, _voiced_flag, _voiced_probs = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )
    return f0

def extract_tempo(y, sr):
    """Estimate the tempo of the signal (used here as a speaking-rate proxy).

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        The first element of the ``librosa.beat.beat_track`` result (the
        tempo estimate); the beat positions are discarded.
    """
    beat_result = librosa.beat.beat_track(y=y, sr=sr)
    return beat_result[0]

def extract_mfccs(y, sr, n_mfcc=13):
    """Compute Mel-frequency cepstral coefficients (MFCCs).

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
        n_mfcc: Number of coefficients to request (default 13).

    Returns:
        The result of ``librosa.feature.mfcc`` for the given arguments.
    """
    coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return coefficients

def extract_formants(sound, time_point=0.5):
    """Read the first two formants (F1, F2) at a single point in time.

    Args:
        sound: Object exposing ``to_formant_burg()`` (presumably a
            ``parselmouth.Sound`` -- confirm against the caller).
        time_point: Time at which the formants are sampled (default 0.5).

    Returns:
        A ``(f1, f2)`` tuple with the values of formant 1 and formant 2 at
        ``time_point``.
    """
    formant_track = sound.to_formant_burg()
    # Query formant numbers 1 and 2, in that order, at the same instant.
    return tuple(
        formant_track.get_value_at_time(number, time_point) for number in (1, 2)
    )

def extract_hnr(sound):
    """Extract the harmonics-to-noise ratio (HNR) values.

    Args:
        sound: Object exposing ``to_harmonicity_cc()`` (presumably a
            ``parselmouth.Sound`` -- confirm against the caller).

    Returns:
        The ``values`` attribute of the harmonicity object produced by the
        cross-correlation analysis.
    """
    return sound.to_harmonicity_cc().values

def extract_jitter(sound):
    """Extract local jitter through Praat.

    ``parselmouth.Sound`` has no ``to_jitter()`` method; jitter is a query
    on a PointProcess (glottal pulse positions), so the measurement is run
    through ``parselmouth.praat.call``.

    Args:
        sound: A ``parselmouth.Sound``.

    Returns:
        The value returned by Praat's "Get jitter (local)" query.
    """
    # Pitch floor/ceiling (Hz) for pulse detection: Praat's standard 75-600.
    point_process = parselmouth.praat.call(
        sound, "To PointProcess (periodic, cc)", 75, 600
    )
    # Arguments: time range (0, 0 = whole sound), shortest period,
    # longest period, maximum period factor -- Praat's standard settings.
    return parselmouth.praat.call(
        point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3
    )

def extract_shimmer(sound):
    """Extract local shimmer through Praat.

    ``parselmouth.Sound`` has no ``to_shimmer()`` method; shimmer is a query
    on a Sound together with its PointProcess (glottal pulse positions), so
    the measurement is run through ``parselmouth.praat.call``.

    Args:
        sound: A ``parselmouth.Sound``.

    Returns:
        The value returned by Praat's "Get shimmer (local)" query.
    """
    # Pitch floor/ceiling (Hz) for pulse detection: Praat's standard 75-600.
    point_process = parselmouth.praat.call(
        sound, "To PointProcess (periodic, cc)", 75, 600
    )
    # Arguments: time range (0, 0 = whole sound), shortest period,
    # longest period, maximum period factor, maximum amplitude factor.
    return parselmouth.praat.call(
        [sound, point_process],
        "Get shimmer (local)",
        0, 0, 0.0001, 0.02, 1.3, 1.6,
    )

def extract_energy(y):
    """Compute an energy feature from the squared frame-wise RMS.

    Args:
        y: Audio samples, forwarded to ``librosa.feature.rms``.

    Returns:
        The squared RMS values summed over the first axis of the RMS array.
    """
    frame_rms = librosa.feature.rms(y=y)
    squared_rms = np.square(frame_rms)
    return np.sum(squared_rms, axis=0)

def extract_pauses(y, sr, threshold=0.2, top_db=20):
    """Measure the pauses (silent gaps) between stretches of speech.

    ``librosa.effects.split`` returns the NON-silent intervals as
    ``[start, end]`` sample indices. A pause is therefore the gap between
    the end of one such interval and the start of the next, not the length
    of the interval itself.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz, used to convert samples to seconds.
        threshold: Minimum gap duration in seconds for a gap to count as a
            pause (default 0.2).
        top_db: Silence threshold in dB below the reference, forwarded to
            ``librosa.effects.split`` (default 20).

    Returns:
        List of pause durations in seconds, in chronological order. Silence
        before the first and after the last non-silent interval is not
        counted. Empty when fewer than two non-silent intervals are found.
    """
    intervals = librosa.effects.split(y, top_db=top_db)
    pauses = []
    # Pair each interval with its successor; the gap between them is silence.
    for (_, previous_end), (next_start, _) in zip(intervals[:-1], intervals[1:]):
        duration = (next_start - previous_end) / sr
        if duration > threshold:
            pauses.append(duration)
    return pauses

def analyze_audio(filename):
    """Run every enabled feature extractor on one audio file.

    Args:
        filename: Path of the audio file to load.

    Returns:
        Dict mapping feature name ("RMS", "Pitch", "Tempo", "MFCCs",
        "Energy", "Pauses") to the corresponding extractor's result.
    """
    # sr=None is passed so librosa does not resample to its default rate.
    signal, sample_rate = librosa.load(filename, sr=None)

    # The parselmouth-based features are currently disabled:
    #   sound = parselmouth.Sound(filename)
    #   "Formants": extract_formants(sound)
    #   "HNR": extract_hnr(sound)
    #   "Jitter": extract_jitter(sound)
    #   "Shimmer": extract_shimmer(sound)
    features = {}
    features["RMS"] = extract_rms(signal)
    features["Pitch"] = extract_pitch(signal, sample_rate)
    features["Tempo"] = extract_tempo(signal, sample_rate)
    features["MFCCs"] = extract_mfccs(signal, sample_rate)
    features["Energy"] = extract_energy(signal)
    features["Pauses"] = extract_pauses(signal, sample_rate)
    return features

In [2]:
# Use analyze_audio to extract the features from one audio file.
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this exact file exists.
filename = r'D:\data_analysis\speech_emotion_recognition\data\EnglishDataset\cleaned_data\Angry\03-01-05-01-01-01-02.wav'
features = analyze_audio(filename)

# Print every feature name with its full value.
for feature_name, feature_value in features.items():
    print(f"{feature_name}: {feature_value}")

RMS: [[0.00469401 0.00631595 0.00734788 0.00791246 0.00714537 0.00648255
  0.00586042 0.00553925 0.00614767 0.00996364 0.01526173 0.01824455
  0.02165338 0.02330492 0.02301964 0.02483593 0.02416617 0.02154299
  0.02002836 0.01481367 0.01030054 0.00930837 0.00606757 0.00707394
  0.00791785 0.00916368 0.01087847 0.01186182 0.01316577 0.01314981
  0.01169918 0.01604163 0.02823734 0.0326811  0.03554827 0.03439701
  0.02513035 0.01857407 0.01203219 0.00782297 0.00470785 0.00373214
  0.00339439 0.00906373 0.01037134 0.01056328 0.01047472 0.00669774
  0.00536378 0.00524655 0.00548915 0.005117   0.00440752 0.00439834
  0.00557872 0.00590888 0.01097656 0.02434115 0.03759502 0.0496772
  0.05771474 0.06003859 0.05597588 0.04689348 0.03597374 0.0236122
  0.01421986 0.00895184 0.00631323 0.00552647 0.00457383 0.00419249
  0.00398388 0.0035524  0.0064084  0.0091604  0.01157967 0.01380642
  0.01461866 0.01492562 0.01468489 0.01538001 0.01648892 0.01747195
  0.01887148 0.01829221 0.01606839 0.0141907 

In [3]:
# Use analyze_audio to extract the features from one audio file.
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this exact file exists. This cell rebinds `filename` and `features`.
filename = r'D:\data_analysis\speech_emotion_recognition\data\EnglishDataset\cleaned_data\Sad\03-01-04-01-01-01-09.wav'
features = analyze_audio(filename)

# Print every feature name with its full value.
for feature_name, feature_value in features.items():
    print(f"{feature_name}: {feature_value}")


RMS: [[0.00019039 0.0002873  0.00032018 0.00032802 0.00042261 0.00111325
  0.00177    0.00233887 0.00287165 0.00301258 0.00278999 0.00243862
  0.00188013 0.00137526 0.00136923 0.00124756 0.00115004 0.00108977
  0.00101856 0.00128527 0.00157387 0.00186298 0.00202152 0.00197201
  0.00177844 0.00144487 0.00102782 0.00069103 0.00043431 0.00037685
  0.00067342 0.00077888 0.00081181 0.00080474 0.00068481 0.00095309
  0.00122314 0.00160181 0.00202023 0.00206171 0.00213375 0.00209992
  0.00183903 0.00175425 0.00148273 0.00119365 0.00092492 0.00063789
  0.0005535  0.00037429 0.00023486 0.00019479 0.00029133 0.00030583
  0.00037569 0.00055413 0.0005638  0.00069323 0.00073035 0.00062545
  0.00060618 0.0005418  0.00051472 0.00054437 0.00058042 0.0005733
  0.00052841 0.00049712 0.0004323  0.00038262 0.00046024 0.00051467
  0.00060175 0.0006731  0.00066883 0.00066844 0.00067259 0.00060009
  0.00058158 0.00064398 0.00058208 0.00054173 0.00048478 0.00027596
  0.00017239 0.00018221 0.00029784 0.0003890

In [4]:
def extract_rms_features(y, frame_length=512, hop_length=256):
    """Summarise the frame-wise RMS energy with five statistics.

    Args:
        y: Audio samples.
        frame_length: Analysis frame length passed to ``librosa.feature.rms``
            (default 512).
        hop_length: Hop length passed to ``librosa.feature.rms``
            (default 256).

    Returns:
        Tuple ``(mean, variance, max, min, median)`` computed over the first
        row of the RMS array.
    """
    # Index 0 selects the first row of the array returned by librosa.
    frame_rms = librosa.feature.rms(
        y=y, frame_length=frame_length, hop_length=hop_length
    )[0]
    # Order matters: callers unpack mean, variance, max, min, median.
    summary_stats = (np.mean, np.var, np.max, np.min, np.median)
    return tuple(stat(frame_rms) for stat in summary_stats)

# Load one audio file; sr=None is passed so librosa does not resample.
# NOTE(review): absolute, machine-specific path.
filename = r'D:\data_analysis\speech_emotion_recognition\data\EnglishDataset\cleaned_data\Sad\03-01-04-01-01-01-09.wav'
y, sr = librosa.load(filename, sr=None)
# Unpack the five RMS statistics and print them on one line.
rms_mean, rms_variance, rms_max, rms_min, rms_median = extract_rms_features(y)
print(rms_mean, rms_variance, rms_max, rms_min, rms_median)

0.000749678 4.612485e-07 0.0033959572 8.946258e-06 0.00050894753
