In [35]:

# Extract acoustic features with librosa (spectral) and Praat via parselmouth (formants, HNR)
import librosa
import numpy as np
import parselmouth
from parselmouth.praat import call


class FeatureExtractor:
    """Extract spectral (librosa) and voice (Praat/parselmouth) features from one audio file.

    The analysis settings are exposed as class-level constants so they can be
    tuned by subclassing or by overriding the attribute on an instance; their
    values are the ones that used to be hard-coded in ``extract_features``.

    Args:
        sr: Sampling rate passed to ``librosa.load``.
        verbose: When True (the default, matching the historical behaviour),
            ``extract_features`` prints a one-line summary per feature.
    """

    # Upper and lower roll-off fractions passed to librosa's spectral_rolloff.
    ROLLOFF_MAX_PERCENT = 0.99
    ROLLOFF_MIN_PERCENT = 0.01

    # Settings forwarded to Praat's Burg formant analysis.
    # NOTE(review): Praat's manual recommends 5 formants together with a
    # 5500 Hz ceiling; confirm that analysing only 3 formants is intentional.
    FORMANT_TIME_STEP = 0.01
    FORMANT_MAX_NUMBER = 3
    FORMANT_CEILING_HZ = 5500

    # Positional arguments for Praat's "To Harmonicity (cc)" command
    # (presumably time step, minimum pitch, silence threshold and periods per
    # window, in that order -- verify against the Praat manual).
    HNR_ARGS = (0.01, 75, 0.1, 1.0)

    def __init__(self, sr=44100, verbose=True):
        self.sr = sr
        self.verbose = verbose

    def extract_features(self, audio_path):
        """Compute all features for a single audio file.

        The librosa features are computed on the signal loaded at ``self.sr``.
        The Praat features (formants, HNR) are computed on a separate
        ``parselmouth.Sound`` that is read directly from ``audio_path``.

        Args:
            audio_path: Path of the audio file (anything ``librosa.load`` and
                ``str()`` accept).

        Returns:
            ``(features, featurelist)`` where ``features`` is a tuple of the
            values and ``featurelist`` is the list of their names, in the same
            order: contrast, spec_cent, spec_bw, spec_rolloff,
            spec_rolloff_min, zcr, flatness, F1, F2, F3, hnr_value.
            The librosa arrays are returned exactly as librosa produced them
            (single-row 2-D arrays are NOT flattened; only the printed
            summary shows the flattened shape). F1-F3 are 1-D float arrays
            with NaN where the formant value is zero; hnr_value is the scalar
            returned by Praat's "Get mean".
        """
        y, sr = librosa.load(audio_path, sr=self.sr)

        # A single ordered mapping keeps names and values in lock-step
        # (dicts preserve insertion order).
        named = {
            'contrast': librosa.feature.spectral_contrast(y=y, sr=sr),
            'spec_cent': librosa.feature.spectral_centroid(y=y, sr=sr),
            'spec_bw': librosa.feature.spectral_bandwidth(y=y, sr=sr),
            'spec_rolloff': librosa.feature.spectral_rolloff(
                y=y, sr=sr, roll_percent=self.ROLLOFF_MAX_PERCENT),
            'spec_rolloff_min': librosa.feature.spectral_rolloff(
                y=y, sr=sr, roll_percent=self.ROLLOFF_MIN_PERCENT),
            'zcr': librosa.feature.zero_crossing_rate(y=y),
            'flatness': librosa.feature.spectral_flatness(y=y),
        }

        praatsound = parselmouth.Sound(str(audio_path))
        named['F1'], named['F2'], named['F3'] = self._formant_tracks(praatsound)
        named['hnr_value'] = self._mean_hnr(praatsound)

        featurelist = list(named)
        features = tuple(named.values())

        if self.verbose:
            for feat in features:
                summary = self._summary(feat)
                if summary is not None:
                    print(summary)

        return features, featurelist

    def _formant_tracks(self, sound):
        """Return the (F1, F2, F3) tracks of ``sound`` sampled at every formant frame time."""
        formant = sound.to_formant_burg(
            time_step=self.FORMANT_TIME_STEP,
            max_number_of_formants=self.FORMANT_MAX_NUMBER,
            maximum_formant=self.FORMANT_CEILING_HZ,
        )
        # Time axis of the formant frames.
        times = formant.xs()
        return tuple(
            self._zeros_to_nan([formant.get_value_at_time(number, t) for t in times])
            for number in (1, 2, 3)
        )

    def _mean_hnr(self, sound):
        """Return the mean of Praat's cross-correlation harmonicity of ``sound``."""
        harmonicity = call(sound, "To Harmonicity (cc)", *self.HNR_ARGS)
        # The (0, 0) arguments are passed through unchanged from the original
        # code; presumably they select the whole time range -- TODO confirm.
        return call(harmonicity, "Get mean", 0, 0)

    @staticmethod
    def _zeros_to_nan(values):
        """Convert ``values`` to a float array, replacing exact zeros with NaN.

        Zeros are treated as "undefined formant"; values that are already NaN
        are left as they are.
        """
        track = np.array(values, dtype=float)
        track[track == 0] = np.nan
        return track

    @staticmethod
    def _summary(feat):
        """Return what the verbose report prints for ``feat``.

        Floats are reported by value; arrays by shape, with a single-row 2-D
        array reported as if flattened to 1-D. Anything else yields None
        (nothing is printed). Rank is checked before indexing the shape so
        0-d arrays are handled instead of raising IndexError.
        """
        if isinstance(feat, float):
            return feat
        if isinstance(feat, np.ndarray):
            if feat.ndim == 2 and feat.shape[0] == 1:
                return feat.shape[1:]
            return feat.shape
        return None
    
    

In [40]:
if __name__ == '__main__':
    # Build the extractor; the actual extraction call is kept disabled below.
    feature_extractor = FeatureExtractor()
    # feature_extractor.extract_features('subj-2211_25_E_bulten_Egypte.wav_1.wav')

    # Load one stored feature file and show its content followed by its shape.
    feat = np.load(
        '/data/storage500/Turntaking/wavs_single_channel_normalized_nosil/PictureNaming-features/hnr_value/subj-2126_rekenmachine.png_1.wav_hnr_value.npy'
    )
    print(feat, feat.shape, sep='\n')

12.492227739289842
()
