In [None]:
import pandas as pd

In [None]:
# Transcript/label table, one "|"-separated `id|text|label` record per line.
path = "/home/tuyendv/Desktop/speech_emotion_recognition_v1/datas/tth_label.txt"

# The file is read as header-less (column names are supplied here) and every
# column is kept as `str` so values are not reinterpreted as numbers.
data_df = pd.read_csv(
    path,
    sep="|",
    header=None,
    names=["id", "text", "label"],
    dtype={"id": str, "text": str, "label": str},
)

In [None]:
# Quick sanity check: show the first five parsed rows.
data_df.head(n=5)

In [None]:
# Word count of each transcript (number of whitespace-separated tokens).
# `read_csv` leaves empty text fields as NaN even with dtype=str, and the former
# `lambda x: len(x.split())` raised AttributeError on such rows. Missing text is
# now treated as an empty string, i.e. it counts as zero words.
data_df["length"] = data_df["text"].fillna("").str.split().str.len()

In [None]:
# For every label keep at most the 475 longest transcripts. The frame is sorted
# by "length" (descending) before grouping, and groupby keeps the row order
# inside each group, so the first 475 rows of a group are its longest ones.
news_data = [
    rows.iloc[:475]
    for _, rows in data_df.sort_values(by="length", ascending=False).groupby("label")
]

In [None]:
# Write the per-label subset to the working directory, "|"-separated, without
# the DataFrame index and without the helper "length" column.
# NOTE(review): a header row is written (to_csv default) although the input file
# was read as header-less — confirm downstream readers expect a header line.
pd.concat(news_data).to_csv(
    "tth_label.txt",
    sep="|",
    columns=["id", "text", "label"],
    index=False,
)

In [None]:
import librosa
import numpy as np
from pydub import AudioSegment
import soundfile as sf
from src.utils import padding_audio

In [None]:
class Data_Preprocessor():
    """Load an audio file, resample it to the project rate and pad it.

    Args:
        data_config: Mapping providing the dataset's own ``"sample_rate"``.
        general_config: Mapping whose ``"general"`` section provides
            ``"sample_rate"`` (the target rate) and ``"max_signal_duration"``
            (presumably in seconds — confirm against the config file).

    Attributes are set in ``__init__``:
        max_signal_length: ``max_signal_duration * sample_rate`` as an ``int``.
        origin_sample_rate: Rate the file is loaded at.
        sample_rate: Rate the waveform is resampled to.
    """

    def __init__(self, data_config, general_config):
        self.data_config = data_config
        self.general_config = general_config

        general = general_config["general"]
        # The product is a number of samples, so it has to be an integer: a
        # fractional duration (e.g. 2.5) would otherwise yield a float, which
        # cannot be used as an array size or slice bound.
        self.max_signal_length = int(
            round(general["max_signal_duration"] * general["sample_rate"])
        )
        self.origin_sample_rate = data_config["sample_rate"]
        self.sample_rate = general["sample_rate"]

    def preprocess(self, path):
        """Return the waveform at ``path`` resampled and passed through ``padding_audio``.

        Args:
            path: Audio file handed to ``librosa.load``.

        Returns:
            Whatever ``padding_audio`` returns for the resampled waveform and
            ``max_signal_length``.
        """
        # Decode at the dataset's declared rate; the rate librosa reports back
        # is the one requested, so it is discarded.
        wav, _ = librosa.load(path, sr=self.origin_sample_rate)

        wav = librosa.resample(wav, orig_sr=self.origin_sample_rate,
                               target_sr=self.sample_rate)
        wav = padding_audio(wav, max_signal_length=self.max_signal_length)

        return wav

In [None]:
def extract_mfcc(wav, config):
    """Compute MFCCs for ``wav`` with settings taken from a flat ``config``.

    Args:
        wav: Waveform passed to ``librosa.feature.mfcc`` as ``y``.
        config: Mapping with ``sample_rate``, ``hop_length``, ``win_length``,
            ``n_mfcc``, ``fmax`` and ``fmin``; each value is cast with ``int()``.

    Returns:
        The result of ``librosa.feature.mfcc``.
    """
    # config key -> librosa keyword; only the sample rate is spelled differently.
    keyword_for = {
        "sample_rate": "sr",
        "hop_length": "hop_length",
        "win_length": "win_length",
        "n_mfcc": "n_mfcc",
        "fmax": "fmax",
        "fmin": "fmin",
    }
    settings = {keyword: int(config[key]) for key, keyword in keyword_for.items()}
    return librosa.feature.mfcc(y=wav, **settings)

In [None]:
class Feature_Extractor():
    """Compute librosa features from a waveform using a sectioned config.

    ``extract_mfcc`` reads the ``"mfcc"`` section of ``config`` and
    ``extract_mel_spectrogram`` reads the ``"mel"`` section; every setting is
    cast with ``int()`` before it is handed to librosa.
    """

    def __init__(self, config) -> None:
        self.config = config

    def _int_settings(self, section, keys):
        """Return ``{key: int(self.config[section][key])}`` for ``keys``, in order."""
        return {key: int(self.config[section][key]) for key in keys}

    def extract_mfcc(self, wav):
        """Return ``librosa.feature.mfcc`` of ``wav`` using the ``"mfcc"`` settings."""
        settings = self._int_settings(
            "mfcc",
            ("sample_rate", "hop_length", "win_length", "n_mfcc", "fmax", "fmin"),
        )
        # librosa calls the sample rate `sr`; the other names match the config keys.
        sample_rate = settings.pop("sample_rate")
        return librosa.feature.mfcc(y=wav, sr=sample_rate, **settings)

    def extract_mel_spectrogram(self, wav):
        """Return the mel spectrogram of ``wav`` converted with ``librosa.power_to_db``."""
        settings = self._int_settings(
            "mel",
            ("sample_rate", "hop_length", "win_length", "n_mels", "fmax", "fmin"),
        )
        sample_rate = settings.pop("sample_rate")
        power_spectrogram = librosa.feature.melspectrogram(
            y=wav, sr=sample_rate, **settings
        )
        return librosa.power_to_db(power_spectrogram)

from yaml.loader import SafeLoader
import yaml

# Parse the shared YAML settings with the safe loader (plain data types only)
# and build the extractor that the cells below use.
with open("configs/general_config.yml", "r") as f:
    config = yaml.safe_load(f)

feature_extractor = Feature_Extractor(config)

In [None]:
# Single utterance used for the padding / MFCC comparison in the following cells.
path = "/home/tuyendv/Desktop/speech_emotion_recognition/ess_dataset/tth/wavs/000000.wav"

# Decode at 22.05 kHz; `sr` is the sample rate librosa reports for `wav`.
wav, sr = librosa.load(path=path, sr=22050)


In [None]:
# Run the project helper `padding_audio` on the loaded waveform with a length
# argument of 4 * sr samples.
# NOTE(review): presumably the second positional argument is the
# `max_signal_length` that Data_Preprocessor.preprocess passes by keyword —
# confirm against src.utils.padding_audio.
new_wav = padding_audio(wav, 4 * sr)

In [None]:
# Save the loaded waveform to the working directory at 22.05 kHz.
# NOTE(review): this writes `wav` (the signal before padding_audio), not
# `new_wav` — confirm that is the one meant to be exported.
sf.write(file="000000.wav", data=wav, samplerate=22050)

In [None]:
# Run the same extractor on both signals (output of padding_audio first, then
# the waveform as loaded) so their MFCCs can be compared in the plots.
new_mfcc, mfcc = (
    feature_extractor.extract_mfcc(signal) for signal in (new_wav, wav)
)

In [None]:
import matplotlib.pyplot as plt

# `import librosa` alone does not expose `librosa.display` on older librosa
# releases, so import the submodule explicitly before using it.
import librosa.display

# MFCCs of `new_wav` (the signal returned by padding_audio).
# NOTE(review): with x_axis='time' specshow converts frames to seconds using its
# own default sr / hop_length unless they are passed — confirm they match the
# values used for extraction.
librosa.display.specshow(new_mfcc, x_axis='time')
plt.colorbar()
plt.title('mfcc')
# Lay out after the title is set so the title is part of the layout pass.
plt.tight_layout()
# The original `plt.show` (no parentheses) only referenced the function and
# never rendered the figure explicitly; call it.
plt.show()

In [None]:
import matplotlib.pyplot as plt

# `import librosa` alone does not expose `librosa.display` on older librosa
# releases, so import the submodule explicitly before using it.
import librosa.display

# MFCCs of `wav` (the waveform as loaded, before padding_audio).
# NOTE(review): with x_axis='time' specshow converts frames to seconds using its
# own default sr / hop_length unless they are passed — confirm they match the
# values used for extraction.
librosa.display.specshow(mfcc, x_axis='time')
plt.colorbar()
plt.title('mfcc')
# Lay out after the title is set so the title is part of the layout pass.
plt.tight_layout()
# The original `plt.show` (no parentheses) only referenced the function and
# never rendered the figure explicitly; call it.
plt.show()