## 

In [None]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt

def audio_trim(wav, length, threshold=10):
    """Strip silence from *wav* and return a fixed-size excerpt of the rest.

    Args:
        wav: Audio samples, passed to ``librosa.effects.split``.
        length: Maximum number of samples to return.
        threshold: Passed to ``librosa.effects.split`` as ``top_db``.

    Returns:
        The non-silent intervals joined together, sliced to samples
        ``1 .. length`` (at most ``length`` samples).
    """
    intervals = librosa.effects.split(wav, top_db=threshold)
    voiced_chunks = [wav[start:end] for start, end in intervals]
    voiced = np.concatenate(voiced_chunks)
    # NOTE(review): the slice starts at index 1, so the very first non-silent
    # sample is skipped -- confirm whether that offset is intentional.
    return voiced[1:length + 1]

if __name__ == '__main__':
    # Visual check of audio_trim on a single recording: plot the raw
    # waveform first, then the silence-stripped excerpt.
    wav_file_path = '../audio/wang//test_data/20170001P00235A0066.wav'

    # Raw signal, loaded at 44.1 kHz (the returned sample rate is not needed).
    wav, _ = librosa.load(path=wav_file_path, sr=44100)
    plt.plot(wav)
    plt.show()

    # Excerpt of at most 20000 non-silent samples (default threshold).
    output_part = audio_trim(wav=wav, length=20000)
    print("shape", output_part.shape)
    plt.plot(output_part)
    plt.show()



In [None]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import sklearn.preprocessing



def audio_trim(wav, length, threshold=10):
    """Remove silent stretches from *wav* and keep a bounded excerpt.

    Args:
        wav: Audio samples, passed to ``librosa.effects.split``.
        length: Upper bound on the number of samples returned.
        threshold: Forwarded to ``librosa.effects.split`` as ``top_db``.

    Returns:
        Concatenation of the non-silent intervals, restricted to samples
        ``1 .. length`` (so at most ``length`` samples).
    """
    # Each row of `bounds` is a (start, end) sample index pair.
    bounds = librosa.effects.split(wav, top_db=threshold)
    joined = np.concatenate([wav[lo:hi] for lo, hi in bounds])
    # NOTE(review): index 0 of the joined signal is dropped by this slice --
    # confirm whether starting at 1 is intentional.
    return joined[1:length + 1]


if __name__ == '__main__':
    # Extract MFCC features for every training recording of every speaker and
    # plot a normalized sample every 60 files as a visual sanity check.

    # raw audio file folder path (one sub-folder per speaker)
    audio_folder_path = '../audio/'
    # mfcc data and labels files path (only used by the disabled np.save
    # calls at the end of this block)
    dataset_file_path = 'dataset.npy'
    label_file_path = 'labels.npy'

    # get all speakers; only directories count, because a stray file in the
    # audio folder would make the os.listdir() call in the loop below fail
    speakers = [
        entry for entry in os.listdir(audio_folder_path)
        if os.path.isdir(os.path.join(audio_folder_path, entry))
    ]
    print("\nspeakers:", speakers, "\nlen:", len(speakers))

    dataset = []
    labels = []

    for speaker in speakers:
        train_folder = os.path.join(audio_folder_path, speaker, 'train_data')
        cnt = 0
        for wav_file in os.listdir(train_folder):
            # Check the extension itself: a substring test would also accept
            # names such as 'x.wav.bak'.
            if not wav_file.endswith('.wav'):
                continue

            # load wav file, resampled to 16 kHz
            wav, sr = librosa.load(path=os.path.join(train_folder, wav_file), sr=16000)
            # keep at most 16384 non-silent samples (~1.02 s at 16 kHz)
            wav = audio_trim(wav=wav, length=16384, threshold=30)
            # extract mfcc feature
            mfcc = librosa.feature.mfcc(y=wav, sr=sr,
                                        window='hamming',
                                        win_length=512,
                                        hop_length=384,
                                        n_fft=2048,
                                        n_mels=32,
                                        n_mfcc=32)

            dataset.append(mfcc)
            labels.append(speaker)
            cnt += 1

            # Show every 60th recording of a speaker.
            if cnt % 60 == 0:
                # MinMaxScaler rescales each column of the matrix on its own.
                scaler = sklearn.preprocessing.MinMaxScaler()
                normalize_mfcc = scaler.fit_transform(mfcc)
                print(speaker, np.shape(mfcc))
                plt.imshow(np.square(normalize_mfcc))
                plt.show()

    # Saving is currently disabled:
    # np.save(dataset_file_path, np.array(dataset))
    # np.save(label_file_path, np.array(labels))
   


In [None]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import sklearn.preprocessing



def audio_trim(wav, length, threshold=10):
    """Drop silence from *wav* and return a length-limited slice.

    Args:
        wav: Audio samples, passed to ``librosa.effects.split``.
        length: Maximum number of samples in the result.
        threshold: Given to ``librosa.effects.split`` as ``top_db``.

    Returns:
        The non-silent intervals glued together and sliced to samples
        ``1 .. length`` (no more than ``length`` samples).
    """
    segments = []
    for begin, finish in librosa.effects.split(wav, top_db=threshold):
        segments.append(wav[begin:finish])
    speech = np.concatenate(segments)
    # NOTE(review): slicing from 1 discards the first non-silent sample --
    # confirm whether that is intended.
    return speech[1:length + 1]


def mfcc_normalize(mfcc: np.ndarray, scale: tuple = (0, 1)):
    """Linearly rescale all values of *mfcc* into the range given by *scale*.

    The minimum and maximum are taken over the whole array (not per row or
    per column), so a single linear mapping is applied to every element.

    Args:
        mfcc: Array to normalize. It is not modified.
        scale: ``(low, high)`` bounds of the output range.

    Returns:
        A new array with the same shape as *mfcc* whose smallest value maps
        to ``scale[0]`` and whose largest value maps to ``scale[1]``. When
        every element of *mfcc* is identical, all outputs are ``scale[0]``.

    Raises:
        ValueError: If *mfcc* is empty (no minimum/maximum exists).
    """
    lowest, highest = mfcc.min(), mfcc.max()
    span = highest - lowest
    # A constant input has zero span; dividing by it would yield NaN. Treat
    # the span as 1 instead so the result collapses to the lower bound.
    if span == 0:
        span = 1

    unit = (mfcc - lowest) / span
    return unit * (scale[1] - scale[0]) + scale[0]

if __name__ == '__main__':
    # Extract a small MFCC matrix for every training recording of every
    # speaker and display its min-max normalized, squared version.

    # raw audio file folder path (one sub-folder per speaker)
    audio_folder_path = '../audio/'
    # mfcc data and labels files path (only used by the disabled np.save
    # calls at the end of this block)
    dataset_file_path = 'dataset.npy'
    label_file_path = 'labels.npy'

    # get all speakers; only directories count, because a stray file in the
    # audio folder would make the os.listdir() call in the loop below fail
    speakers = [
        entry for entry in os.listdir(audio_folder_path)
        if os.path.isdir(os.path.join(audio_folder_path, entry))
    ]
    print("\nspeakers:", speakers, "\nlen:", len(speakers))

    # dataset = []
    # labels = []

    # One scaler instance is reused; fit_transform refits it for every file.
    scaler = sklearn.preprocessing.MinMaxScaler()
    # print all details of large array
    np.set_printoptions(threshold=np.inf)

    for speaker in speakers:
        train_folder = os.path.join(audio_folder_path, speaker, 'train_data')
        for wav_file in os.listdir(train_folder):
            # Check the extension itself: a substring test would also accept
            # names such as 'x.wav.bak'.
            if not wav_file.endswith('.wav'):
                continue

            # load wav file, resampled to 44.1 kHz
            wav, sr = librosa.load(path=os.path.join(train_folder, wav_file), sr=44100)
            # keep at most 8192 non-silent samples (~0.19 s at 44.1 kHz)
            wav = audio_trim(wav=wav, length=8192, threshold=30)
            # extract mfcc feature: 8 coefficients per frame, no row is dropped
            mfcc = librosa.feature.mfcc(y=wav, sr=sr,
                                        window='hamming',
                                        win_length=512,
                                        hop_length=384,
                                        n_fft=2048,
                                        n_mels=8,
                                        n_mfcc=8)

            print(speaker, np.shape(mfcc))
            # dataset.append(mfcc)
            # labels.append(speaker)
            # MinMaxScaler rescales each column of the matrix on its own.
            normalize_mfcc = scaler.fit_transform(mfcc)
            plt.imshow(np.square(normalize_mfcc))
            plt.show()

    # Saving is currently disabled:
    # np.save(dataset_file_path, np.array(dataset))
    # np.save(label_file_path, np.array(labels))
   


In [None]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import sklearn.preprocessing

if __name__ == '__main__':
    # Small experiment: compare a 3x3 matrix of square roots of 0..8 with
    # its min-max scaled counterpart.
    a = np.sqrt(np.arange(9)).reshape(3, 3)
    a_n = sklearn.preprocessing.minmax_scale(a)

    print(a)
    # Printed through np.abs, i.e. as magnitudes.
    print(np.abs(a_n))