In [10]:
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from keras.models import load_model

In [11]:
# Audio data augmentation: increases the number of samples and helps prevent overfitting
def noise(data):
    """Return a copy of ``data`` with random Gaussian noise added.

    The noise amplitude is drawn per call: a uniform random factor in [0, 1)
    times 3.5% of the largest sample value in ``data``. The input array is
    not modified.

    Args:
        data: 1-D NumPy array of audio samples.

    Returns:
        A new array of the same length as ``data``.
    """
    # Draw order matters for reproducibility under a fixed seed:
    # first the uniform amplitude factor, then the normal samples.
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    perturbation = np.random.normal(size=data.shape[0])
    return data + amplitude * perturbation

def stretch(data, rate=0.8):
    """Time-stretch an audio signal without changing its pitch.

    Thin wrapper around ``librosa.effects.time_stretch``.

    Args:
        data: 1-D NumPy array of audio samples.
        rate: Stretch factor forwarded to librosa. The default of 0.8 keeps
            the previous hard-coded behaviour (a rate below 1 slows the
            signal down, so the output is longer than the input).

    Returns:
        The stretched signal as returned by librosa.
    """
    # Pass the signal by keyword, matching how pitchs() calls librosa.
    return librosa.effects.time_stretch(y=data, rate=rate)

def shift(data):
    """Circularly shift ``data`` by a random number of samples.

    The offset is a uniform draw from [-5, 5) scaled by 1000 and truncated
    to an integer, so the roll is at most about 5000 samples in either
    direction. Samples pushed past one end wrap around to the other.

    Args:
        data: NumPy array of audio samples.

    Returns:
        The rolled array (same shape as ``data``).
    """
    draw = np.random.uniform(low=-5, high=5)
    offset = int(draw * 1000)
    return np.roll(data, offset)

def pitchs(data, sampling_rate, n_steps):
    """Shift the pitch of an audio signal.

    Thin wrapper around ``librosa.effects.pitch_shift``.

    Args:
        data: 1-D NumPy array of audio samples.
        sampling_rate: Sampling rate of ``data``, forwarded as ``sr``.
        n_steps: Size of the pitch shift, forwarded unchanged to librosa.

    Returns:
        The pitch-shifted signal as returned by librosa.
    """
    shifted = librosa.effects.pitch_shift(
        y=data,
        sr=sampling_rate,
        n_steps=n_steps,
    )
    return shifted


In [12]:
def extract_features(data, sampling_rate):
    """Compute a fixed-length feature vector for one audio signal.

    Only MFCCs are used: the MFCC matrix returned by librosa is transposed
    and averaged along axis 0, giving one mean value per coefficient.

    Args:
        data: 1-D NumPy array of audio samples.
        sampling_rate: Sampling rate of ``data``.

    Returns:
        1-D float64 NumPy array with one entry per MFCC coefficient.
    """
    # MFCC: mean of every coefficient across the transposed matrix's rows.
    mfcc_matrix = librosa.feature.mfcc(y=data, sr=sampling_rate)
    mfcc_mean = np.mean(mfcc_matrix.T, axis=0)

    # Start from an empty float array so further feature groups can be
    # appended horizontally in the same way.
    feature_parts = [np.array([]), mfcc_mean]

    # Disabled feature groups (kept for reference):
    # Root Mean Square Value
    # feature_parts.append(np.mean(librosa.feature.rms(y=data).T, axis=0))
    # MelSpectogram
    # feature_parts.append(np.mean(librosa.feature.melspectrogram(y=data, sr=sampling_rate).T, axis=0))

    return np.hstack(feature_parts)


In [59]:
# Collects the feature vectors extracted from one audio file (augmented variants included)
def get_features(path):
    """Load one audio file and return features for it and two augmented variants.

    Rows of the result, in order:
        0. the clip as loaded,
        1. the clip with random noise added (``noise``),
        2. the clip time-stretched (``stretch``) and then pitch-shifted
           (``pitchs`` with ``n_steps=0.7``).

    Args:
        path: Path of the audio file to load.

    Returns:
        2-D NumPy array with three rows, one ``extract_features`` vector each.
    """
    # Skip the first 0.6 s and keep 2.5 s. Per the original author's note,
    # duration/offset deal with the silent parts at the start and end of
    # each audio file.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # Original data
    original_feats = extract_features(data, sample_rate)

    # Data with noise added
    noisy_feats = extract_features(noise(data), sample_rate)

    # Stretched and pitch-shifted data
    stretched = stretch(data)
    stretched_and_pitched = pitchs(stretched, sample_rate, 0.7)
    warped_feats = extract_features(stretched_and_pitched, sample_rate)

    # Stack the three vectors as rows.
    return np.vstack((original_feats, noisy_feats, warped_feats))

In [65]:
# Build the inference batch: one feature vector per row returned by
# get_features (the original clip plus its two augmented variants).
X, Y = [], []
feature = get_features("soundData/test/su14.wav")
X.extend(feature)

In [66]:
# Wrap the list of feature vectors in a DataFrame: one row per vector, one column per feature.
Features = pd.DataFrame(X)

In [67]:
# Rebind X to the DataFrame's 2-D NumPy array (X was a Python list of vectors before this cell).
X = Features.values

In [68]:
# Sanity check: the saved output shows (3, 20), i.e. 3 rows (clip variants) x 20 features.
X.shape

(3, 20)

In [69]:
# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted here on the three rows being predicted (X),
# so its statistics come from this single clip's variants, not from a training
# set. If the model was trained on features scaled with a training-time scaler,
# that fitted scaler should be loaded and only .transform() applied — TODO confirm.
scaler = StandardScaler()
x = scaler.fit_transform(X)

In [70]:
# Scaling keeps the shape: the saved output shows (3, 20).
x.shape

(3, 20)

In [71]:
# Add a trailing axis of length 1: (samples, features) -> (samples, features, 1).
# Presumably the model expects this 3-D input layout — verify against the model.
# reshape (rather than expand_dims) keeps this cell idempotent: re-running it on
# an already 3-D x leaves the shape unchanged instead of appending another axis.
x = x.reshape(x.shape[0], x.shape[1], 1)

In [72]:
# Load the saved Keras model from an HDF5 file; the path is relative to the working directory.
model = load_model('sound_classifier_model.h5')

In [73]:
# Run the model on the whole batch (all rows of x) in one call.
# pred_test is later passed to encoder.inverse_transform, so it is expected to
# hold one score per emotion label for each input row — TODO confirm.
pred_test = model.predict(x)



In [74]:
# Emotion labels the encoder is fitted on (listed in alphabetical order).
# Presumably the same label set the model was trained with — confirm.
y = np.array([
    'anger',
    'disgust',
    'fear',
    'happiness',
    'neutral',
    'sadness',
])

In [75]:
# Fit a one-hot encoder on the label list (one label per row) so that it can
# later map rows of model output back to label strings.
# Y holds the dense one-hot matrix of the labels themselves; it is not read by
# any later cell visible here.
encoder = OneHotEncoder()
Y = encoder.fit_transform(
    np.asarray(y).reshape(-1, 1)
).toarray()

In [76]:
# Map each row of model output back to a label string.
# NOTE(review): pred_test holds model scores rather than exact one-hot rows, so
# this relies on inverse_transform selecting the highest-scoring column per row —
# confirm against the installed scikit-learn version.
y_pred = encoder.inverse_transform(pred_test)

# One predicted label per input row, in the same order as the rows of x.
df = pd.DataFrame({'Predicted Labels': y_pred.flatten()})

In [78]:
# Show the prediction for row 0, which is the un-augmented clip (get_features
# stacks the original clip's features first). .loc does the row/column lookup in
# one step instead of chained indexing.
# NOTE(review): the saved output under this cell lists all three rows, so it was
# evidently produced by a different version of this cell — re-run to refresh it.
df.loc[0, 'Predicted Labels']

0      sadness
1      neutral
2    happiness
Name: Predicted Labels, dtype: object