In [1]:
import numpy as np
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
import librosa
from scipy.signal import stft
import matplotlib.pyplot as plt
import scipy.fftpack as fft
import seaborn as sns
from sklearn.decomposition import PCA

In [2]:
class Model:
    """Two-component GMM gender classifier over per-utterance MFCC features.

    ``parameter`` is a positional list read as:

    0. pre-emphasis coefficient (stored, not used by any method of this class)
    1. sample rate in Hz used when loading audio
    2. analysis window length in milliseconds
    3. window shift (hop) in milliseconds
    4. number of MFCC coefficients
    5. training list file name (under ``data/``)
    6. test list file name (under ``data/``)
    7. test reference file name (under ``data/``), lines of ``<id> <gender>``

    ``random_state`` seeds the mixture model so repeated runs give the same
    clustering; pass ``None`` for the unseeded behaviour.

    NOTE(review): the mixture is fitted without labels, so which component
    ends up representing which gender is arbitrary. ``predict`` scores
    component 0 as "male" and component 1 as everything else; if the fit
    happens to swap them the reported accuracy is ``100 - true accuracy``.
    """

    _LENGTH_METHODS = ("max", "mean", "min")

    def __init__(self, parameter, random_state=0):
        self.pre_emphasis_coeff = parameter[0]
        self.frame_rate = parameter[1]
        # Milliseconds -> samples at the configured sample rate.
        self.window_size = int(parameter[2] * self.frame_rate / 1000)
        self.window_shift = int(parameter[3] * self.frame_rate / 1000)
        self.n_mfcc = parameter[4]
        self.train_file_name = parameter[5]
        self.test_file_name = parameter[6]
        self.test_ref_file_name = parameter[7]

        # Filled in by set_len(); fit()/predict() fall back to set_len() if unset.
        self.target_length = None

        self.gmm = GaussianMixture(n_components=2, random_state=random_state)

    @staticmethod
    def _read_entries(path):
        """Return the non-blank lines of ``path`` with surrounding whitespace removed."""
        with open(path, 'r') as file:
            stripped = (line.strip() for line in file)
            return [line for line in stripped if line]

    def set_len(self, method="mean"):
        """Choose the common waveform length (in samples) from the training set.

        Args:
            method: ``"max"``, ``"mean"`` or ``"min"`` of the training
                utterance lengths. The mean is floored to an integer.

        Raises:
            ValueError: if ``method`` is not one of the supported names, or
                the training list contains no utterances.
        """
        if method not in self._LENGTH_METHODS:
            raise ValueError(
                "method must be one of %r, got %r" % (self._LENGTH_METHODS, method)
            )

        lengths = []
        for file_path in self._read_entries("data/" + self.train_file_name):
            wav, _ = librosa.load("data/wav16k/train/" + file_path + ".wav",
                                  sr=self.frame_rate, mono=True)
            lengths.append(len(wav))

        if not lengths:
            raise ValueError("no utterances listed in data/" + self.train_file_name)

        if method == "max":
            self.target_length = max(lengths)
        elif method == "min":
            self.target_length = min(lengths)
        else:
            # Average over the files actually read, not a fixed corpus size.
            self.target_length = sum(lengths) // len(lengths)

    def getMfcc(self, wav):
        """Convert one waveform into a single feature row.

        The MFCC matrix is computed with the sample rate, window, hop and
        coefficient count configured in ``__init__``, min-max scaled over the
        whole matrix, and reduced with a PCA fitted on this utterance alone.

        Returns:
            The transposed PCA output. With librosa's documented
            ``(n_mfcc, frames)`` MFCC layout this is a ``(1, n_mfcc)`` array.
        """
        mfcc = librosa.feature.mfcc(y=wav,
                                    sr=self.frame_rate,
                                    n_fft=self.window_size,
                                    hop_length=self.window_shift,
                                    n_mfcc=self.n_mfcc)

        min_val = np.min(mfcc)
        max_val = np.max(mfcc)
        value_range = max_val - min_val
        if value_range > 0:
            mfcc = (mfcc - min_val) / value_range
        else:
            # Constant matrix: avoid 0/0 producing NaNs that PCA would reject.
            mfcc = np.zeros_like(mfcc)

        # Number of dimensions to reduce to.
        n_components = 1
        pca = PCA(n_components=n_components)

        return pca.fit_transform(mfcc).T

    def _utterance_features(self, split, list_file_name):
        """Return one getMfcc() result per utterance listed in ``data/<list_file_name>``."""
        if getattr(self, "target_length", None) is None:
            self.set_len()

        features = []
        for file_path in self._read_entries("data/" + list_file_name):
            wav, _ = librosa.load("data/wav16k/" + split + "/" + file_path + ".wav",
                                  sr=self.frame_rate, mono=True)
            # Pad or trim so every utterance yields the same number of frames.
            wav = librosa.util.fix_length(wav, size=self.target_length)
            features.append(self.getMfcc(wav))
        return features

    def fit(self):
        """Fit the mixture model on the features of every training utterance.

        Raises:
            ValueError: if the training list contains no utterances.
        """
        rows = self._utterance_features("train", self.train_file_name)
        if not rows:
            raise ValueError("no utterances listed in data/" + self.train_file_name)

        self.gmm.fit(np.vstack(rows))

    def predict(self):
        """Score the fitted model against the test reference file.

        Prints the accuracy as a percentage followed by the number of correct
        predictions, and returns the percentage.

        Raises:
            ValueError: if the test list is empty or its length differs from
                the number of reference lines.
        """
        X_test = self._utterance_features("test", self.test_file_name)
        references = self._read_entries("data/" + self.test_ref_file_name)

        if not X_test:
            raise ValueError("no utterances listed in data/" + self.test_file_name)
        if len(references) != len(X_test):
            raise ValueError(
                "test list has %d utterances but reference file has %d lines"
                % (len(X_test), len(references))
            )

        rate = 0
        for features, txt in zip(X_test, references):
            _, gender = txt.split()

            # One feature row per utterance, so predict() yields one label.
            label = int(self.gmm.predict(features)[0])

            # See the class docstring: this component/gender pairing is assumed.
            expected = 0 if gender == "male" else 1
            if label == expected:
                rate += 1

        accuracy = 100 * rate / len(X_test)
        print(accuracy)
        print(rate)
        return accuracy

In [3]:
# Candidate values for each positional slot that Model.__init__ indexes
# (coefficient, sample rate, window, shift, n_mfcc, then the three file names).
parameters = [
    [0.97],
    [16000],
    [25],
    [10],
    [*range(10, 101)],
    ["fmcc_train.ctl"],
    ["fmcc_test.ctl"],
    ["fmcc_test_ref.txt"],
]

In [4]:
# Train and score one Model per MFCC coefficient count in the range below.
for i in range(40, 41):
    # Positional settings in the order Model.__init__ reads them.
    parameter = [0.97, 16000, 25, 10, i]
    parameter += ["fmcc_train.ctl", "fmcc_test.ctl", "fmcc_test_ref.txt"]

    print(f"n_mfcc:{i}")

    model = Model(parameter)
    model.set_len("mean")  # pad/trim every utterance to the mean training length
    model.fit()
    model.predict()

n_mfcc:13
27.88888888888889
251
n_mfcc:14
27.88888888888889
251
n_mfcc:15
71.88888888888889
647
n_mfcc:16
71.55555555555556
644
n_mfcc:17
72.22222222222223
650
n_mfcc:18
71.11111111111111
640
n_mfcc:19
70.88888888888889
638
n_mfcc:20
71.22222222222223
641
n_mfcc:21
71.44444444444444
643
n_mfcc:22
28.88888888888889
260
n_mfcc:23
29.0
261
n_mfcc:24
29.0
261
n_mfcc:25
70.88888888888889
638
n_mfcc:26
71.33333333333333
642
n_mfcc:27
28.444444444444443
256
n_mfcc:28
71.77777777777777
646
n_mfcc:29
28.11111111111111
253
n_mfcc:30
71.77777777777777
646
n_mfcc:31
71.44444444444444
643
n_mfcc:32
71.33333333333333
642
n_mfcc:33
28.333333333333332
255
n_mfcc:34
71.33333333333333
642
n_mfcc:35
28.88888888888889
260
n_mfcc:36
71.66666666666667
645
n_mfcc:37
71.55555555555556
644
n_mfcc:38
71.44444444444444
643
n_mfcc:39
27.88888888888889
251
n_mfcc:40
27.666666666666668
249
n_mfcc:41
72.55555555555556
653
n_mfcc:42
72.88888888888889
656
n_mfcc:43
72.66666666666667
654
n_mfcc:44
72.88888888888889
656

In [5]:
import librosa
import soundfile as sf

# Load the speech sample.
audio, sr = librosa.load(
    "data/wav16k/train/FCJY0/FCJY0_pbw1001.wav",
    sr=16000,
)

# Pitch-shift amount: positive values raise the pitch, negative values lower it.
pitch_shift = 2

# Apply the pitch shift.
shifted_audio = librosa.effects.pitch_shift(
    y=audio,
    n_steps=pitch_shift,
    sr=sr,
)

# Save the modified audio to a file.
sf.write("output.wav", shifted_audio, samplerate=sr)

In [6]:
import librosa
import numpy as np
from sklearn.decomposition import PCA

# Load the speech sample as mono audio at 16 kHz.
audio_path = "data/wav16k/train/FCJY0/FCJY0_pbw1001.wav"
y, sr = librosa.load(audio_path, mono=True, sr=16000)

# Compute the mel-scaled spectrogram.
mel_spectrogram = librosa.feature.melspectrogram(sr=sr, y=y)

# Convert to a log (dB) scale, using the spectrogram's maximum as the reference.
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

# Number of dimensions to reduce to.
n_components = 1
pca = PCA(n_components=n_components)

# Apply PCA to the transposed spectrogram, so its columns become the samples.
data = pca.fit_transform(np.transpose(log_mel_spectrogram))

# Check the result.
print(data.shape)

(31, 1)


In [72]:
import librosa
import numpy as np


# Load the speech sample as mono audio at 16 kHz.
audio_path = "data/wav16k/train/FCJY0/FCJY0_pbw1001.wav"
y, sr = librosa.load(audio_path, mono=True, sr=16000)

# Compute the mel-scaled spectrogram.
mel_spectrogram = librosa.feature.melspectrogram(sr=sr, y=y)

# Convert to a log (dB) scale, using the spectrogram's maximum as the reference.
log_mel_spectrogram = librosa.power_to_db(
    mel_spectrogram,
    ref=np.max,
)

# Check the result.
print(np.shape(log_mel_spectrogram))

(128, 31)
