In [5]:
import numpy as np
from sklearn.mixture import GaussianMixture
import librosa

In [6]:
class Model:
    """Two-component Gaussian mixture over per-frame MFCCs, scored against gender labels.

    The constructor takes one positional ``parameter`` sequence:

    ====  =======================================================
    slot  meaning
    ====  =======================================================
    0     pre-emphasis coefficient
    1     sampling rate passed to ``librosa.load`` (Hz)
    2     analysis window size in milliseconds
    3     analysis window shift in milliseconds
    4     number of MFCC coefficients
    5     training list file name (under ``data/``)
    6     test list file name (under ``data/``)
    7     test reference file name (under ``data/``)
    ====  =======================================================

    Typical use: ``set_len()`` -> ``fit()`` -> ``predict()``.

    NOTE(review): the mixture is fitted without labels, so which component index
    ends up corresponding to "male" is arbitrary. ``predict`` treats component 1
    as male, exactly as before; an accuracy near 0.5 (or below it) may simply
    reflect that mapping. Confirm whether per-gender training labels are
    available before relying on the reported number.
    """

    # Number of mel bands in the intermediate spectrogram; also the upper bound
    # on how many MFCC coefficients can be derived from it.
    N_MELS = 40

    def __init__(self, parameter):
        """Unpack the positional ``parameter`` sequence (see class docstring)."""
        self.pre_emphasis_coeff = parameter[0]
        self.frame_rate = parameter[1]
        # Milliseconds -> samples at the configured sampling rate.
        self.window_size = int(parameter[2] * self.frame_rate / 1000)
        self.window_shift = int(parameter[3] * self.frame_rate / 1000)
        self.n_mfcc = parameter[4]
        self.train_file_name = parameter[5]
        self.test_file_name = parameter[6]
        self.test_ref_file_name = parameter[7]

        self.len_max = 0
        self.len_mean = 0
        # Filled in by set_len(); fit()/predict() refuse to run while it is None.
        self.target_length = None

        # Fixed seed so that repeated runs give the same clustering.
        self.gmm = GaussianMixture(n_components=2, random_state=0)

    @staticmethod
    def _read_entries(path):
        """Return the non-blank lines of ``path`` with surrounding whitespace removed."""
        with open(path, 'r') as file:
            return [line.strip() for line in file if line.strip()]

    def _fit_to_target_length(self, wav):
        """Zero-pad or truncate ``wav`` so it has exactly ``self.target_length`` samples."""
        if self.target_length is None:
            raise RuntimeError("target length is not set; call set_len() first")

        missing = self.target_length - len(wav)
        if missing > 0:
            return np.concatenate([wav, np.zeros(missing)])
        return wav[:self.target_length]

    def _extract_features(self, wav_path):
        """Load one waveform and return its normalised MFCCs, one row per frame."""
        wav, _ = librosa.load(wav_path, sr=self.frame_rate, mono=True)
        wav = self._fit_to_target_length(wav)
        wav = librosa.effects.preemphasis(wav, coef=self.pre_emphasis_coeff)

        # Window size/shift come from the constructor parameters instead of
        # being hard-coded (25 ms / 10 ms at 16 kHz give 400 / 160 samples).
        mel_spec = librosa.feature.melspectrogram(
            y=wav,
            sr=self.frame_rate,
            n_fft=self.window_size,
            hop_length=self.window_shift,
            n_mels=self.N_MELS,
        )
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # No more coefficients can be requested than there are mel bands.
        mfcc = librosa.feature.mfcc(S=log_mel_spec, n_mfcc=min(self.n_mfcc, self.N_MELS))

        # Normalise over the whole utterance; guard against a constant matrix.
        spread = np.std(mfcc)
        if spread == 0:
            spread = 1.0
        mfcc = (mfcc - np.mean(mfcc)) / spread

        return mfcc.T

    def set_len(self, method="mean"):
        """Measure the training waveforms and choose the common utterance length.

        Args:
            method: ``"mean"`` to use the floor of the average sample count of the
                training files, ``"max"`` to use the longest one.

        Raises:
            ValueError: if ``method`` is not ``"mean"``/``"max"`` or the training
                list contains no entries.
        """
        # Validate before touching the disk so a typo fails immediately.
        if method not in ("max", "mean"):
            raise ValueError("method must be 'max' or 'mean', got %r" % (method,))

        train_ids = self._read_entries("data/" + self.train_file_name)
        if not train_ids:
            raise ValueError("training list is empty: " + self.train_file_name)

        # Local accumulators keep repeated calls from compounding earlier totals.
        total_samples = 0
        longest = 0
        for utt_id in train_ids:
            wav, _ = librosa.load("data/wav16k/train/" + utt_id + ".wav", sr=self.frame_rate, mono=True)
            total_samples += len(wav)
            longest = max(longest, len(wav))

        self.len_max = longest
        # Average over the files actually read rather than an assumed count.
        self.len_mean = total_samples // len(train_ids)

        self.target_length = self.len_max if method == "max" else self.len_mean

    def fit(self):
        """Fit the mixture on the MFCC frames of every training utterance.

        Raises:
            ValueError: if the training list contains no entries.
            RuntimeError: if ``set_len`` has not been called yet.
        """
        train_ids = self._read_entries("data/" + self.train_file_name)
        if not train_ids:
            raise ValueError("training list is empty: " + self.train_file_name)

        per_utterance = [
            self._extract_features("data/wav16k/train/" + utt_id + ".wav")
            for utt_id in train_ids
        ]

        # Stack all frames of all utterances into one (n_frames, n_coeff) matrix.
        X_train = np.vstack(per_utterance)
        self.gmm.fit(X_train)

    def predict(self):
        """Score the test set against the reference labels.

        Each reference line is expected to hold two whitespace-separated fields,
        the second being the gender label. An utterance counts as correct when
        the share of its frames assigned to component 1 is >= 0.5 and the label
        is ``"male"``, or < 0.5 and the label is anything else.

        Returns:
            The fraction of reference entries classified correctly (also printed).

        Raises:
            ValueError: if the reference file is empty or lists more entries than
                the test list.
            RuntimeError: if ``set_len`` has not been called yet.
        """
        test_ids = self._read_entries("data/" + self.test_file_name)
        references = self._read_entries("data/" + self.test_ref_file_name)

        if not references:
            raise ValueError("reference file is empty: " + self.test_ref_file_name)
        if len(references) > len(test_ids):
            raise ValueError("reference file has more entries than the test list")

        correct = 0
        # Entries are paired by position: i-th test file with i-th reference line.
        for utt_id, ref_line in zip(test_ids, references):
            _, gender = ref_line.split()

            frames = self._extract_features("data/wav16k/test/" + utt_id + ".wav")
            labels = self.gmm.predict(frames)
            component_one_share = np.mean(labels)

            predicted_male = component_one_share >= 0.5
            if predicted_male == (gender == "male"):
                correct += 1

        # Divide by the number of utterances scored, not a fixed constant.
        accuracy = correct / len(references)
        print(accuracy)
        return accuracy

In [7]:
# Grid of candidate values: one inner list per positional slot of the
# `parameter` sequence that Model.__init__ unpacks, in the same order.
# Not referenced by the other visible cells.
parameters = [
    [0.97],                 # slot 0: pre-emphasis coefficient
    [16000],                # slot 1: sampling rate
    [25],                   # slot 2: window size (ms)
    [10],                   # slot 3: window shift (ms)
    [*range(10, 101)],      # slot 4: n_mfcc candidates, 10..100 inclusive
    ["fmcc_train.ctl"],     # slot 5: training list file
    ["fmcc_test.ctl"],      # slot 6: test list file
    ["fmcc_test_ref.txt"],  # slot 7: test reference file
]

In [8]:
# Sweep over the number of MFCC coefficients (currently the single value 40).
# Model.__init__ reads slot 3 as the window shift in milliseconds and slot 4 as
# n_mfcc, so the swept value belongs in slot 4 to match the printed label.
for i in range(40, 41):
    parameter = [0.97,                  # pre-emphasis coefficient
                 16000,                 # sampling rate
                 25,                    # window size (ms)
                 10,                    # window shift (ms)
                 i,                     # n_mfcc (the value being swept)
                 "fmcc_train.ctl",
                 "fmcc_test.ctl",
                 "fmcc_test_ref.txt"]

    print("n_mfcc:" + str(i))
    model = Model(parameter)
    model.set_len("mean")
    model.fit()
    model.predict()

n_mfcc:40
0.5
