In [1]:
import numpy as np
from sklearn.mixture import GaussianMixture
import librosa
from scipy.signal import stft
import matplotlib.pyplot as plt

In [2]:
# import os, wave
# from tqdm import tqdm
#
# def changeWavData(file_name, mode):
#         with open("data/" + file_name, 'r') as file:
#             for file_path in tqdm(file):
#                 file_path = file_path.rstrip("\n")
#                 try:
#                     # if not os.path.exists("data/wav16k/" + mode):
#                     #     os.makedirs("data/wav16k/" + mode)
#                     if not os.path.exists("data/wav16k/" + mode + "/" + file_path.split("/")[0]):
#                         os.makedirs("data/wav16k/" + mode + "/" + file_path.split("/")[0])
#                 except OSError:
#                     print("Error: Failed to create the directory.")
#
#                 with wave.open("data/wav16k/" + mode + "/" + file_path + ".wav", "wb") as wav_file:
#                     wav_file.setsampwidth(2)
#                     wav_file.setframerate(16000)
#                     wav_file.setnchannels(1)
#
#                     with open("data/raw16k/" + mode + "/" + file_path + ".raw", "rb") as raw_file:
#                         raw_data = raw_file.read()
#
#                     wav_file.writeframes(raw_data)
#
# # changeWavData("fmcc_test.ctl", "test")
# changeWavData("fmcc_train.ctl", "train")

In [3]:
class Model:
    def __init__(self, parameter):
        self.pre_emphasis_coeff = parameter[0]
        self.frame_rate = parameter[1]
        self.window_size = int(parameter[2] * self.frame_rate / 1000)
        self.window_shift = int(parameter[3] * self.frame_rate / 1000)
        self.n_mfcc = parameter[4]
        self.train_file_name = parameter[5]
        self.test_file_name = parameter[6]
        self.test_ref_file_name = parameter[7]

        self.len_max = 0
        self.len_mean = 0

        self.gmm = GaussianMixture(n_components=2, covariance_type='diag', max_iter=100)

    def set_len(self, method="mean"):
        with open("data/" + self.train_file_name, 'r') as file:
            for file_path in file:
                file_path = file_path.rstrip("\n")
                wav, frame_rate = librosa.load("data/wav16k/train/" + file_path + ".wav", sr=self.frame_rate, mono=True)
                self.len_mean += len(wav)
                self.len_max = max(self.len_max, len(wav))

        self.len_mean //= 10000

        if method == "max":
            self.target_length = self.len_max
        elif method == "mean":
            self.target_length = self.len_mean

    def extract_mfcc(self, wav):
        pre_emphasis = self.pre_emphasis_coeff
        wav = librosa.effects.preemphasis(wav, coef=pre_emphasis)

        n_fft = 400
        hop_length = 160
        n_mels = 40

        mel_spec = librosa.feature.melspectrogram(y=wav, sr=self.frame_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        n_mfcc = self.n_mfcc
        mfcc = librosa.feature.mfcc(S=log_mel_spec, n_mfcc=n_mfcc)

        return mfcc

    def process_wav(self, file_path):
        wav, frame_rate = librosa.load(file_path, sr=self.frame_rate, mono=True)

        if len(wav) < self.target_length:
            padding = np.zeros(self.target_length - len(wav))
            wav = np.concatenate([wav, padding])
        else:
            wav = wav[:self.target_length]

        mfcc = self.extract_mfcc(wav)

        return mfcc

    def fit(self):
        X_train = []
        with open("data/" + self.train_file_name, 'r') as file:
            for file_path in file:
                file_path = file_path.rstrip("\n")
                mfcc = self.process_wav("data/wav16k/train/" + file_path + ".wav")
                X_train.append(mfcc)

        X_train = np.concatenate(X_train)
        self.gmm.fit(X_train)

    def predict(self):
        X_test = []
        with open("data/" + self.test_file_name, 'r') as file:
            for file_path in file:
                file_path = file_path.rstrip("\n")
                mfcc = self.process_wav("data/wav16k/test/" + file_path + ".wav")
                X_test.append(mfcc)

        X_test = np.concatenate(X_test)
        probabilities = self.gmm.predict_proba(X_test)
        class_probabilities = np.mean(probabilities, axis=0)

        return class_probabilities

In [4]:
# parameters = [[0.97],
#               [16000],
#               [25],
#               [10],
#               list(range(10, 101)),
#               ["fmcc_train.ctl"],
#               ["fmcc_test.ctl"],
#               ["fmcc_test_ref.txt"]]

In [5]:
# for i in range(40, 41):
#     parameter = [0.97,
#                  16000,
#                  25,
#                  i,
#                  100,
#                  "fmcc_train.ctl",
#                  "fmcc_test.ctl",
#                  "fmcc_test_ref.txt"]
#
#     print("n_mfcc:" + str(i))
#     model = Model(parameter)
#     model.set_len("mean")
#     model.fit()
#     model.predict()

parameters = [0.97, 16000, 25, 40, 100, "fmcc_train.ctl", "fmcc_test.ctl", "fmcc_test_ref.txt"]

print("n_mfcc:" + str(parameters[3]))
model = Model(parameters)
model.set_len("mean")
model.fit()
model.predict()

n_mfcc:40


array([0.89989012, 0.10010988])