## Loading Dependencies

In [39]:
import numpy as np
import soundfile as sf
import pandas as pd
import scipy as sp
import os
import librosa

## Loading Data


In [43]:
def data_loader(path):
    """
        Load every '.mp3' file found directly inside a directory.

        params:
            path: directory that contains the audio files (with or
                  without a trailing separator)

        returns:
            data: list of audio files as numpy arrays
            labels: list of labels for each audio file, i.e. the part of
                    the file name after the last '_' without the extension
            sr: sample rate reported for the first audio file, or None
                when the directory holds no '.mp3' file
    """
    # List the directory a single time so that data and labels are built
    # from exactly the same files, in exactly the same order.
    file_names = [name for name in os.listdir(path) if name.endswith('.mp3')]

    data = []
    labels = []
    sr = None

    for name in file_names:
        # sf.read returns (samples, sample_rate); decode each file only once
        samples, file_sr = sf.read(os.path.join(path, name))

        # NOTE(review): only the first file's rate is reported -- this
        # assumes every file in the dataset shares one sample rate.
        if sr is None:
            sr = file_sr

        data.append(samples)
        labels.append(os.path.splitext(name)[0].split('_')[-1])

    return data, labels, sr

# Load the training split, then report how many clips and labels were
# found and the sample rate, one value per line.
train_data, train_labels, sr = data_loader('data/train/')

print(len(train_data), len(train_labels), sr, sep='\n')

330
330
16000


## Feature Extraction


In [45]:
# Display the shape of the first encoded training example.
# NOTE(review): encoded_train_data is not defined in any cell visible here --
# presumably it comes from a feature-extraction cell; confirm before re-running.
encoded_train_data[0].shape


(13, 73)

## HMM Model

In [None]:
import torch

class HMM(torch.nn.Module):
    """
    Hidden Markov Model with discrete transition probability
    and multivariate Gaussian Emission Probabilities.

    The emission model is stored under the attribute name
    ``emoission_model``; the spelling is kept so that existing
    references to it keep working.
    """
    def __init__(self, n_states, n_features):
        """
            params:
                n_states: number of hidden states
                n_features: number of features per observation frame
        """
        super().__init__()
        # Number of states
        self.n_states = n_states
        # number of features
        self.n_features = n_features

        # Transition model, randomly initialised
        # NOTE(review): described as "log space" originally, but the values
        # are raw torch.rand draws -- confirm the intended parameterisation.
        self.transition_model = torch.nn.Parameter(torch.rand(n_states, n_states))

        # Emission model
        self.emoission_model = EmissionModel(n_states, n_features)

        # Initial state
        self.initial_state = torch.nn.Parameter(torch.rand(n_states))

        # Always define the device so CPU-only machines behave the same way
        self.cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if self.cuda_available else 'cpu')
        if self.cuda_available:
            self.cuda()

    def initialize_initial_state(self):
        """
            Re-draw the initial state parameter at random, in the same way
            the constructor does.
        """
        with torch.no_grad():
            self.initial_state.copy_(torch.rand(self.n_states))

    def initialize_transition_model(self):
        """
            Re-draw the transition parameter at random, in the same way
            the constructor does.
        """
        with torch.no_grad():
            self.transition_model.copy_(torch.rand(self.n_states, self.n_states))

    def initialize_emission_model(self):
        """
            Replace the emission model with a freshly constructed one.
        """
        self.emoission_model = EmissionModel(self.n_states, self.n_features)

    def encode(self, data, sr=16000, frame_length=30, hop_length=10):
        """
            Encode the data using librosa
            params:
                data: one audio file as numpy array
                sr: sample rate of the audio file, default 16000
                frame_length: length of the frame in milliseconds
                hop_length: hop length in milliseconds

            returns:
                encoded_data: MFCC features as T x n_features matrix
                tau : length of the frame in samples
        """
        # Convert the millisecond settings into sample counts
        n_fft = int(sr * frame_length / 1000)
        hop_samples = int(sr * hop_length / 1000)

        encoded_data = librosa.feature.mfcc(
            y=data,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_samples,
            n_mfcc=self.n_features,
        )

        tau = n_fft

        # librosa returns n_features x T; transpose so that time comes first
        return encoded_data.T, tau

    def train(self, data=True, n_iter=100):
        """
            Train the HMM model using Expectation Maximization
            i.e. the Baum-Welch algorithm.

            This method shares its name with torch.nn.Module.train, which
            Module.eval() and parent modules call with a boolean. A boolean
            argument is therefore forwarded to the standard mode switch.

            params:
                data: audio file as numpy array, or a bool to switch
                      between training and evaluation mode
                n_iter: number of EM iterations for training

            returns:
                self when called with a bool, otherwise None
        """
        # Keep model.train() / model.eval() working as for any other Module
        if isinstance(data, bool):
            return super().train(data)

        # Perform MFCC encoding (the frame length is not needed here)
        data, _ = self.encode(data)

        T, n_features_data = data.shape

        assert n_features_data == self.n_features, (
            "encoded data has %d features, expected %d"
            % (n_features_data, self.n_features)
        )

        # Work on a tensor that lives on the same device as the parameters
        data = torch.as_tensor(data).to(self.device)

        # Initialize the model
        self.initialize_initial_state()

        # Initialize the transition model
        self.initialize_transition_model()

        # Initialize the emission model
        self.initialize_emission_model()

        # TODO: the EM (Baum-Welch) updates are not implemented yet; the
        # loop only walks over the frames.
        for _ in range(n_iter):
            for frame in data:
                pass


class EmissionModel(torch.nn.Module):
    """
    Emission model for the HMM: one randomly initialised multivariate
    Gaussian per hidden state, stored in the list ``gaussians``.

    The distributions are plain objects held in a Python list, so they are
    not registered as module parameters.
    """
    def __init__(self, n_states, n_features):
        """
            params:
                n_states: number of hidden states (one Gaussian each)
                n_features: dimensionality of every Gaussian
        """
        super().__init__()
        self.n_states = n_states
        self.n_features = n_features

        self.gaussians = [self._random_gaussian(n_features) for _ in range(n_states)]

    @staticmethod
    def _random_gaussian(n_features):
        """Build one Gaussian with a random mean and a valid random covariance."""
        mean = torch.rand(n_features)

        # A raw torch.rand matrix is neither symmetric nor positive definite
        # and is rejected by MultivariateNormal. A @ A.T is symmetric and
        # positive semi-definite; adding the identity makes it strictly
        # positive definite and well conditioned.
        factor = torch.rand(n_features, n_features)
        covariance = factor @ factor.T + torch.eye(n_features)

        return torch.distributions.MultivariateNormal(mean, covariance_matrix=covariance)