In [161]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import random
import json
import os
import math

In [176]:
def get_files(path):
    """Return the paths of all ``.wav`` files located directly inside ``path``.

    Only regular files whose name ends with ``.wav`` are returned; directories
    are skipped and sub-directories are not descended into.

    :param path (str): Directory to scan.
    :return (list[str]): ``path`` joined with each matching file name, sorted
        by file name. ``os.listdir`` returns entries in arbitrary order, so
        sorting keeps every downstream table and export reproducible.
    """
    wav_files = []
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        if name.endswith('.wav') and os.path.isfile(full_path):
            wav_files.append(full_path)
    return wav_files


def sanitize(files):
    """Label each file from its name and keep only a fixed set of classes.

    The class of a file is read from its base name: the first 11 characters
    are skipped and the label is everything up to the next underscore.
    A name without an underscore after that prefix yields a label that matches
    no selected class, so the file is dropped instead of aborting the run.

    :param files (list[str]): Paths of the audio files to label.
    :return (pandas.DataFrame): Columns ``file`` and ``class``. Rows are
        grouped by class, in the order of the selected-class list below; within
        a class they keep their input order. The index still holds each row's
        position in ``files`` (it is not reset).
    """
    # Number of leading file-name characters that precede the class label.
    label_offset = 11

    # Classes to keep (the original note says: classes with >= 100 samples).
    # The order matters: it defines the row grouping of the returned frame.
    selected = ['kub', 'woo', 'rom', 'jac', 'sum', 'kub-phsm', 'kub-phtbsm',
                'uta', 'ish-phsm', 'jul', 'rom-phsm', 'kub-phtb', 'woo-phsm']

    records = {'file': [], 'class': []}
    for f in files:
        # basename() instead of searching for "/" keeps this independent of
        # the platform's path separator.
        tail = os.path.basename(f)[label_offset:]
        records['file'].append(f)
        # partition() never raises: without an underscore the whole tail is
        # used as the label and is filtered out below.
        records['class'].append(tail.partition('_')[0])

    data = pd.DataFrame(records)

    # One concat over all per-class slices instead of growing a frame in a loop.
    return pd.concat([data.loc[data['class'] == name] for name in selected], axis=0)


# Classes kept by `sanitize`, in the same order. A sample's integer label is
# the position of its class in this tuple.
SELECTED_CLASSES = ('kub', 'woo', 'rom', 'jac', 'sum', 'kub-phsm', 'kub-phtbsm',
                    'uta', 'ish-phsm', 'jul', 'rom-phsm', 'kub-phtb', 'woo-phsm')


def save_mfcc(df, json_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5,
              expected_frames=87):
    """Extract MFCCs and log-mel spectrograms per audio file and save them as JSON.

    Each file listed in ``df`` is loaded at the module-level ``SAMPLE_RATE``
    (read when the function is called). Clips whose MFCC matrix does not have
    exactly ``expected_frames`` frames are skipped, so every stored sample has
    the same shape.

    The JSON object has four parallel lists:
    ``mapping`` (class name), ``labels`` (index of the class in
    ``SELECTED_CLASSES``), ``mfcc`` and ``melspec`` (frames x coefficients).

        :param df (pandas.DataFrame): Table with a ``file`` column (audio path)
            and a ``class`` column (class name)
        :param json_path (str): Path to json file used to save the features
        :param num_mfcc (int): Number of MFCC coefficients to extract; also used
            as the number of mel bands of the spectrogram
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param num_segments (int): Unused; accepted only so existing calls keep working
        :param expected_frames (int): Required number of frames per clip. The
            default 87 presumably corresponds to 2 s clips at 22050 Hz with
            hop_length=512 (1 + 44100 // 512) - confirm against the dataset
        :return: None
        :raises ValueError: If a kept clip has a class that is not in
            ``SELECTED_CLASSES``
        """
    label_of = {name: idx for idx, name in enumerate(SELECTED_CLASSES)}

    # dictionary to store mapping, labels, MFCCs and mel spectrograms
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": [],
        'melspec': []
    }

    for file_path, class_name in zip(df['file'], df['class']):
        signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

        # Audio and sample rate are passed by keyword: librosa >= 0.10 rejects
        # them as positional arguments, and older versions accept keywords too.
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=num_mfcc,
                                    n_fft=n_fft, hop_length=hop_length)
        mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_fft=n_fft,
                                                         hop_length=hop_length, n_mels=num_mfcc)
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        # Store features as (frames, coefficients).
        mfcc = mfcc.T
        spec = log_mel_spectrogram.T

        # Compare against num_mfcc rather than a fixed 13, otherwise any other
        # coefficient count would silently discard every clip.
        if mfcc.shape[0] != expected_frames or mfcc.shape[1] != num_mfcc:
            continue

        if class_name not in label_of:
            raise ValueError("class %r is not in SELECTED_CLASSES" % (class_name,))

        data["mapping"].append(class_name)
        data['labels'].append(label_of[class_name])
        data["mfcc"].append(mfcc.tolist())
        data["melspec"].append(spec.tolist())

    # save features to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [4]:
# Directory holding the input .wav recordings, and the list of .wav files found in it.
# NOTE(review): this is an absolute, machine-specific path — the notebook only runs
# where this directory exists; consider a configurable data directory.
paths = "/net/projects/scratch/winter/valid_until_31_July_2022/krumnack/animal-communication-data/Chimp_IvoryCoast/manually_verified_2s/chimp_only_23112020_with_ids"
files =  get_files(paths)

In [163]:
# Label every recording from its file name and keep only the selected classes.
df = sanitize(files)

In [168]:
# Output file for the extracted features and the sample rate used when loading audio.
# SAMPLE_RATE is read as a global inside save_mfcc, so this cell must run before
# save_mfcc is called.
JSON_PATH = "data_10.json"
SAMPLE_RATE = 22050
# NOTE(review): TRACK_DURATION and SAMPLES_PER_TRACK are not read by any cell visible
# here; the data directory name suggests 2 s clips rather than 30 s — confirm or remove.
TRACK_DURATION = 30 # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION

In [177]:
# Extract features for the sanitized table and write them to JSON_PATH.
# The table returned by sanitize(files) is bound to `df`; `ddf` exists only as a
# local variable inside sanitize, so using it here fails on a fresh kernel.
# num_segments is accepted by save_mfcc but not used by it.
save_mfcc(df, JSON_PATH, num_segments=10)

In [178]:
# Persist the sanitized file/class table as UTF-8 CSV without the index column.
# NOTE(review): the output name has no ".csv" extension — confirm this is intended.
df.to_csv('sanitized_vocalization', encoding='utf-8', index=False)