In [1]:
import numpy as np 
import matplotlib.pyplot as plt

import wave

In [None]:
def sound_info(w):
    """
    Print sound info from .waw file
    """
    print(f"channels : {w.getnchannels()}")
    print(f"sample size : {w.getsampwidth()} bytes")
    print(f"sample frequency : {w.getframerate()} Hz")
    print(f"audio frames : {w.getnframes()}")

def normalize(w):
    """normalize w between 0 and 1"""
    a_f = w.astype(np.float32)
    maximum = np.max(a_f)
    minimum = np.min(a_f)
    return (a_f-minimum)/(maximum - minimum)

def get_amplitude(w):
    """get amplitude vector of w sample"""
    return np.frombuffer(w, np.int16)

def plot_audio(t,w,N):
    plt.plot(t, w[:N])

def audio_segmentation(w, segment_size, seuil, overlap):
    """
    segment waw file
    """

    # every segments of w
    res = []

    nb_segment = w.getnframes() // segment_size

    # all_powerl = []
    # all_powerr = []
    pos = 0
    shift = int(overlap*segment_size)

    for _ in range(nb_segment):

        # len(frame_vec) is segment_size*2 because 2 channels (stereo)
        frame_vector = get_amplitude(w.readframes(segment_size))
        frame_mat = frame_vector.reshape((len(frame_vector)//2, 2))

        # overlaping
        pos+=segment_size - shift
        w.setpos(pos)
        
        left = normalize(frame_mat[:,0])
        right = normalize(frame_mat[:,1])

        power_left = calcul_signal_power(left)
        power_right = calcul_signal_power(right)

        # all_powerl.append(power_left)
        # all_powerr.append(power_right)

        # we keep only barking
        if power_left > seuil and power_right > seuil:
            seg = np.array([left,right]).T
            res.append(seg)

    # print(np.min(all_powerl), np.max(all_powerl))
    # print(np.min(all_powerr), np.max(all_powerr))

    return res

def calcul_signal_power(w):
    """mean of squarred amplitudes"""
    return np.mean(w**2)

def bark_quality(w, segments, segment_size):
    """
    Calcul bark quality of w
    i.e the number of segments with enough power divided by number of segments
    """
    nb_segments = w.getnframes() // segment_size

    return len(segments)/nb_segments

def rebuild_signal(segments, segment_size, framerate):
    """
    Rebuild signal from its segments as an amplitude matrix (stereo)
    """

    segs = np.array(segments) # 3D tensor

    nb_frames = segment_size*len(segments)

    res = segs.reshape(nb_frames, 2) # 2D tensor (matrix)

    duration = nb_frames/framerate

    return res, duration

def embedd_segment(seg, segment_size, framerate):
    period = 1/framerate
    seg_centered = seg - 0.5 # [0,1] -> [-0.5, 0.5] for zcr
    eps = 1e-8

    fft = np.fft.fft(seg_centered, segment_size)
    freqs = np.fft.fftfreq(segment_size, period)

    # fft is symmetric, we take positive part
    p_fft = fft[:segment_size//2]
    p_freqs = freqs[:segment_size//2]
    magnitude = np.abs(p_fft)


    # time-domain features
    rms = np.sqrt(calcul_signal_power(seg))
    zcr = np.sum(np.diff(np.sign(seg_centered)) != 0)/segment_size
    entropy = -np.sum(seg_centered**2 * np.log(seg_centered**2 + eps)) # avoid log(0)

    # frequency-domain features
    centroid = np.sum(p_freqs*magnitude)/(np.sum(magnitude) + eps)
    flatness = np.exp(np.mean(np.log(magnitude + eps))) / (np.mean(magnitude) + eps)

    best_idx = np.argmax(magnitude)
    peak_freq = p_freqs[best_idx]
    

    return np.array([rms, zcr, entropy, centroid, flatness, peak_freq])


def cosine_dist(u, v):
    return (u@v)/(np.linalg.norm(u)*np.linalg.norm(v))

def euclidean_dist(u,v):
    A = u-v
    return np.sqrt(A@A)





In [None]:
audio = {"nina": [], "rebecca": []}

buffer_size = 1024
bark_seuil = 0.25

for filename in ["nina", "rebecca", "rebecca2"]:      

    with wave.open(f"{filename}.wav", "rb") as w:
        print(f"{filename}.wav bark sound info :")
        sound_info(w)

        segments = audio_segmentation(w, buffer_size, bark_seuil, 0.5)
        print(bark_quality(w, segments, buffer_size))

        print("")

        # keeping only left audio channel because i don't need spatial informations
        f = lambda seg : seg[:,0]
        left_segs = list(map(f, segments))


        g = lambda seg : embedd_segment(seg, buffer_size, w.getframerate())
        embedded_segs = list(map(g, left_segs))

        if filename.startswith("nina"):

            audio["nina"] = embedded_segs
        else:
            audio["rebecca"] = embedded_segs
    



nina.wav bark sound info :
channels : 2
sample size : 2 bytes
sample frequency : 48000 Hz
audio frames : 16384
0.75

rebecca.wav bark sound info :
channels : 2
sample size : 2 bytes
sample frequency : 48000 Hz
audio frames : 22528
0.6818181818181818

rebecca2.wav bark sound info :
channels : 2
sample size : 2 bytes
sample frequency : 48000 Hz
audio frames : 133120
0.7230769230769231



2

In [40]:
ex = np.array(audio["nina"])
ex2 = np.array(audio["rebecca"])

np.std(ex, axis=0)



array([3.21251729e-02, 9.45237338e-03, 1.78834808e+01, 4.23342763e+02,
       6.44755331e-02, 3.17825713e+02])

In [41]:
# similar angle but far away -> not ok
de = []
dc = []
for u in ex:
    for v in ex:
        de.append(euclidean_dist(u,v))
        dc.append(cosine_dist(u,v))


print(np.mean(de))
print(np.mean(dc))




646.4567562166305
0.98738520713805


In [None]:
# similar angle but far away -> not ok
de = []
dc = []
for u in ex:
    for v in ex2:
        de.append(euclidean_dist(u,v))
        dc.append(cosine_dist(u,v))


print(np.mean(de))
print(np.mean(dc))


1448.524621035019
0.9716874996150804


## Conclusion

Pour faire un KNN, cela ne sera pas possible par ce moyen.  

Un segment du chien A est "proche" d'un segment du chien B, et un segment du chien A est "loin" d'un segment du chien A.  
Par conséquent, ma segmentation n'est peut-être pas bonne, ou bien je n'extrais pas les bonnes features, ou bien un modèle bien plus complexe est nécessaire.  

En revanche, j'ai trouvé un modèle qui serait peut-être utilisable pour mon problème :
https://arxiv.org/html/2404.18739v1#S4