# Project

## General idea:
Given two songs, an original and a newer one that samples it, we want to find which part of the original was taken and where it was placed in the new song.

## Method:
Fingerprint: split the music into 1-second segments, then find matches among pairs of segments.

### Part 1 
- Convert any wav to samples
- Decide which music features to use (e.g. spectrogram peaks, chroma)

### Part 2
- Compare music features between songs
- Check where in the song features are similar (timestamps)

Candidate methods: audio matching with fingerprints? Dynamic time warping (DTW)?

### Part 3
- Automation, make functions to repeat this with any song

### References

http://www.eurasip.org/Proceedings/Eusipco/Eusipco2012/Conference/papers/1569556475.pdf

http://cmmr2012.eecs.qmul.ac.uk/sites/cmmr2012.eecs.qmul.ac.uk/files/pdf/papers/cmmr2012_submission_19.pdf

Master thesis: http://mtg.upf.edu/system/files/publications/Van-Balen-Jan-Master-thesis-2011_1.pdf

In [3]:
import numpy as np
import librosa
from matplotlib import pyplot as plt
import os
from scipy import signal
from scipy.fft import fft, fftshift, fftfreq

# Notebook-wide matplotlib defaults: wide figures and extra-large text.
params = {
    'figure.figsize': (15, 5),
    **dict.fromkeys(
        ('legend.fontsize', 'axes.labelsize', 'axes.titlesize',
         'xtick.labelsize', 'ytick.labelsize'),
        'x-large',
    ),
}
plt.rcParams.update(params)

In [8]:
# Pair of songs to compare.
# (Alternative pair: "WakaWaka-Shakira.wav" with "informer_snow.wav".)
file_1, file_2 = "concalma_daddyyankeesnow.wav", "informer_snow.wav"
# Folder holding the wav files; the empty string leaves the file names
# relative to the current working directory.
directory = ""

In [5]:
class Audio_fingerprint:
    """Spectral-peak fingerprint of the opening portion of an audio file.

    Construction loads the file with librosa, keeps the first ``max_seconds``
    seconds in ``audio`` (the untrimmed signal stays in ``original_audio``)
    and immediately runs the STFT -> anchors -> fingerprints pipeline.

    A fingerprint is a pair ``(anchor_time, (f1, f2, delta_t))`` where ``f1``
    is the frequency of an anchor peak, ``f2`` the frequency of a later peak
    inside the anchor's target zone and ``delta_t`` the time between them.
    """

    def __init__(self, file, directory="", max_seconds=60):
        """Load ``file`` from ``directory`` and compute its fingerprints.

        Args:
            file: Audio file name.
            directory: Folder joined in front of ``file``.
            max_seconds: Length of the excerpt (from the start of the track)
                that is fingerprinted.
        """
        self.filename = file
        self.original_audio, self.sample_rate = librosa.load(os.path.join(directory, file))
        self.number_samples = int(self.sample_rate * max_seconds)
        self.audio = self.original_audio[:self.number_samples]
        self.duration = librosa.get_duration(y=self.audio, sr=self.sample_rate)
        self.set_fingerprints()

    def stft(self, size_fft=8192, duration_ms=100, hopsize_ms=50):
        """Compute the dB-magnitude STFT of ``self.audio``.

        Results are stored on the instance as ``stft_f`` (frequencies),
        ``stft_t`` (frame times), ``stft_amp`` (complex STFT) and ``stft_db``
        (magnitude in dB); nothing is returned.

        Args:
            size_fft: FFT length handed to ``scipy.signal.stft`` as ``nfft``.
            duration_ms: Analysis window length in milliseconds.
            hopsize_ms: Hop between consecutive windows in milliseconds.
        """
        window_len = int(duration_ms * self.sample_rate * 1e-3)
        hopsize_len = int(hopsize_ms * self.sample_rate * 1e-3)
        # Total padding is four windows: two before and two after the signal.
        zeropadding_len = window_len * 4
        half_pad = zeropadding_len // 2

        # The transform runs on the padded signal, so the frame times in
        # stft_t are measured from the start of the padding.
        audio_zp = np.pad(self.audio, (half_pad, half_pad), 'constant', constant_values=(0, 0))

        stft_f, stft_t, stft_amp = signal.stft(
            audio_zp,
            self.sample_rate,
            nperseg=window_len,
            noverlap=window_len - hopsize_len,
            nfft=size_fft,
            boundary=None,
        )
        stft_db = librosa.amplitude_to_db(np.abs(stft_amp))

        self.stft_f, self.stft_t, self.stft_amp, self.stft_db = stft_f, stft_t, stft_amp, stft_db
        print(f"File {self.filename} STFT completed.")

    def set_anchors(self, nband=20, delta_t=0.5):
        """Pick the loudest spectrogram bin in every time/frequency box.

        The spectrogram is tiled with ``nband`` frequency bands and time
        slices of roughly ``delta_t`` seconds. The position of the maximum of
        each tile is stored in ``self.anchors`` as a row
        ``(time_index, frequency_index)``.

        Args:
            nband: Number of frequency bands.
            delta_t: Approximate duration in seconds of one time slice.
        """
        import math

        self.stft()
        duration = self.stft_t[-1] - self.stft_t[0]
        step_t = math.ceil(len(self.stft_t) / duration * delta_t)
        step_f = math.ceil(len(self.stft_f) / nband)
        nband_time = math.ceil(len(self.stft_t) / step_t)

        anchors = []
        for i in range(nband):
            for j in range(nband_time):
                box = self.stft_db[i * step_f:(i + 1) * step_f, j * step_t:(j + 1) * step_t]
                # Rounding step_f up can push the last bands past the end of
                # the spectrogram; an empty box has no maximum.
                if box.size == 0:
                    continue
                # First maximum in row-major order.
                row, col = np.unravel_index(np.argmax(box), box.shape)
                anchors.append((col + j * step_t, row + i * step_f))

        # Fixed integer (n, 2) shape so the array can index stft_t / stft_f
        # even when no anchor was found.
        self.anchors = np.array(anchors, dtype=np.int64).reshape(-1, 2)
        print(f"File {self.filename} anchors calulated.")

    def set_fingerprints(self):
        """Pair every anchor with the anchors in its target zone.

        The target zone of an anchor at ``(time, freq)`` contains the anchors
        lying 0.1 s to 0.6 s later whose frequency is within half an octave
        (a factor ``2**0.5``) of ``freq``. Each pair yields one fingerprint
        ``(time, (freq, f2, delta_t))``; the tuple of all fingerprints is
        stored in ``self.fingerprint_set``.
        """
        self.set_anchors()
        # Values from indexes.
        max_time = self.stft_t[self.anchors[:, 0]]
        max_frequencies = self.stft_f[self.anchors[:, 1]]

        fingerprint_set = []
        for time, freq in zip(max_time, max_frequencies):
            in_zone = (
                (max_time >= time + 0.1)
                & (max_time <= time + 0.6)
                & (max_frequencies >= freq * 2 ** -0.5)
                & (max_frequencies <= freq * 2 ** 0.5)
            )
            for i in np.flatnonzero(in_zone):
                hash_i = (freq, max_frequencies[i], max_time[i] - time)
                fingerprint_set.append((time, hash_i))

        self.fingerprint_set = tuple(fingerprint_set)
        print(f"File {self.filename} fingerprints calculated.")

    def match(self, fingerprint_q, eps=1e-5):
        """Find fingerprints of this track that also occur in a query.

        Two hashes match when the summed absolute difference of their three
        components ``(f1, f2, delta_t)`` is at most ``eps``.

        Args:
            fingerprint_q: Iterable of ``(time, (f1, f2, delta_t))`` entries.
            eps: Tolerance on the summed absolute difference.

        Returns:
            Set of ``(time_in_this_track, time_in_query)`` pairs.
        """
        matches = set()
        for fd_i in self.fingerprint_set:
            for fq_i in fingerprint_q:
                pair = (fd_i[0], fq_i[0])
                if pair in matches:
                    continue
                h1 = fd_i[1]
                h2 = fq_i[1]
                # Bail out as soon as the running sum exceeds eps; most
                # candidates already fail on the first frequency.
                d = abs(h1[0] - h2[0])
                if d > eps:
                    continue
                d += abs(h1[1] - h2[1])
                if d > eps:
                    continue
                # Compare delta_t of both hashes (h1 against h2).
                d += abs(h1[2] - h2[2])
                if d <= eps:
                    matches.add(pair)

        return matches

    def segment(self, start=0.0, duration=1.0):
        """Return the fingerprints anchored in ``[start, start + duration]``.

        Args:
            start: Start of the window in seconds.
            duration: Length of the window in seconds.

        Returns:
            Tuple of the fingerprints whose anchor time lies in the window
            (both ends inclusive), in their original order.
        """
        end = start + duration
        return tuple(fgp for fgp in self.fingerprint_set if start <= fgp[0] <= end)

    def make_queries(self, duration=1.0, hop=0.2):
        """Cut the fingerprints into overlapping query windows.

        ``self.queries`` is rebuilt as a dict mapping each window start time
        (``0, hop, 2*hop, ...`` up to the last STFT frame time) to the
        fingerprints of ``segment(start, duration)``.

        Args:
            duration: Window length in seconds.
            hop: Spacing between window starts in seconds; must be positive.

        Raises:
            ValueError: If ``hop`` is not positive.
        """
        if hop <= 0:
            raise ValueError("hop must be positive")

        end = self.stft_t[-1]
        self.queries = {}
        index = 0
        t = 0.0
        while t < end:
            self.queries[t] = self.segment(t, duration)
            index += 1
            # Multiply rather than accumulate so rounding errors do not drift.
            t = index * hop

In [9]:
# Fingerprint the first song, loading it from the configured directory.
x = Audio_fingerprint(file_1, directory)

File concalma_daddyyankeesnow.wav STFT completed.
File concalma_daddyyankeesnow.wav anchors calulated.
File concalma_daddyyankeesnow.wav fingerprints calculated.


In [10]:
# Fingerprint the second song, loading it from the configured directory.
y = Audio_fingerprint(file_2, directory)

File informer_snow.wav STFT completed.
File informer_snow.wav anchors calulated.
File informer_snow.wav fingerprints calculated.


In [13]:
def compute_cost_matrix(X, Y, metric='euclidean'):
    """Return the pairwise cost between the columns of two feature sequences.

    Taken from the notebook C3/C3S2_DTWbasic.ipynb.

    Args:
        X (np.ndarray): First sequence, one feature vector per column
        Y (np.ndarray): Second sequence, one feature vector per column
        metric (str): Any metric name accepted by scipy.spatial.distance.cdist (Default value = 'euclidean')

    Returns:
        C (np.ndarray): Cost matrix; entry (n, m) is the cost between column n of X and column m of Y
    """
    from scipy.spatial.distance import cdist

    # 1-D inputs are promoted to a single row, i.e. scalar features.
    seq_x = np.atleast_2d(X)
    seq_y = np.atleast_2d(Y)
    # cdist compares rows, so the sequences are transposed first.
    return cdist(seq_x.T, seq_y.T, metric=metric)

In [28]:
import libfmp.c3
import libfmp.c7

In [31]:
def DTW_cost_matrix(x1, x2, ell=21, d=5, n_fft=4410, hop_length=2205):
    """Compute the Euclidean cost matrix between the CENS features of two tracks.

    Both tracks are analysed over their full length (``original_audio``), not
    only the excerpt kept in ``audio``.

    Args:
        x1: Object exposing ``original_audio`` and ``sample_rate`` (first track).
        x2: Object exposing ``original_audio`` and ``sample_rate`` (second track).
        ell (int): CENS smoothing length, in chroma frames.
        d (int): CENS downsampling factor.
        n_fft (int): FFT size of the chroma STFT, in samples.
        hop_length (int): Hop size of the chroma STFT, in samples.

    Returns:
        Cost matrix as returned by ``libfmp.c3.compute_cost_matrix`` for the
        two CENS sequences (x1 first, x2 second).
    """
    # 12 chroma bins x time frames; one frame every hop_length / sample_rate seconds.
    C1 = librosa.feature.chroma_stft(y=x1.original_audio, sr=x1.sample_rate, tuning=0, norm=None, hop_length=hop_length, n_fft=n_fft)
    C2 = librosa.feature.chroma_stft(y=x2.original_audio, sr=x2.sample_rate, tuning=0, norm=None, hop_length=hop_length, n_fft=n_fft)

    # The second return value (the CENS feature rate) is not needed here.
    X, _ = libfmp.c7.compute_cens_from_chromagram(C1, ell=ell, d=d)
    Y, _ = libfmp.c7.compute_cens_from_chromagram(C2, ell=ell, d=d)

    return libfmp.c3.compute_cost_matrix(X, Y, 'euclidean')

In [33]:
# CENS cost matrix between the two full tracks (x passed first, y second).
C = DTW_cost_matrix(x, y, ell=21, d=5)

In [34]:
np.min(C)

0.09563495700774109

In [37]:
C.shape

(387, 538)

In [41]:
N_feat = 4410
H_feat = 2205
# Position of the smallest cost (first occurrence in row-major order),
# found with a single pass over the matrix.
best_row, best_col = np.unravel_index(np.argmin(C), C.shape)
# One CENS frame covers 5 chroma hops (the downsampling factor d used to
# build C). Rows belong to x and columns to y, so each index is converted
# to seconds with its own track's sample rate.
t1 = best_row*5*H_feat/x.sample_rate
t2 = best_col*5*H_feat/y.sample_rate
print(t1, t2)

37.0 38.0


In [19]:
# First second of x (the song that contains the sample).
number_samples_onesec = 1 * x.sample_rate
x_second_1 = x.original_audio[:number_samples_onesec]

# The second of y running from 30 s to 31 s.
number_samples_before, number_samples_after = (y.sample_rate * sec for sec in (30, 31))
y_second_30 = y.original_audio[number_samples_before:number_samples_after]


In [20]:
from dtaidistance import dtw

# DTW distance between one second of y and y's fingerprinted excerpt.
distances = dtw.distance_fast(
    y_second_30.astype(np.float64),
    y.audio.astype(np.float64),
)


In [21]:
print(distances)

130.71195348258757


In [17]:
# DTW distance between the first second of x and y's fingerprinted excerpt.
distances1 = dtw.distance_fast(
    x_second_1.astype(np.float64),
    y.audio.astype(np.float64),
)


In [18]:
print(distances1)

158.8090433479043


In [27]:
# x.audio holds at most the first 60 s of the track, so 60 near-equal chunks
# are about one second each when the track is at least that long.
x_seconds = np.array_split(x.audio, 60)
# One DTW distance per chunk, filled in by the loop in the next cell.
distances = []

In [28]:
# Convert the reference once; it is the same array for every chunk.
y_audio_double = y.audio.astype('double')
for i, x_second in enumerate(x_seconds):
    print(i)  # progress indicator: each DTW run is slow
    distance = dtw.distance_fast(x_second.astype('double'), y_audio_double)
    distances.append(distance)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


In [29]:
print(distances)

[158.8090433479043, 151.84784815835985, 150.868678942268, 151.69639567714728, 146.81315594833296, 145.77761292832008, 151.21090938957948, 148.62746136979965, 146.65845866362412, 149.68272739614326, 135.06221419784802, 141.86971308705546, 145.42821949721446, 141.60761926139713, 142.60830448256894, 142.9939477790767, 143.49499299186576, 146.59512892376245, 142.5502008760467, 140.9348670829606, 151.9656671026756, 137.6609087351911, 137.10114867159496, 139.51780678100877, 137.13309195147204, 138.92009423698195, 137.08672743969825, 138.0714677549287, 139.25068148857008, 136.51902277636083, 126.99294985815554, 141.73526000902834, 141.83013639390802, 139.46029977099803, 140.19773356667358, 136.2843672545979, 140.24730034337256, 143.2741321363078, 140.90836747748682, 137.8984364974051, 140.41611690948218, 139.73167939378425, 141.84014418413287, 137.9137012718867, 138.80108858929566, 140.2582188167758, 144.51650507850528, 143.12675191895352, 143.7667371175437, 138.2623602349118, 143.95268569489

In [1]:
# DTW distance of each chunk of x against y; the x axis is the chunk index.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Time (s)")
ax.plot(distances)
ax.set_ylabel("Distance")
# Optional: mark the minimum once min_idx has been computed.
#ax.scatter(min_idx, np.min(distances), color='r')

NameError: name 'plt' is not defined