# Audio Alignment for Harmonix Set

This notebook aligns purchased audio with the original audio from the Harmonix Set.

More specifically, for each pair of audio files:
- Load both audio files
- Compute chromagrams
- Use dynamic time warping (DTW) to find the correct start and end points of alignment
- Produce the new aligned mp3s from the purchased audio

In [None]:
from __future__ import print_function
import glob
import os

import IPython.display
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from librosa import display

# Audio locations. The defaults are the folders this notebook was written
# against; set HARMONIX_ORIG_MP3_PATH / HARMONIX_PURC_MP3_PATH in the
# environment to point it at other copies without editing this cell.
ORIG_MP3_PATH = os.environ.get("HARMONIX_ORIG_MP3_PATH",
                               "/Users/onieto/Desktop/Harmonix/audio/")
PURC_MP3_PATH = os.environ.get("HARMONIX_PURC_MP3_PATH",
                               "/Users/onieto/Dropbox/drop/HarmonixMP3_YouTube/")
# NOTE: despite the _TSV suffix this file is comma-separated (read with sep=",").
METADATA_TSV = "../dataset/metadata.csv"
# Analysis parameters shared by the feature-extraction and alignment cells.
N_FFT = 2048     # FFT size passed to chroma_stft
HOP_SIZE = 512   # hop length in samples between consecutive frames
SR = 22050       # sample rate the audio is loaded at

%matplotlib inline

In [None]:
# Read the comma-separated metadata table and preview its first rows
meta_df = pd.read_csv(filepath_or_buffer=METADATA_TSV, delimiter=",")
meta_df.head()

In [None]:
# Load both versions of the first track listed in the metadata.
# The two folders are expected to hold a "<File>.mp3" for the same id.
file_id = meta_df["File"].iloc[0]
mp3_name = file_id + ".mp3"
orig_path = os.path.join(ORIG_MP3_PATH, mp3_name)
purc_path = os.path.join(PURC_MP3_PATH, mp3_name)
# librosa.load returns (samples, sample_rate); only the samples are kept
# because the rate is forced to SR.
orig_x = librosa.load(orig_path, sr=SR)[0]
purc_x = librosa.load(purc_path, sr=SR)[0]

In [None]:
# Compute chroma features
# Both signals go through chroma_stft with exactly the same settings, so the
# shared keyword arguments are spelled out once.
chroma_kwargs = dict(sr=SR, tuning=0, norm=2,
                     hop_length=HOP_SIZE, n_fft=N_FFT)
orig_chroma = librosa.feature.chroma_stft(y=orig_x, **chroma_kwargs)
purc_chroma = librosa.feature.chroma_stft(y=purc_x, **chroma_kwargs)

In [None]:
# Apply DTW
# D is the accumulated cost matrix and wp the warping path, one
# (orig_frame, purc_frame) index pair per row. Later cells read the path as
# wp[::-1].
D, wp = librosa.sequence.dtw(X=orig_chroma, Y=purc_chroma, metric='cosine')
# Convert frame indices to seconds. frames_to_time always yields floats,
# whereas `wp * HOP_SIZE / SR` floors to whole seconds when true division is
# not enabled.
wp_s = librosa.frames_to_time(wp, sr=SR, hop_length=HOP_SIZE)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
# specshow lays both axes out in seconds, the same units as wp_s. A raw
# imshow of D must not be drawn on top: it would be placed in frame-index
# coordinates and no longer match the path.
librosa.display.specshow(D, x_axis='time', y_axis='time', sr=SR,
                         cmap='gray_r', hop_length=HOP_SIZE)
# Columns of wp_s are (orig, purc): purchased time on x, original time on y.
ax.plot(wp_s[:, 1], wp_s[:, 0], marker='o', color='r')
ax.set_xlabel('Time (purchased audio)')
ax.set_ylabel('Time (original audio)')
plt.title('Warping Path on Acc. Cost Matrix $D$')
plt.colorbar();

In [None]:
# First ten entries of the reversed warping path
wp[:-11:-1]

In [None]:
# Reconstruct signal
# Lay the purchased audio out on the original's timeline: every original
# frame contributes one hop of purchased samples, taken from the purchased
# frame the warping path pairs it with. (Slicing purc_x with original-frame
# positions, without consulting the purchased index, would simply copy the
# start of purc_x unchanged.)
wp_fwd = np.asarray(wp)[::-1]
# An original frame can appear in several consecutive path steps; keep the
# first step at which each one occurs in wp_fwd.
orig_frames, first_step = np.unique(wp_fwd[:, 0], return_index=True)
purc_frames = wp_fwd[first_step, 1]
# Frame f is taken to start at sample f * HOP_SIZE; expand each purchased
# frame into the HOP_SIZE sample indices it covers.
purc_idx = (purc_frames[:, np.newaxis] * HOP_SIZE
            + np.arange(HOP_SIZE)).ravel()
# Never produce more samples than the original signal has.
purc_idx = purc_idx[:len(orig_x)]
# The last frame may reach past the end of purc_x; leave those samples silent.
in_range = purc_idx < len(purc_x)
y = np.zeros(len(purc_idx), dtype=purc_x.dtype)
y[in_range] = purc_x[purc_idx[in_range]]

In [None]:
# Number of samples in the original signal
orig_x.shape[0]

In [None]:
# Shape of the reconstructed signal, to compare with the original's length
np.shape(y)

In [None]:
# Listen to the reconstructed signal. IPython.display is imported explicitly
# in the notebook's import cell instead of relying on a bare `import IPython`
# placed here to have loaded the submodule.
IPython.display.Audio(data=y, rate=SR)