# Audio Alignment for Harmonix Set

This notebook tries to align purchased audio with original audio from Harmonix. 

More specifically, for each pair of audio files:
- Load both audio files
- Compute log-scaled mel spectrograms
- Use DTW to find the correct start and end points of alignment
- Produce the new aligned audio files from the purchased audio

In [None]:
from __future__ import print_function
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

import librosa
from librosa import display

# Data locations. Each one can be overridden through an environment variable so
# the notebook runs on another machine without editing this cell; the defaults
# are the original, machine-specific locations.
ORIG_MP3_PATH = os.environ.get("HARMONIX_ORIG_MP3_PATH",
                               "/Users/onieto/Desktop/Harmonix/audio/")
PURC_MP3_PATH = os.environ.get("HARMONIX_PURC_MP3_PATH",
                               "/Users/onieto/Dropbox/drop/HarmonixMP3_YouTube/")
# NOTE: despite the "TSV" in the name, this is a comma-separated file.
METADATA_TSV = os.environ.get("HARMONIX_METADATA", "../dataset/metadata.csv")

# Analysis parameters.
# NOTE(review): N_FFT is not referenced by any cell visible in this notebook,
# so the feature extraction runs with librosa's default FFT size -- confirm
# whether that is intended.
N_FFT = 4096
HOP_SIZE = 1024  # hop between analysis frames, in samples
SR = 22050       # sample rate (Hz) requested when loading audio
N_MELS = 80      # number of mel bands

%matplotlib inline

In [None]:
# Read the comma-separated metadata table and preview its first rows
meta_df = pd.read_csv(filepath_or_buffer=METADATA_TSV, delimiter=",")
meta_df.head(n=5)

In [None]:
# Pick one track (position 7, 0-based, in the metadata table) and decode both
# of its versions -- the Harmonix original and the purchased copy -- at SR.
file_id = meta_df["File"].iloc[7]
print(file_id)

# Both collections name their files "<file_id>.mp3".
mp3_name = file_id + ".mp3"
orig_path, purc_path = (os.path.join(root, mp3_name)
                        for root in (ORIG_MP3_PATH, PURC_MP3_PATH))

# librosa.load returns (samples, sample_rate); the rate is discarded.
orig_x, _ = librosa.load(orig_path, sr=SR)
purc_x, _ = librosa.load(purc_path, sr=SR)

In [None]:
def _log_mel(signal):
    """Return the dB-scaled mel spectrogram of ``signal``.

    Uses the notebook-wide SR, HOP_SIZE and N_MELS settings.
    NOTE(review): ``n_fft`` is not passed, so the N_FFT constant has no effect
    here and librosa's default FFT size applies -- confirm this is intended.
    """
    mel_power = librosa.feature.melspectrogram(
        y=signal, sr=SR, hop_length=HOP_SIZE, n_mels=N_MELS)
    return librosa.power_to_db(mel_power)


# Same features for both versions so their frames are directly comparable.
orig_mel = _log_mel(orig_x)
purc_mel = _log_mel(purc_x)

In [None]:
# Apply DTW
# D is the accumulated cost matrix (rows: original frames, columns: purchased
# frames); wp is the warping path as (original, purchased) frame-index pairs.
D, wp = librosa.sequence.dtw(X=orig_mel, Y=purc_mel, metric='cosine')

# Frame indices -> seconds. float() keeps this a true division even on
# Python 2, where int / int would silently truncate to whole seconds.
wp_s = np.asarray(wp) * HOP_SIZE / float(SR)

# Draw D exactly once, on time axes. A second imshow of D on the same axes
# would use frame-index coordinates and no longer line up with the path,
# which is plotted in seconds.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
imax = librosa.display.specshow(D, x_axis='time', y_axis='time', sr=SR,
                                hop_length=HOP_SIZE, cmap='gray_r', ax=ax)

# Column 1 of the path is the purchased audio (x axis), column 0 the original
# (y axis), matching the row/column layout of D.
ax.plot(wp_s[:, 1], wp_s[:, 0], marker='o', color='r')
ax.set_title('Warping Path on Acc. Cost Matrix $D$')
ax.set_xlabel('Time (purchased)')
ax.set_ylabel('Time (original)')
fig.colorbar(imax, ax=ax);

In [None]:
# Reconstruct signal:
# Walk the original's frames in order and, for each one, copy the
# HOP_SIZE-sample chunk of the purchased signal that DTW matched to it.

# The path is traversed reversed; when several purchased frames are matched to
# the same original frame, the one visited last in this traversal is kept.
orig_dict = {}
for orig_frame, purc_frame in wp[::-1]:
    orig_dict[int(orig_frame)] = int(purc_frame)

# Collect array slices and concatenate once, instead of growing a Python list
# one sample at a time.
chunks = []
last_samp = 0  # defined up front so an empty path cannot raise a NameError
for i in range(len(orig_dict)):
    samp = orig_dict[i] * HOP_SIZE
    last_samp = samp + HOP_SIZE
    chunks.append(purc_x[samp:last_samp])

# If the copied chunks fall short of the original's length, top up with the
# purchased samples that follow the last chunk. The guard matters: a negative
# remainder must not be turned into a slice bound.
missing = len(orig_x) - sum(len(chunk) for chunk in chunks)
if missing > 0:
    chunks.append(purc_x[last_samp:last_samp + missing])

# Never return more samples than the original has, so both signals share one
# timeline. (The output can still be shorter if the purchased audio runs out.)
if chunks:
    y = np.concatenate(chunks)[:len(orig_x)]
else:
    y = np.zeros(0, dtype=purc_x.dtype)

In [None]:
# Render an inline player for the reconstructed signal (shown as cell output)
import IPython.display
IPython.display.Audio(y, rate=SR)

In [None]:
# Save the aligned signal to disk. write_wav always emits WAV data, so the file
# is named *.wav -- an ".mp3" extension would mislabel its contents.
# NOTE(review): librosa.output was deprecated in librosa 0.7 and removed in
# 0.8; on newer versions this call needs to be replaced (e.g. soundfile.write).
librosa.output.write_wav("/Users/onieto/Desktop/test2.wav", np.asarray(y), sr=SR)