# Audio Alignment for Harmonix Set

This notebook tries to align purchased audio with original audio from Harmonix. 

More specifically, for each pair of audio files:
- Load both audio files
- Compute chromagrams
- Use DTW to find the correct start and end points of alignment
- Produce the new aligned mp3s from the purchased audio

In [18]:
from __future__ import print_function
import glob
import IPython
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

import librosa
from librosa import display
from tqdm import tqdm_notebook as tqdm

# ORIG_MP3_PATH = "/Users/onieto/Desktop/Harmonix/audio/"
# PURC_MP3_PATH = "/Users/onieto/Dropbox/drop/HarmonixMP3_YouTube/"
ORIG_MP3_PATH = "/home/uri/Dropbox/drop/HarmonixMP3_original/"
PURC_MP3_PATH = "/home/uri/Dropbox/drop/HarmonixMP3_YouTube/"
METADATA_TSV = "../dataset/metadata.csv"
OUT_DIR = "aligned_mp3s"
N_FFT = 8192
HOP_SIZE = 1024
METRIC = "euclidean"
SR = 22050
N_MELS = 90

%matplotlib inline

In [19]:
# Load metadata
meta_df = pd.read_csv(METADATA_TSV, sep=",")
meta_df.head()

Unnamed: 0,File,Title,Artist,Release,Duration,BPM,Ratio Bars in 4,Time Signature,Genre,MusicBrainz Id,Acoustid Id
0,0001_12step,"1, 2 Step",Ciara,Goodies,142.47,113,100.0,4|4,R&B,0408655f-189f-371b-9c41-ec861e1a7810,4708e4ae-a3eb-4b7a-b701-ff3a142b2bcb
1,0003_6foot7foot,6 Foot 7 Foot,Lil Wayne,Tha Carter IV,157.347,84,100.0,4|4,Hip-Hop,83347ae2-5def-378a-a3f5-96ec56c25ab7,
2,0004_abc,ABC,The Jackson 5,Hits for Kids Pop Party 8,180.955,94,94.594595,4|4,Pop-Rock,5f1604ed-5c6b-4a85-8391-15aa61ae7f98,88ddde1c-8009-497a-b295-e61125bb5162
3,0005_again,Again,Flyleaf,Memento Mori,192.067,78,0.0,6|8,Alternative,09aed1ac-4094-3337-86ef-8303531d57f1,
4,0006_aint2proud2beg,Ain’t 2 Proud 2 Beg,TLC,Now & Forever: The Hits,181.034,105,100.0,4|4,R&B,09723bc0-b3e9-4f86-a563-c80d25df049e,


In [20]:
def alignment_score(dtw_curve):
    """The alignment score is simply the average of the difference of 
    the _purchased_ track's DTW alignment curve."""
    return np.mean(np.diff(dtw_curve[:,1][::-1]))

def reconstruct_signal(orig_x, purc_x, dtw_curve):
    """Reconstructs the signal from the purchased signal using the most similar frames
    from the original signal.
    
    We basically take exactly as many frames as the original signal and get the 
    closest to each of these frames from the purchased signal given the dtw curve."""
    orig_dict = {}
    for w in dtw_curve[::-1]:
        orig_dict[w[0]] = w[1]
    y = []
    for i in range(len(orig_dict)):
        samp = orig_dict[i] * HOP_SIZE
        y += list(purc_x[samp:samp + HOP_SIZE])
    last_samp = samp + HOP_SIZE
    y += list(purc_x[last_samp:last_samp + (len(orig_x) - len(y))])
    return y

def compute_alignment(file_id, align_thres=0.9, is_plot=False):
    """Main function to do the alignment between two songs of the same id.
    """
    # Load mp3s
    orig_path = os.path.join(ORIG_MP3_PATH, file_id + ".mp3")
    purc_path = os.path.join(PURC_MP3_PATH, file_id + ".mp3")
    orig_x, _ = librosa.load(orig_path, sr=SR)
    purc_x, _ = librosa.load(purc_path, sr=SR)
    
    # Compute melspecs
    orig_mel = librosa.power_to_db(
        librosa.feature.melspectrogram(y=orig_x, sr=SR, hop_length=HOP_SIZE, n_mels=N_MELS))
    purc_mel = librosa.power_to_db(
        librosa.feature.melspectrogram(y=purc_x, sr=SR, hop_length=HOP_SIZE, n_mels=N_MELS))
    
    # Apply DTW
    D, wp = librosa.sequence.dtw(X=orig_mel, Y=purc_mel, metric='euclidean')
    score = alignment_score(wp)
    
    # Plot
    if is_plot:
        wp_s = np.asarray(wp) * HOP_SIZE / SR
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)
        librosa.display.specshow(D, x_axis='time', y_axis='time',
                                 cmap='gray_r', hop_length=HOP_SIZE)
        imax = ax.imshow(D, cmap=plt.get_cmap('gray_r'),
                         origin='lower', interpolation='nearest', aspect='auto')
        ax.plot(wp_s[:, 1], wp_s[:, 0], marker='o', color='r')
        plt.title('Warping Path on Acc. Cost Matrix $D$')
        plt.colorbar()
        
    # Return reconstructed signal and score
    return reconstruct_signal(orig_x, purc_x, wp), score

In [None]:
# Compute alignment for all the dataset, creating new audio files and storing the alignment scores
out = {"File": [], "score": []}
for i, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
    file_id = row["File"]
    
    # Do alignment
    y, score = compute_alignment(file_id)
    
    # Save wav
    librosa.output.write_wav(os.path.join(OUT_DIR, file_id + ".wav"), np.asarray(y), sr=SR)
    
    # Save score
    out["File"].append(file_id)
    out["score"].append(score)

HBox(children=(IntProgress(value=0, max=912), HTML(value='')))



In [None]:
IPython.display.Audio(data=y, rate=SR)

In [79]:
out_df = pd.DataFrame(out)
out_df.to_csv("aligned_scores_new.tsv", sep=",", index=None)

In [17]:
out = {"File": [], "score": []}
sub_df = pd.read_csv("aligned_scores.tsv")
sub_df = sub_df[sub_df["score"] < 0.8]
for i, row in tqdm(sub_df.iterrows(), total=len(sub_df)):
    file_id = row["File"]
    file_id = "0843_omgalmighty"
    
    # Do alignment
    y, score = compute_alignment(file_id)
    print(file_id, score)
    
    # Save wav
    librosa.output.write_wav(os.path.join(OUT_DIR, file_id + ".wav"), np.asarray(y), sr=SR)
    
    # Save score
    out["File"].append(file_id)
    out["score"].append(score)
    break

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))



0843_omgalmighty 0.9871056873129174


In [8]:
pd.DataFrame(out)

Unnamed: 0,File,score
0,0155_lapdance,0.628448
1,0161_limelight,0.979507
2,0194_nomoretears,0.983505
3,0218_policyoftruth,0.77479
4,0277_thashiznit,0.956962
5,0470_ours,0.653731
6,0541_youdroppedabombonme,0.621284
7,0548_2getherextended,0.968944
8,0549_515,0.980725
9,0562_alreadygone,0.999672
