# Data Prep

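The goal of this notebook is to prepare the data for the alignment task. This includes extracting chroma features from the clean audio, extracting the same features from noisy versions of the audio, and generating the list of recording pairs to align.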

In [1]:
import numpy as np
import librosa as lb
import os
import os.path
from pathlib import Path
import multiprocessing

In [2]:
# All paths below are relative, so they resolve against the notebook's working directory.
# NOTE(review): ANNOTATIONS_ROOT is not referenced in the cells shown here — presumably used by a later notebook; confirm.
ANNOTATIONS_ROOT = Path('Chopin_Mazurkas/annotations_beat')
# Root under which the .wav file for each file-list entry is looked up
AUDIO_ROOT = Path('Chopin_Mazurkas/wav_22050_mono')
# Root under which every feature set (clean and noisy) is written
FEATURES_ROOT = Path('features')
# Text files listing one relative path per line (no file extension)
train_files = Path('cfg_files/filelist.train.txt')
test_files = Path('cfg_files/filelist.test.txt')

In [3]:
# Create the feature root if it is missing. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
FEATURES_ROOT.mkdir(parents=True, exist_ok=True)

### Compute features on clean audio

First we compute features on the original audio.

In [4]:
def compute_chroma_single(infile, outfile, sr = 22050, hop_length=512):
    """Compute CQT-based chroma features for one audio file and save them.

    Args:
        infile: path of the audio file to load.
        outfile: destination handed to np.save.
        sr: sampling rate requested from the loader.
        hop_length: hop size (in samples) between feature frames.

    Returns:
        None. The feature matrix is written to `outfile`.
    """
    y, sr = lb.load(infile, sr=sr)
    # The signal must be passed as the keyword `y`: librosa >= 0.10 made the
    # feature-function arguments keyword-only, and the keyword form also works
    # on older releases.
    # (chroma_cens was the alternative tried earlier; it stays disabled.)
    F = lb.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length, norm=2)
    np.save(outfile, F)
    return

In [5]:
def compute_chroma_batch(filelist, outdir, n_cores):
    """Compute clean chroma features for every entry of a file list, in parallel.

    Each non-blank line of `filelist` is a relative path without extension.
    The audio is read from AUDIO_ROOT / <relpath>.wav and the features are
    written to outdir / <relpath>.npy. Entries whose feature file already
    exists are reported and skipped.

    Args:
        filelist: path of the text file listing the entries.
        outdir: pathlib.Path of the root directory for the feature files.
        n_cores: number of worker processes to use.

    Returns:
        None.
    """
    # prep inputs for parallelization
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            relpath = line.strip()
            if not relpath:
                # a blank line would otherwise be handled as a file with an empty name
                continue
            reldir, fileid = os.path.split(relpath)
            featdir = outdir / reldir
            featdir.mkdir(parents=True, exist_ok=True)
            featfile = (featdir / fileid).with_suffix('.npy')
            if featfile.exists():
                print(f"Skipping {featfile}")
            else:
                audiofile = (AUDIO_ROOT / relpath).with_suffix('.wav')
                inputs.append((audiofile, featfile))

    # nothing left to compute: do not spawn worker processes at all
    if not inputs:
        return

    # process files in parallel; the context manager shuts the workers down
    # once starmap has returned, so repeated calls do not pile up processes
    with multiprocessing.Pool(processes = n_cores) as pool:
        pool.starmap(compute_chroma_single, inputs)

    return

In [6]:
# Clean features for the train and the test list share one output tree.
FEATS_CLEAN_DIR = FEATURES_ROOT / 'clean'
for filelist in (train_files, test_files):
    compute_chroma_batch(filelist, FEATS_CLEAN_DIR, 24)

Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Afanassiev-2001_pid9130-01.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Ashkenazy-1981_pid9058-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Beliavsky-2004_pid9152-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Ben-Or-1989_pid9161-12.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Biret-1990_pid9062-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Blet-2003_pid9103-07.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Block-1995_pid9064-04.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Brailowsky-1960_pid9060-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Chiu-1999_pid9048-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Clidat-1994_pid9067-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Cohen-1997_pid9105-13.npy
Skipping features/clean/Chopin_Op017No4/Chopin_Op017No4_Coop-1987_pid9159-04.npy

Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Rudanovskaya-2007_pid610242-02.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Shebanova-2002_pid9072-15.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Smith-1975_pid9054-15.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Sztompka-1959_pid9170-15.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Tanyel-1992_pid917813-03.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Tsujii-2005_pid9078-07.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Uninsky-1959_pid9061-15.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Vardi-1988_pid9173-20.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Wasowski-1980_pid9111-15.npy
Skipping features/clean/Chopin_Op024No2/Chopin_Op024No2_Zimerman-1975_pid6100021-11.npy
Skipping features/clean/Chopin_Op030No2/Chopin_Op030No2_Ashkenazy-1981_pid9058-19.npy
Skipping features/clean/Chopin_Op030No2/Chopin_Op030No2_Ashkena

### Compute features on noisy audio (AWGN)

Now we compute chroma features on the audio after adding white Gaussian noise (AWGN) at each of the specified SNR levels.

In [7]:
def compute_chroma_AWGN_single(infile, outfile, snr_db, sr = 22050, hop_length=512, seed=None):
    """Add white Gaussian noise to one audio file and save its chroma features.

    The noise power is the mean signal power scaled by 10 ** (-snr_db / 10).

    Args:
        infile: path of the audio file to load.
        outfile: destination handed to np.save.
        snr_db: target signal-to-noise ratio in dB.
        sr: sampling rate requested from the loader.
        hop_length: hop size (in samples) between feature frames.
        seed: optional seed for the noise generator. The default (None) draws
            fresh entropy on every call; pass a value to make the noise
            reproducible.

    Returns:
        None. The feature matrix is written to `outfile`.
    """
    y, sr = lb.load(infile, sr=sr)
    noise_power = np.mean(y*y) * np.power(10, -1*snr_db/10)
    # Use a per-call generator instead of the global np.random state: forked
    # pool workers inherit a copy of that global state, so different files
    # could otherwise receive the very same noise samples.
    rng = np.random.default_rng(seed)
    noise = rng.standard_normal(len(y)) * np.sqrt(noise_power)
    y_noisy = y + noise
    # The signal must be passed as the keyword `y`: librosa >= 0.10 made the
    # feature-function arguments keyword-only.
    # (chroma_cens was the alternative tried earlier; it stays disabled.)
    F = lb.feature.chroma_cqt(y=y_noisy, sr=sr, hop_length=hop_length, norm=2)
    np.save(outfile, F)
    return

In [8]:
def compute_chroma_AWGN_batch(filelist, outdir, n_cores, snr_db):
    """Compute noisy (AWGN) chroma features for every entry of a file list, in parallel.

    Each non-blank line of `filelist` is a relative path without extension.
    The audio is read from AUDIO_ROOT / <relpath>.wav and the features are
    saved under outdir / <relpath>. Existing feature files are recomputed,
    not skipped.

    Args:
        filelist: path of the text file listing the entries.
        outdir: pathlib.Path of the root directory for the feature files.
        n_cores: number of worker processes to use.
        snr_db: signal-to-noise ratio in dB, forwarded to the per-file worker.

    Returns:
        None.
    """
    # prep inputs for parallelization
    inputs = []
    with open(filelist, 'r') as f:
        for line in f:
            relpath = line.strip()
            if not relpath:
                # a blank line would otherwise be handled as a file with an empty name
                continue
            reldir, fileid = os.path.split(relpath)
            featdir = outdir / reldir
            featdir.mkdir(parents=True, exist_ok=True)
            featfile = featdir / fileid
            audiofile = (AUDIO_ROOT / relpath).with_suffix('.wav')
            inputs.append((audiofile, featfile, snr_db))

    # nothing to compute: do not spawn worker processes at all
    if not inputs:
        return

    # process files in parallel; the context manager shuts the workers down
    # once starmap has returned, so the repeated calls of an SNR sweep do not
    # pile up processes
    with multiprocessing.Pool(processes = n_cores) as pool:
        pool.starmap(compute_chroma_AWGN_single, inputs)

    return

In [9]:
# One feature tree per SNR level, each covering the train and the test list.
snrs = [20, 15, 10, 5, 0, -5]
for snr in snrs:
    print(f'Computing features at {snr}dB')
    featdir = FEATURES_ROOT / f'awgn{snr}dB'
    for filelist in (train_files, test_files):
        compute_chroma_AWGN_batch(filelist, featdir, 24, snr)

Computing features at 20dB
Computing features at 15dB
Computing features at 10dB
Computing features at 5dB
Computing features at 0dB
Computing features at -5dB


### Generate query list

Here we generate a file listing every pair of recordings of the same piece; each line names the two files to be aligned.

In [16]:
def generate_query_list(filelist, outfile):
    """Write all within-piece pairs of recordings from `filelist` to `outfile`.

    Every line of `filelist` must have the form '<piece>/<fileid>'. For each
    piece, every unordered pair of its recordings is written once as
    '<piece>/<fileid1> <piece>/<fileid2>', with the recording listed earlier
    in `filelist` first.

    Args:
        filelist: path of the input file list.
        outfile: path of the query list to write (overwritten if present).

    Returns:
        None.
    """
    # collect the recordings of each piece, in file-list order
    recordings_by_piece = {}
    with open(filelist, 'r') as fin:
        for entry in fin:
            fields = entry.strip().split('/')
            assert len(fields) == 2
            piece, fileid = fields
            recordings_by_piece.setdefault(piece, []).append(fileid)

    # write each pair once: a recording is paired only with those listed after it
    with open(outfile, 'w') as fout:
        for piece, fileids in recordings_by_piece.items():
            for idx, fileid1 in enumerate(fileids):
                for fileid2 in fileids[idx + 1:]:
                    fout.write(f'{piece}/{fileid1} {piece}/{fileid2}\n')

    return

In [17]:
# One query list per split, written next to the file lists.
train_queries = 'cfg_files/query.train.list'
test_queries = 'cfg_files/query.test.list'
for filelist, queryfile in ((train_files, train_queries), (test_files, test_queries)):
    generate_query_list(filelist, queryfile)