# Dataset exploration

---

# Imports

Data exploration will be done using the **librosa** library for audio, and the **jams** library for annotations.

In [None]:
import IPython
import librosa
import librosa.display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import jams
import os

# Dataset size

In [None]:
# total size in MB of the entries inside each sub-folder of the raw data directory
sizes = {}

root = '../data/raw/'

for folder in os.listdir(root):
    folder_path = os.path.join(root, folder)
    # only descend into directories: a stray file in root (e.g. '.DS_Store')
    # would make os.listdir() raise NotADirectoryError
    if not os.path.isdir(folder_path):
        continue

    # sum the sizes (in bytes) of all direct entries of the folder
    size = sum(
        os.path.getsize(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )

    # bytes -> MB, rounded to two decimals
    sizes[folder] = np.round(size / 1e6, 2)

for key, val in sizes.items():
    print(f"Folder {key} is {val} MB")

# Annotation files: *.jams

## Loading *.jams files

Annotations are saved in *\*.jams* files. See [here](http://marl.smusic.nyu.edu/papers/humphrey_jams_ismir2014.pdf) for documentation/paper and [here](https://github.com/marl/jams) for the repo.

Here, we load all frequencies and MIDI notes played during a song, together with the string they were played on, and save the data in two dataframes.

In [None]:
# get list of file names in folder
def load_file_names(dir: str = '../data/raw/annotation/') -> list:
    """Return the paths of all entries found in a directory.

    Parameters
    ----------
    dir : str
        Directory to list. A trailing path separator is optional.

    Returns
    -------
    list
        One path (directory joined with entry name) per directory entry,
        in the order returned by ``os.listdir``.
    """
    # os.path.join only inserts a separator when `dir` does not already end
    # with one, so both 'folder' and 'folder/' yield valid paths
    return [os.path.join(dir, name) for name in os.listdir(dir)]


# loading jams file for song
def load_jams_file(f: str) -> jams.core.JAMS:
    """Load a single annotation file with ``jams.load``.

    ``f`` is handed to ``jams.load`` unchanged and the object it returns is
    passed straight back to the caller.
    """
    # return jams structure
    return jams.load(f)


# Maps a string index (0-5) to a display name for that guitar string.
# Tuning in index order: E2–A2–D3–G3–B3–E4
# NOTE(review): 'H' is presumably the German note name for the B string, and
# the lowercase 'e' presumably marks the high E4 string — confirm.
stringMap = {0: 'E', 1: 'A', 2: 'D', 3: 'G', 4: 'H', 5: 'e'}

# accessing single pickups:
def load_time_freq_data(j: jams.JAMS, namespace: str = 'pitch_contour') -> pd.DataFrame:
    """Collect the observations of all six strings in one dataframe.

    For every string index 0-5 the annotation ``j.annotations[namespace][i]``
    is converted to a dataframe; annotations that yield no columns (no
    observations) are skipped.

    Parameters
    ----------
    j : jams.JAMS
        Loaded annotation object.
    namespace : str
        Annotation namespace to read.

    Returns
    -------
    pd.DataFrame
        One row per observation with the first annotation column (the rows
        are sorted by its 'time' label), 'string' (string index),
        'string_name' (lookup in ``stringMap``) and the last column of the
        flattened 'value' entries (presumably the frequency — confirm against
        the annotation schema). The index is reset to 0..n-1.
        An empty dataframe is returned when no string has any observation.
    """
    # gather the per-string frames and concatenate once at the end instead of
    # growing an initially empty dataframe inside the loop
    frames = []

    # load all played frequencies and timings for all 6 strings
    for string_idx in range(6):
        data = pd.DataFrame(j.annotations[namespace][string_idx])
        if data.columns.size == 0:
            continue

        # flatten the dict-like 'value' entries into separate columns
        freqs = pd.json_normalize(data['value'])
        n_rows = freqs.shape[0]
        strings = pd.Series([string_idx] * n_rows, name='string')
        st_name = pd.Series([stringMap[string_idx]] * n_rows, name='string_name')
        frames.append(
            pd.concat([data.iloc[:, 0], strings, st_name, freqs.iloc[:, -1]], axis=1)
        )

    # without this guard sort_values(by='time') raises KeyError on an empty frame
    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, axis=0)
    df = df.sort_values(by='time')
    return df.reset_index(drop=True)


def load_time_midi_data(j: jams.JAMS, namespace: str = 'note_midi') -> pd.DataFrame:
    """Collect the note observations of all six strings in one dataframe.

    For every string index 0-5 the annotation ``j.annotations[namespace][i]``
    is converted to a dataframe; annotations that yield no columns (no
    observations) are skipped.

    Parameters
    ----------
    j : jams.JAMS
        Loaded annotation object.
    namespace : str
        Annotation namespace to read.

    Returns
    -------
    pd.DataFrame
        One row per observation with all annotation columns except the last
        two (presumably time and duration — confirm), 'string' (string
        index), 'string_name' (lookup in ``stringMap``) and the
        second-to-last annotation column, used here as 'value' and rounded to
        the nearest whole number. Rows are sorted by 'time' and the index is
        reset to 0..n-1.
        An empty dataframe is returned when no string has any observation.
    """
    # gather the per-string frames and concatenate once at the end instead of
    # growing an initially empty dataframe inside the loop
    frames = []

    # load all played midi notes and timings for all 6 strings
    for string_idx in range(6):
        data = pd.DataFrame(j.annotations[namespace][string_idx])
        if data.columns.size == 0:
            continue

        n_rows = data.shape[0]
        strings = pd.Series([string_idx] * n_rows, name='string')
        st_name = pd.Series([stringMap[string_idx]] * n_rows, name='string_name')
        frames.append(
            pd.concat([data.iloc[:, :-2], strings, st_name, data.iloc[:, -2]], axis=1)
        )

    # without this guard the 'value' / 'time' lookups raise KeyError on an empty frame
    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, axis=0)
    # round the note values to whole numbers
    df['value'] = np.round(df['value'], 0)
    df = df.sort_values(by='time')
    return df.reset_index(drop=True)

In [None]:
# sorted paths of all annotation files in the default annotation directory
files = sorted(load_file_names())
# index (into `files`) of the annotation file explored in the following cells
n_file = 1
# parse the selected file and build the frequency and midi dataframes from it
jams_file = load_jams_file(files[n_file])
song_df = load_time_freq_data(jams_file)
midi_df = load_time_midi_data(jams_file)

Frequencies are recorded more often than MIDI notes, as is evident from the dataframe shapes. This is probably because MIDI notes are recorded only at the onset of a note.

In [None]:
# compare the (rows, columns) shapes of the frequency and the midi dataframe
song_df.shape, midi_df.shape

In [None]:
# print the shape and show the first ten rows of each dataframe
print(f"Frequencies over time {song_df.shape}")
display(song_df.head(10))
print(f"Midi notes over time {midi_df.shape}")
display(midi_df.head(10))

**What other data are in the \*.jams files?**

In [None]:
import json
import pprint

# read the raw JSON behind the selected *.jams file; the context manager
# closes the file handle, which a bare open() call would leave open
with open(files[n_file], 'rb') as fh:
    data = json.load(fh)

# depth=4 cuts off deeper nesting levels so the overall structure stays readable
pprint.pprint(data, depth=4)

Available namespaces in JAMS

In [None]:
# show the annotation namespaces available in the jams library
jams.list_namespaces()

**How many annotations do we have? How many are compositions (backing tracks) and how many are solos?**

In [None]:
# tally the annotation files by recording type and by key, based on their paths
n_annot = len(files)
n_solo = sum('solo' in path for path in files)
# a file counts as a backing track only when it is not already counted as a solo
n_comp = sum('solo' not in path and 'comp' in path for path in files)
# files whose path carries the '-C_' marker
n_c = sum('-C_' in path for path in files)

print(f"There are {n_annot} annotation files, split into {n_solo} solos and {n_comp} backing tracks. They are distributed over 5 different genres:")
print("Bossa Nova, Funk, Jazz, Rock, Singer Songwriter")
# NOTE(review): the division by 2 presumably accounts for every song existing
# as both a solo and a backing-track file — confirm
print(f"{n_c/2} songs are in C")

**Visualizing annotation data**

In [None]:
fig, _ = plt.subplots(figsize=(12, 8))

# displaying the annotated frequencies over time, colored by string index
sns.scatterplot(data=song_df, x='time', y='frequency', hue='string');

In [None]:
fig, _ = plt.subplots(figsize=(12, 8))

# displaying the annotated midi note values over time, colored by string index
sns.scatterplot(data=midi_df, x='time', y='value', hue='string');

## Audio files

**Importing and visualizing corresponding audio data**

For further information see [here](https://www.kdnuggets.com/2020/02/audio-data-analysis-deep-learning-python-part-1.html)

In [None]:
# build the path of the hex-pickup recording that belongs to the selected annotation:
# file name of the annotation without directory ...
annotation_name = files[n_file].rsplit('/', 1)[-1]
# ... cut at the first dot to drop the extension
track_id = annotation_name.partition('.')[0]
file = f'../data/raw/audio_hex-pickup_debleeded/{track_id}_hex_cln.wav'

In [None]:
# listening to the file
IPython.display.Audio(file)

In [None]:
# storing audio file as floating point time series (x) and sample rate (sr)
# setting sr to 'None' preserves native sampling rate of the file
x, sr = librosa.load(path=file, sr=None)

print(f"There are {x.shape[0]} points in the audio file with a sample rate of {sr/1000:.2f} kHz.")

In [None]:
# shape of the waveform in the time domain
fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.waveshow(x, sr=sr);

In [None]:
# Perform a short-time Fourier transform (STFT) on x
# This gives the amplitude of each frequency for every time frame
X = librosa.stft(x)

# Convert the magnitude of the STFT to decibels (dB)
XdB = librosa.amplitude_to_db(abs(X))

# Plot the resulting spectrogram (Frequency vs. Time, colorcode: dB)
# using specshow with y_axis='log', signals happening in the midrange are better visible
fig, ax = plt.subplots(figsize=(14, 5))
img = librosa.display.specshow(XdB, sr=sr, x_axis='time', y_axis='log')
fig.colorbar(img, ax=ax, format="%+2.f dB");

In [None]:
# ConstantQ transformation is also possible and sometimes better:
# its frequency bins are spaced logarithmically
# hop length and bin resolution are named so that the transform and the plot
# below are guaranteed to use the same values
hop_length = 512
bins_per_octave = 24
X = np.abs(librosa.cqt(x, sr=sr, hop_length=hop_length, n_bins=192, bins_per_octave=bins_per_octave))

# Convert the magnitudes to decibels (dB) relative to the maximum value
XdB = librosa.amplitude_to_db(X, ref=np.max)

# Plot the resulting spectrogram (Frequency vs. Time, colorcode: dB)
# y_axis='cqt_hz' labels the rows with the CQT bin frequencies; y_axis='hz'
# would treat them as linearly spaced FFT bins and mislabel the axis
fig, ax = plt.subplots(figsize=(14, 5))
img = librosa.display.specshow(XdB, sr=sr, hop_length=hop_length, x_axis='time',
                               y_axis='cqt_hz', bins_per_octave=bins_per_octave, ax=ax)
fig.colorbar(img, ax=ax, format="%+2.f dB");