# Data Augmentation
Some classes of chords don't appear very often, so in order to balance our dataset, we will have to replicate the existing data at other pitches.
<br><br> In this notebook, there are some simple scripts to <b>pitch-shift audio tracks and their chord transcriptions.</b>
### Note:
After careful observation of the results of this type of data augmentation, I came to the conclusion that:
- It is very valuable, because my net can see chords that it didn't see before
- Augmenting **ALL** of the data doesn't resolve the **imbalance of the dataset**

In [1]:
import sys

# Make the project's helper packages importable from this notebook.
# A folder is appended only when it is not already on sys.path, so the cell
# can be re-run safely without piling up duplicate entries.
# The paths are relative, so they resolve against the current working directory.
sys.path.extend([
    src_dir
    for src_dir in (
        './src/audio_processing/',
        './src/data_processing/',
        './src/chord_parser/',
        './src/metrics/',
    )
    if src_dir not in sys.path
])

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#python input/output and regex
import re
import os
from pathlib import Path

#chord info
import pychord as pc

#librosa
import librosa
import librosa.display

## CQT spectrograms
- same feature settings as the ones we use for classification

In [3]:
# Feature settings. Per the section heading these mirror the CQT settings used
# for classification; only `sampling_rate` is consumed by the cells below.
sampling_rate = 22050      # sample rate used when loading and writing audio
hop_length = 2048          # presumably the CQT hop size in samples -- TODO confirm
bins_per_octave = 24       # presumably the CQT resolution -- TODO confirm
nbins = 192                # presumably the total number of CQT bins -- TODO confirm

# Where the untouched recordings live, and the root under which the
# pitch-shifted copies are written (one sub-folder per shift).
audiofiles_path = "Audiofiles/The Beatles/original"
shifted_audiofiles_path = "Audiofiles/The Beatles/"

In [None]:
# Script: write pitch-shifted copies of every original .wav track.
#
# For each track found under `audiofiles_path`, one shifted file is written per
# entry of `pitch_shifts` to
#   <shifted_audiofiles_path>/<shift>/<album>_<shift>/<track_no>.<shift>.wav
# Each entry is (folder/file suffix, signed number of semitones).
pitch_shifts = [('up1', 1), ('up2', 2), ('up3', 3), ('up4', 4), ('up5', 5),
                ('down1', -1), ('down2', -2), ('down3', -3), ('down4', -4), ('down5', -5)]

for filename in Path(audiofiles_path).glob('**/*.wav'):

    # The album is the name of the folder that directly contains the track.
    path, track = os.path.split(filename)
    path, album = os.path.split(path)

    # First digit of the file name plus the character after it
    # (presumably a two-digit track number -- verify against the file naming).
    track_no = re.search('([0-9].).', track).group(1)

    # Decode the track once and reuse the samples for every shift.
    y, sr = librosa.load(filename, sr=sampling_rate)

    for pitch_shift, nshift in pitch_shifts:

        # bins_per_octave=12 makes one step equal to one semitone, matching the
        # semitone transposition applied to the chord labels. With 24 steps per
        # octave each step would only be a quarter-tone.
        y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=nshift, bins_per_octave=12)

        # Create the destination folder on first use; writing fails otherwise.
        out_dir = shifted_audiofiles_path + pitch_shift + '/' + album + '_' + pitch_shift
        os.makedirs(out_dir, exist_ok=True)

        # NOTE(review): librosa.output was removed in librosa 0.8; on newer
        # versions this call needs to be replaced (e.g. by soundfile.write).
        librosa.output.write_wav(out_dir + '/' + track_no + '.' + pitch_shift + '.wav', y_shifted, sr=sampling_rate, norm=False)

## Chord Transposer by Interval

In [5]:
# Root folder under which the transposed chord-label files are written
# (one sub-folder per pitch shift).
target = "Big-Dataset/The Beatles/scratch/"

In [16]:
# Root name -> pitch class (0-11, C = 0). Enharmonic spellings share a value.
# 'N' and 'X' are non-pitched labels and get the out-of-range codes 12 and 13.
ChordLib = {
    'C': 0,
    'C#': 1, 'Db': 1,
    'D': 2,
    'D#': 3, 'Eb': 3,
    'E': 4, 'Fb': 4,
    'F': 5,
    'F#': 6, 'Gb': 6,
    'G': 7,
    'G#': 8, 'Ab': 8,
    'A': 9,
    'A#': 10, 'Bb': 10,
    'B': 11, 'Cb': 11,
    'N': 12,
    'X': 13,
}

# Sharp spelling -> flat spelling of the same pitch.
Harmonic_Equivalents = {
    'A#': 'Bb',
    'C#': 'Db',
    'D#': 'Eb',
    'F#': 'Gb',
    'G#': 'Ab',
}

# Pitch class -> the spelling used when writing transposed chords.
Semitone_List = [
    'C', 'C#', 'D', 'Eb', 'E', 'F',
    'F#', 'G', 'G#', 'A', 'Bb', 'B',
]

In [17]:
### get all chordlab files and write a transposed copy for every pitch shift

def transpose_chord(label, semitones, pitch_classes, note_names):
    """Return `label` with its root moved by `semitones` (negative = down).

    Parameters
    ----------
    label : str
        Chord label such as 'C', 'C:maj', 'A:min/b3' or 'G/5'. Everything from
        the first ':' or '/' onwards is kept unchanged.
    semitones : int
        Signed number of semitones to transpose by.
    pitch_classes : dict
        Maps a root name to its pitch class. Names mapped to a value outside
        ``range(len(note_names))`` are treated as non-pitched labels.
    note_names : list
        Spelling to use for each pitch class, indexed by pitch class.

    Returns
    -------
    str
        The transposed label. Non-pitched labels are returned as they are.

    Raises
    ------
    KeyError
        If the root of `label` is not a key of `pitch_classes`.
    """
    # Split into the root (up to the first ':' or '/') and the remainder.
    root, rest = re.match(r'([^:/]*)(.*)', label, re.DOTALL).groups()
    pitch_class = pitch_classes[root]
    if pitch_class >= len(note_names):
        # Labels like 'N' / 'X' carry no pitch, so there is nothing to transpose.
        return label
    # Modulo keeps the result inside the octave for both directions.
    return note_names[(pitch_class + semitones) % len(note_names)] + rest


# (folder/file suffix, signed number of semitones). The sign carries the
# direction, so "down" shifts really lower the root.
label_shifts = [('up1', 1), ('up2', 2), ('up3', 3), ('up4', 4), ('up5', 5),
                ('down1', -1), ('down2', -2), ('down3', -3), ('down4', -4), ('down5', -5)]

# Original annotations, indexed as Chordlab[artist][album][track_no].
Chordlab = {'The Beatles': {}}
for filename in Path('Big-Dataset/The Beatles').glob('**/scratch/original/*/*.lab'):

    # The album is the name of the folder that directly contains the .lab file.
    path, track = os.path.split(filename)
    path, album = os.path.split(path)
    track_no = re.search('([0-9].)_-_', track).group(1)

    original = pd.read_csv(filename, names=['Starts', 'Ends', 'Chord'], sep=' ', header=None)
    Chordlab['The Beatles'].setdefault(album, {})[track_no] = original

    for pitch_shift, nshift in label_shifts:

        # Timings are untouched; only the chord column is rewritten.
        df = original.assign(
            Chord=[transpose_chord(chord, nshift, ChordLib, Semitone_List)
                   for chord in original['Chord']]
        )

        # Create the destination folder on first use; to_csv does not do it.
        out_dir = target + pitch_shift + '/' + album + '_' + pitch_shift
        os.makedirs(out_dir, exist_ok=True)

        newfilename = out_dir + '/' + track
        df.to_csv(newfilename, sep=' ', index=False, header=False)