# Data Processing for PitchShiftNN
How to process training and testing data for PitchShiftNN.

## Global variables/Imports
Run these cells before running either of the following sections.

In [2]:
%load_ext autoreload
%autoreload 1

import os
import csv

import scipy.io as sio
from scipy.io import wavfile
from scipy.io.wavfile import write

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots

%aimport VPM
from VPM import *
%aimport Utils
from Utils import *

In [3]:
# Dataset dimensions. These are tied to the dataset itself and should only
# change if the dataset changes.
n_pitches = 16
n_vowels = 12
n_people = 3

# The lookups below are kept mainly for reference.

# Vowel label -> vowel word, in label order 0..11.
label_to_vowel = dict(enumerate([
    "bed", "bird", "boat", "book",
    "cat", "dog",  "feet", "law",
    "moo", "nut",  "pig",  "say",
]))

# Vowel word -> vowel label (the inverse of label_to_vowel).
vowel_to_label = {vowel: label for label, vowel in label_to_vowel.items()}

# Pitch index -> note name, from A2 (index 0) through C4 (index 15).
noteidx_to_pitch = dict(enumerate([
    "A2",  "Bb2", "B2",  "C3",
    "Db3", "D3",  "Eb3", "E3",
    "F3",  "Gb3", "G3",  "Ab3",
    "A3",  "Bb3", "B3",  "C4",
]))

### Getting data references
Read the reference csv to relevant data structure.

`data_ref_list` is the list of filenames in the dataset in a 3d array format.
A specific file is accessed with `data_ref_list[vowel_idx][pitch_idx][person_idx]`.

`flat_data_ref_list` is the list of filenames in the dataset as a 1d array. To access a specific file, use `flat_data_ref_list[flat_ref_idx(vowel, pitch, person)]`

In [4]:
# Filenames of the dataset as a 3d structure, read via the reference csv at
# Data/dataset_files.csv.
# e.g. data_ref_list[vowel_to_label["dog"]][5][1]
# NOTE(review): the dimension arguments are passed here in the order
# (n_pitches, n_vowels, n_people), but to flatten_3d_array below in the order
# (n_vowels, n_pitches, n_people) -- confirm both against the helper signatures.
data_ref_list = create_data_ref_list(os.path.join("Data", 'dataset_files.csv'),
                                     n_pitches, n_vowels, n_people)

# The same filenames as a 1d list.
# e.g. flat_data_ref_list[flat_ref_idx(3, 1, 2)]
# (flat_ref_idx is defined in a later cell, so that example only works once
# that cell has been run.)
flat_data_ref_list = flatten_3d_array(data_ref_list, 
                                      n_vowels, n_pitches, n_people)

The following are the accessor functions used to compute indices from flat to 3d and vice versa.

`flat_ref_idx` returns a flat index, given a `(vowel, pitch, person)` triple, while `nd_ref_idx` returns `vowel, pitch, person`, given a flat index.

In [5]:
def flat_ref_idx(vowel, pitch, person):
    """Return the flat index for a (vowel, pitch, person) triple.

    Thin wrapper around flat_3d_array_idx that supplies the dataset
    dimensions (n_vowels, n_pitches, n_people). The dimensions are read
    from the notebook globals at call time.
    """
    return flat_3d_array_idx(
        vowel, pitch, person, n_vowels, n_pitches, n_people)


def nd_ref_idx(idx):
    """Return the result of nd_array_idx for the flat index `idx`.

    Thin wrapper around nd_array_idx that supplies the dataset dimensions
    (n_vowels, n_pitches, n_people). Used in this notebook as
    `vowel, pitch, person = nd_ref_idx(idx)`.
    """
    return nd_array_idx(idx, n_vowels, n_pitches, n_people)

In [6]:
# Show the number of referenced files, followed by the flat filename list.
print("Data ref list ({count}):".format(count=len(flat_data_ref_list)),
      flat_data_ref_list)

Data ref list (576): ['0_0-bed_0-A2.wav', '2_0-bed_0-A2.wav', '3_0-bed_0-A2.wav', '0_0-bed_1-Bb2.wav', '2_0-bed_1-Bb2.wav', '3_0-bed_1-Bb2.wav', '0_0-bed_2-B2.wav', '2_0-bed_2-B2.wav', '3_0-bed_2-B2.wav', '0_0-bed_3-C3.wav', '2_0-bed_3-C3.wav', '3_0-bed_3-C3.wav', '0_0-bed_4-Db3.wav', '2_0-bed_4-Db3.wav', '3_0-bed_4-Db3.wav', '0_0-bed_5-D3.wav', '2_0-bed_5-D3.wav', '3_0-bed_5-D3.wav', '0_0-bed_6-Eb3.wav', '2_0-bed_6-Eb3.wav', '3_0-bed_6-Eb3.wav', '0_0-bed_7-E3.wav', '2_0-bed_7-E3.wav', '3_0-bed_7-E3.wav', '0_0-bed_8-F3.wav', '2_0-bed_8-F3.wav', '3_0-bed_8-F3.wav', '0_0-bed_9-Gb3.wav', '2_0-bed_9-Gb3.wav', '3_0-bed_9-Gb3.wav', '0_0-bed_10-G3.wav', '2_0-bed_10-G3.wav', '3_0-bed_10-G3.wav', '0_0-bed_11-Ab3.wav', '2_0-bed_11-Ab3.wav', '3_0-bed_11-Ab3.wav', '0_0-bed_12-A3.wav', '2_0-bed_12-A3.wav', '3_0-bed_12-A3.wav', '0_0-bed_13-Bb3.wav', '2_0-bed_13-Bb3.wav', '3_0-bed_13-Bb3.wav', '0_0-bed_14-B3.wav', '2_0-bed_14-B3.wav', '3_0-bed_14-B3.wav', '0_0-bed_15-C4.wav', '2_0-bed_15-C4.wav', '3_

### Data-label Pitch Index pairs
Generate the data-label pitch index pairs. This is an array where each element is a 3-tuple of `[shift_amt, input_pitch_idx, label_pitch_idx]`.


In [7]:
# data_label_pairs: list whose elements are
#   [shift_amt, input_pitch_idx, label_pitch_idx].
# data_label_pairs_dict: indexed by shift_amt further down in this notebook to
#   get the pairs for one shift amount (presumably a dict keyed by shift_amt --
#   confirm against create_data_label_pairs).
data_label_pairs, data_label_pairs_dict = create_data_label_pairs(n_pitches)

In [8]:
# Show the number of data-label pairs, followed by the pairs themselves.
print("Total data-label pairs ({count}):".format(count=len(data_label_pairs)),
      data_label_pairs)

Total data-label pairs (256): [[0, 0, 0], [0, 1, 1], [0, 2, 2], [0, 3, 3], [0, 4, 4], [0, 5, 5], [0, 6, 6], [0, 7, 7], [0, 8, 8], [0, 9, 9], [0, 10, 10], [0, 11, 11], [0, 12, 12], [0, 13, 13], [0, 14, 14], [0, 15, 15], [1, 0, 1], [1, 1, 2], [1, 2, 3], [1, 3, 4], [1, 4, 5], [1, 5, 6], [1, 6, 7], [1, 7, 8], [1, 8, 9], [1, 9, 10], [1, 10, 11], [1, 11, 12], [1, 12, 13], [1, 13, 14], [1, 14, 15], [2, 0, 2], [2, 1, 3], [2, 2, 4], [2, 3, 5], [2, 4, 6], [2, 5, 7], [2, 6, 8], [2, 7, 9], [2, 8, 10], [2, 9, 11], [2, 10, 12], [2, 11, 13], [2, 12, 14], [2, 13, 15], [3, 0, 3], [3, 1, 4], [3, 2, 5], [3, 3, 6], [3, 4, 7], [3, 5, 8], [3, 6, 9], [3, 7, 10], [3, 8, 11], [3, 9, 12], [3, 10, 13], [3, 11, 14], [3, 12, 15], [4, 0, 4], [4, 1, 5], [4, 2, 6], [4, 3, 7], [4, 4, 8], [4, 5, 9], [4, 6, 10], [4, 7, 11], [4, 8, 12], [4, 9, 13], [4, 10, 14], [4, 11, 15], [5, 0, 5], [5, 1, 6], [5, 2, 7], [5, 3, 8], [5, 4, 9], [5, 5, 10], [5, 6, 11], [5, 7, 12], [5, 8, 13], [5, 9, 14], [5, 10, 15], [6, 0, 6], [6, 1, 7],

### Get All .wav Data and STFT Data

Get the wav file data into a single matrix, where each element `all_wav_data[idx]` is the wavfile content of the file at `flat_data_ref_list[idx]`.

In addition, `all_spectrograms[idx]` is the corresponding STFT for that file. 

To retrieve the 3d indices of a specific index, use `vowel, pitch, person = nd_ref_idx(idx)`.

In [9]:
# Load every file named in flat_data_ref_list from Data/dataset; entry idx
# corresponds to flat_data_ref_list[idx].
all_wav_data = load_wav_files(
    os.path.join("Data", "dataset"), flat_data_ref_list)

# One STFT per loaded waveform, in the same order (plotting disabled).
all_spectrograms = np.array(
    [stft(track, plot=False) for track in all_wav_data])

In [10]:
# Report the shape of the wav container, the shape of the first track, and
# the shape of the stacked spectrograms.
print("All wav data length: {}".format(all_wav_data.shape),
      "Track length: {}".format(all_wav_data[0].shape),
      sep="\n")
print("All spectrograms shape: {}\n".format(all_spectrograms.shape))

All wav data length: (576,)
Track length: (29400,)
All spectrograms shape: (576, 513, 58)



## Create the training/testing datasets

### Create `X_train_base, X_val_base, Y_train, Y_val`

To generate data for each `shift_amt` in `0,...,15`, one should:

(1) Change the `shift_amt` parameter in the next cell, and

(2) Rerun the notebook from this step, until the end.

In [13]:
# EDIT THE SHIFT AMOUNT PARAMETER HERE

shift_amt = 0
pairs = data_label_pairs_dict[shift_amt]

# Seed for choosing the held-out person, so that re-running this cell
# reproduces the same train/validation split. Change it to get another split.
split_seed = 0
split_rng = np.random.RandomState(split_seed)

# X_train_base, Y_train: (_,513), (_,513)
# X_val_base, Y_val:     (_,513), (_,513)
X_train_base = []
X_val_base = []
Y_train = []
Y_val = []

for vow_idx in range(n_vowels):
    for pit_idx in range(n_pitches):

        # Only proceed if (input pitch, shifted pitch) is a valid pair for
        # this shift amount.
        if [shift_amt, pit_idx, pit_idx + shift_amt] not in pairs:
            continue

        # Choose the person for this pitch/vowel to be used as test data.
        # Drawn from n_people rather than a hard-coded 3, so the choice stays
        # correct if the dataset dimensions change.
        test_pid = int(split_rng.randint(n_people))

        for pid_idx in range(n_people):
            wav_idx = flat_ref_idx(vow_idx, pit_idx, pid_idx)
            wav_idx_shifted = flat_ref_idx(vow_idx, pit_idx + shift_amt, pid_idx)

            # Transposing a spectrogram makes each appended row one column of
            # the original spectrogram; inputs and labels are appended in step
            # so that row i of X pairs with row i of Y.
            if pid_idx != test_pid:
                X_train_base.extend(all_spectrograms[wav_idx].T)
                Y_train.extend(all_spectrograms[wav_idx_shifted].T)
            else:
                X_val_base.extend(all_spectrograms[wav_idx].T)
                Y_val.extend(all_spectrograms[wav_idx_shifted].T)

X_train_base = np.array(X_train_base)
Y_train = np.array(Y_train)
X_val_base = np.array(X_val_base)
Y_val = np.array(Y_val)

print(X_train_base.shape)
print(Y_train.shape)
print(X_val_base.shape)
print(Y_val.shape)

(22272, 513)
(22272, 513)
(11136, 513)
(11136, 513)


### Pitch shifted `X_train, X_val` using `simple_fft_pitch_shift` (manual)

In [14]:
# Apply simple_fft_pitch_shift with the current shift_amt to every row of the
# base training and validation inputs.
X_train_shifted = np.array(
    [simple_fft_pitch_shift(frame, shift_amt) for frame in X_train_base])
X_val_shifted = np.array(
    [simple_fft_pitch_shift(frame, shift_amt) for frame in X_val_base])

In [15]:
# Shapes of the pitch-shifted training and validation inputs, one per line.
print(X_train_shifted.shape, X_val_shifted.shape, sep="\n")

(22272, 513)
(11136, 513)


### Type 1 or Type 2 dataset (choose ONE)

Here, one can choose to generate the `type 1` or `type 2` datasets.

The modifications are applied only to the `X`s, not to the `Y`s.

Please choose **ONLY ONE** type of dataset to generate.

### Type 1 dataset (not recommended)

`(pitch_shifted, shift_amt) -> (target)`

In [158]:
# X_train, X_val: (_,514), (_,514)
# Type 1 input: each pitch-shifted row with shift_amt appended as one extra
# trailing column.

# np.full builds the constant (n_rows, 1) column directly instead of looping
# over an array of ones in Python; it also stays 2-D when there are no rows.
shift_amt_col = np.full((X_train_shifted.shape[0], 1), shift_amt)
X_train = np.hstack((X_train_shifted, shift_amt_col))

shift_amt_col = np.full((X_val_shifted.shape[0], 1), shift_amt)
X_val = np.hstack((X_val_shifted, shift_amt_col))

print(X_train.shape)
print(X_val.shape)

(19488, 514)
(9744, 514)


### Type 2 dataset (recommended)

`(original, pitch_shifted) -> (target)`

In [16]:
# Type 2 input: each original row followed, column-wise, by its pitch-shifted
# counterpart.
# X_train, X_val: (_,1026 = 513*2), (_,1026 = 513*2)
X_train = np.hstack([X_train_base, X_train_shifted])
X_val = np.hstack([X_val_base, X_val_shifted])

print(X_train.shape, X_val.shape, sep="\n")

(22272, 1026)
(11136, 1026)


### Change all complex values into their norms

We will be training on real numbers, not complex numbers.

In [17]:
# Replace every value in the four arrays with its absolute value, so that
# training uses real numbers rather than complex ones.
X_train, X_val, Y_train, Y_val = (
    np.abs(split) for split in (X_train, X_val, Y_train, Y_val))