In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
import os, glob, json

In [None]:
# Select dataset
# `dataset_name` picks which branch below fills in the directory paths;
# `FLOAT_DTYPE` is the dtype used for every array built later in the notebook.
dataset_name = 'MAESTRO' # 'SMD', 'MAESTRO'
FLOAT_DTYPE = np.float32 # np.float64
# Reset everything to '' so a re-run never reuses paths from a previous selection.
AMOUNT, root_path, dataset_train_path, dataset_val_path, dataset_test_path = '','','','',''

if dataset_name == 'MAESTRO':
    AMOUNT = '' # select from '' (entire dataset) | '-medium'| '-small' 
    root_path = '/media/datadisk/home/22828187/zhanh/vae/ablation_study/Kim2023_model/converted_dataset/maestro-v3'
    dataset_train_path = f'{root_path}/train'
    dataset_val_path = f'{root_path}/validation'
    dataset_test_path = f'{root_path}/test'

# elif dataset_name == 'GiantMIDIPiano':
#     root_path = '/home/tikim/dataset/giantmidi'
#     dataset_train_path = f'{root_path}/dataset/train'
#     dataset_val_path = f'{root_path}/dataset/validation'
#     dataset_test_path = f'{root_path}/dataset/test'

# elif dataset_name == 'chien2021': 
#     # Also known as Piano-e-competition dataset, some MIDI duplicate with Maestro dataset
#     root_path = '/home/tikim/dataset/chien2021'
#     dataset_train_path = f'{root_path}/dataset/train'
#     dataset_test_path = f'{root_path}/dataset/test'

elif dataset_name == 'SMD':
    # Set up SMD paths (this dataset is a test set only)
    root_path = "/media/datadisk/home/22828187/zhanh/vae/ablation_study/Kim2023_model/converted_dataset/SMD"
    # The closing quote was missing here, which made the whole cell a SyntaxError.
    dataset_test_path = f'{root_path}/test'


# Initialize data variables
extension = 'csv'
note_num_min, note_num_max = 0, 127
velocity_min, velocity_max = 0, 127
dataset_entire_train = None # Reset
dataset_entire_val = None # Reset (was previously left undefined when no validation data loaded)
dataset_entire_test = None # Reset


def _load_split(directory, split_label):
    """Read every ``*.{extension}`` file found directly inside ``directory``.

    Args:
        directory: Path of the folder holding the CSV files of one split.
        split_label: Lower-case split name used in the printed messages
            ('training', 'testing' or 'validation').

    Returns:
        A ``(filenames, frames, combined)`` tuple: the matched file names,
        the list of non-empty DataFrames (one per file), and their row-wise
        concatenation, or ``None`` when nothing was loaded.
    """
    if not os.path.exists(directory):
        print(f"{split_label.capitalize()} directory does not exist.")
        return [], [], None

    # NOTE: this changes the process working directory. It is kept on purpose:
    # files are opened by bare name below, and any later relative-path write
    # in this notebook is resolved against whichever directory was entered last.
    os.chdir(directory)

    # glob returns matches in arbitrary order; sort so the resulting dataset
    # is the same on every run and every machine.
    filenames = sorted(glob.glob(f'*.{extension}'))

    frames = []
    for filename in filenames:
        df = pd.read_csv(filename, index_col=None, header=0)
        if len(df) == 0:
            print(f'Dataframe is empty! {filename} skipping...')
            continue
        frames.append(df)

    if not frames:
        print(f"No CSV files found in the {split_label} dataset directory.")
        return filenames, frames, None
    return filenames, frames, pd.concat(frames, axis=0, ignore_index=True)


# The splits are loaded in the order train -> test -> validation so that the
# final working directory is the same as before this refactoring.

# Load training data (if applicable)
if dataset_name in ['MAESTRO', 'GiantMIDIPiano', 'chien2021']:
    train_csv_filenames, csv_files_train, dataset_entire_train = _load_split(dataset_train_path, 'training')

# Load testing data
test_csv_filenames, csv_files_test, dataset_entire_test = _load_split(dataset_test_path, 'testing')

# Load validation data (if applicable)
if dataset_name in ['MAESTRO', 'GiantMIDIPiano']:
    val_csv_filenames, csv_files_val, dataset_entire_val = _load_split(dataset_val_path, 'validation')


# Compute the min/max values later used to normalise the input features.
# They come from the training split, except for SMD, which only has a test
# split and therefore uses that one instead.
columns_label = ['velocity']


def _feature_matrix(frame, columns, split_label):
    """Return ``frame[columns]`` as a FLOAT_DTYPE array, failing loudly if the split never loaded."""
    if frame is None:
        # Without this check the failure is an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise RuntimeError(
            f"No {split_label} data was loaded for dataset '{dataset_name}'; "
            "check the dataset paths and the loader messages above."
        )
    return np.array(frame[columns], dtype=FLOAT_DTYPE)


if dataset_name != "SMD":
    # Perform processing on train data
    columns_train = ['time_diff', 'note_num', 'length', 'note_num_diff', 'low_octave']
    dataset_entire_train = _feature_matrix(dataset_entire_train, columns_train, 'training')
    stats_source, stats_label = dataset_entire_train, 'Train'
else:
    # Perform processing on test data
    columns_test = ['time_diff', 'note_num', 'length', 'note_num_diff', 'low_octave']
    dataset_entire_test = _feature_matrix(dataset_entire_test, columns_test, 'testing')
    stats_source, stats_label = dataset_entire_test, 'Test'

# Column positions follow the column lists above:
# 0 = time_diff, 2 = length, 3 = note_num_diff.
time_diff_min = np.min(stats_source[:, 0])
time_diff_max = np.max(stats_source[:, 0])
length_min = np.min(stats_source[:, 2])
length_max = np.max(stats_source[:, 2])
note_num_diff_min = np.min(stats_source[:, 3])
note_num_diff_max = np.max(stats_source[:, 3])
print(f"{stats_label} time_diff range: {time_diff_min} to {time_diff_max}")
print(f"Length range: {length_min} to {length_max}")
print(f"Note num diff range: {note_num_diff_min} to {note_num_diff_max}")

Train time_diff range: 0.0 to 37568.0
Length range: 1.0 to 46778.0
Note num diff range: -81.0 to 85.0


In [3]:
def divide_list(l, n, overlapping_window=0):
    """Yield consecutive length-``n`` slices of ``l``, then a trailing remainder.

    Full windows start every ``n - overlapping_window`` positions. After them,
    if ``len(l)`` is not a multiple of ``n``, the last ``len(l) % n`` items are
    yielded as one extra, shorter chunk (this happens regardless of the
    overlap setting).

    Args:
        l: Any sliceable sequence supporting ``len`` (list, NumPy array, ...).
        n: Window length.
        overlapping_window: Number of items shared by two neighbouring windows.

    Yields:
        Slices of ``l``.
    """
    total = len(l)
    stride = n - overlapping_window
    yield from (l[start:start + n] for start in range(0, total - n + 1, stride))

    leftover = total % n
    if leftover and leftover < n:
        yield l[-leftover:]
                
# Number of rows in one window handed to the model.
SAMPLE_LENGTH = 4
# Number of input feature columns per row.
FEATURE_NUM = 5


def pad_data(data, feature_num):
    """Zero-pad the last chunk of ``data`` up to ``SAMPLE_LENGTH`` rows.

    Only the final element is inspected. The list is modified in place and
    also returned.

    Args:
        data: Non-empty list of 2-D arrays, each with ``feature_num`` columns.
        feature_num: Column count used for the rows of zeros that are appended.

    Returns:
        The same list object, whose last element now has ``SAMPLE_LENGTH`` rows.
    """
    if len(data[-1]) == SAMPLE_LENGTH:
        return data

    short_chunk = data.pop()
    filler = np.zeros(
        (SAMPLE_LENGTH - len(short_chunk), feature_num),
        dtype=FLOAT_DTYPE,
    )
    data.append(np.concatenate((short_chunk, filler)))
    return data

def make_dataset(csv_files, columns_train, columns_label):
    """Convert per-file DataFrames into stacked, normalised window arrays.

    Each frame is min-max normalised with the module-level ranges
    (``time_diff_*``, ``note_num_*``, ``length_*``, ``note_num_diff_*`` for the
    inputs, ``velocity_*`` for the label). It is then cut by ``divide_list``
    into windows of ``SAMPLE_LENGTH`` rows that advance one row at a time
    (overlap of ``SAMPLE_LENGTH - 1``). Whatever shorter final chunk
    ``divide_list`` emits is zero-padded by ``pad_data``.

    Args:
        csv_files: Iterable of non-empty DataFrames, one per source file.
        columns_train: Input column names, in the order time difference,
            note number, length, note-number difference, low-octave flag
            (the first four are normalised by position; the fifth is kept as is).
        columns_label: Label column names; the first one is normalised with
            the velocity range.

    Returns:
        ``(inputs, labels)`` with shapes ``(N, SAMPLE_LENGTH, FEATURE_NUM)``
        and ``(N, SAMPLE_LENGTH, 1)``, both of dtype ``FLOAT_DTYPE``.
        ``N`` is 0 when ``csv_files`` is empty.
    """
    # Collect per-file chunks and concatenate once at the end. Stacking inside
    # the loop re-copies everything accumulated so far for every file, which
    # is quadratic in the dataset size. The empty seed arrays keep the output
    # shape and dtype well defined when there are no files at all.
    input_chunks = [np.empty((0, SAMPLE_LENGTH, FEATURE_NUM), dtype=FLOAT_DTYPE)]
    label_chunks = [np.empty((0, SAMPLE_LENGTH, 1), dtype=FLOAT_DTYPE)]

    for df in csv_files:
        data_input_raw = np.array(df[columns_train], dtype=FLOAT_DTYPE)
        data_label_raw = np.array(df[columns_label], dtype=FLOAT_DTYPE)

        # normalize the time difference
        data_input_raw[:, 0] = (data_input_raw[:, 0] - time_diff_min) / (time_diff_max - time_diff_min)
        # normalize the note number
        data_input_raw[:, 1] = (data_input_raw[:, 1] - note_num_min) / (note_num_max - note_num_min)
        # normalize the length
        data_input_raw[:, 2] = (data_input_raw[:, 2] - length_min) / (length_max - length_min)
        # normalize the note number difference
        data_input_raw[:, 3] = (data_input_raw[:, 3] - note_num_diff_min) / (note_num_diff_max - note_num_diff_min)
        # column 4 (low octave) is left unnormalised

        # normalize the velocity
        data_label_raw[:, 0] = (data_label_raw[:, 0] - velocity_min) / (velocity_max - velocity_min)

        input_windows = list(divide_list(data_input_raw, SAMPLE_LENGTH, SAMPLE_LENGTH - 1))
        input_windows = pad_data(input_windows, FEATURE_NUM)
        input_chunks.append(np.array(input_windows, dtype=FLOAT_DTYPE))

        label_windows = list(divide_list(data_label_raw, SAMPLE_LENGTH, SAMPLE_LENGTH - 1))
        label_windows = pad_data(label_windows, 1)
        label_chunks.append(np.array(label_windows, dtype=FLOAT_DTYPE))

    dataset_entire_input = np.concatenate(input_chunks, axis=0)
    dataset_entire_label = np.concatenate(label_chunks, axis=0)
    return dataset_entire_input, dataset_entire_label

In [None]:
# Turn the loaded DataFrames into windowed input/label arrays for whichever
# splits the selected dataset provides.
if dataset_name in ('MAESTRO', 'GiantMIDIPiano'):
    # All three splits share the training column list.
    dataset_train_input, dataset_train_label = make_dataset(
        csv_files=csv_files_train, columns_train=columns_train, columns_label=columns_label)
    dataset_val_input, dataset_val_label = make_dataset(
        csv_files=csv_files_val, columns_train=columns_train, columns_label=columns_label)
    dataset_test_input, dataset_test_label = make_dataset(
        csv_files=csv_files_test, columns_train=columns_train, columns_label=columns_label)

# Disabled: 'chien2021' (train + test only)
# elif dataset_name == 'chien2021':
#     dataset_train_input, dataset_train_label = make_dataset(csv_files_train, columns_train, columns_label)
#     dataset_test_input, dataset_test_label = make_dataset(csv_files_test, columns_train, columns_label)

elif dataset_name == 'SMD':
    # SMD only has a test split, so only the test column list exists.
    dataset_test_input, dataset_test_label = make_dataset(
        csv_files=csv_files_test, columns_train=columns_test, columns_label=columns_label)

In [5]:
# Export the windowed arrays plus the normalisation ranges.
# Output names are relative, so both files are written to the current working
# directory.
dataset_filename_short = ''
dataset_filename = ''

# File stem: dataset{32|64}-{name}-len{SAMPLE_LENGTH}[AMOUNT]; falls back to
# plain 'dataset' for any other dtype.
if dataset_name in ['MAESTRO', 'GiantMIDIPiano', 'chien2021', 'SMD']:
    # Only MAESTRO carries the AMOUNT size suffix.
    amount_suffix = AMOUNT if dataset_name == 'MAESTRO' else ''
    if FLOAT_DTYPE == np.float32:
        dataset_filename_short = f'dataset32-{dataset_name}-len{SAMPLE_LENGTH}{amount_suffix}'
    elif FLOAT_DTYPE == np.float64:
        dataset_filename_short = f'dataset64-{dataset_name}-len{SAMPLE_LENGTH}{amount_suffix}'
    else:
        dataset_filename_short = 'dataset'

dataset_filename = f'{dataset_filename_short}.pkl'

# Normalisation ranges; stored in the pickle next to the arrays and on their
# own in the JSON side-car file.
metadata = {'train_time_diff_min': time_diff_min, 'train_time_diff_max': time_diff_max,
            'note_num_min': note_num_min, 'note_num_max': note_num_max,
            'note_num_diff_min': note_num_diff_min, 'note_num_diff_max': note_num_diff_max,
            'length_min': length_min, 'length_max': length_max,
            # 'time_min': time_min, 'time_max': time_max,
            'velocity_min': velocity_min, 'velocity_max': velocity_max}

# Pick the arrays that exist for the selected dataset.
if dataset_name in ('MAESTRO', 'GiantMIDIPiano'):
    dataset_arrays = {'dataset_train_input': dataset_train_input, 'dataset_train_label': dataset_train_label,
                      'dataset_val_input': dataset_val_input, 'dataset_val_label': dataset_val_label,
                      'dataset_test_input': dataset_test_input, 'dataset_test_label': dataset_test_label}
elif dataset_name == 'chien2021':
    dataset_arrays = {'dataset_train_input': dataset_train_input, 'dataset_train_label': dataset_train_label,
                      'dataset_test_input': dataset_test_input, 'dataset_test_label': dataset_test_label}
elif dataset_name == 'SMD':
    dataset_arrays = {'dataset_test_input': dataset_test_input, 'dataset_test_label': dataset_test_label}
else:
    # Unknown dataset: no pickle is written (only the JSON below).
    dataset_arrays = None

if dataset_arrays is not None:
    # Context manager so the file is flushed and closed even if pickling fails;
    # key order (arrays first, then metadata) is unchanged.
    with open(dataset_filename, 'wb') as pkl_file:
        pkl.dump({**dataset_arrays, **metadata}, pkl_file)


def _json_default(value):
    """Convert NumPy scalars to the matching built-in Python number for JSON."""
    if isinstance(value, np.generic):
        # .item() keeps floats as floats; the previous `default=int` silently
        # truncated any float32 value to an integer.
        return value.item()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# save metadata to json, converting NumPy scalars (e.g. float32) to built-in numbers
with open(f'{dataset_filename_short}.json', 'w') as f:
    json.dump(metadata, f, default=_json_default)