In [55]:
import os 

# Run global_variables, synthesize_functions and mlp_model
%run ~/violin-renderer/src/global_variables.ipynb
# %run ~/violin-renderer/src/models/lstm/lstm_model.ipynb
%run ~/violin-renderer/src/synthesize/synthesize_functions.ipynb
%run ~/violin-renderer/src/models/lstm/data_processing.ipynb

Skip downloading as the MuseScore General soundfont is found.


In [60]:
# Create an output directory in /src, generate all results from the test dataset and synthesize all of them
# Setting up paths and output folders
SOURCE_INPUT_PATH =  HOME_PATH + '/bach-violin-dataset/dataset/source-input'
TEMPO_PATH =  HOME_PATH + '/bach-violin-dataset/dataset/tempos'
OUTPUT_PATH = HOME_PATH + '/src/lstm-output'
composer_directories = os.listdir(SOURCE_INPUT_PATH)

In [56]:
testing_X, testing_y = load_testing_data()

In [57]:
# reference: https://www.kaggle.com/code/kanncaa1/long-short-term-memory-with-pytorch
class BILSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(BILSTM, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim

        # LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, bidirectional=True, batch_first=True) # batch_first=True (batch_dim, seq_dim, feature_dim)

        # Output layer
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, input_data, lengths):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, input_data.size(0), self.hidden_dim, device=input_data.device).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, input_data.size(0), self.hidden_dim, device=input_data.device).requires_grad_()

        # Make the model "understand" that we're using padding
        packed_input = pack_padded_sequence(input_data, lengths.cpu().numpy(), batch_first=True, enforce_sorted=False)

        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.fc(output)
        
        return output

In [58]:
# reference: https://www.kaggle.com/code/kanncaa1/long-short-term-memory-with-pytorch
class UNILSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(UNILSTM, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim

        # LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True) # batch_first=True (batch_dim, seq_dim, feature_dim)

        # Output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_data, lengths):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, input_data.size(0), self.hidden_dim, device=input_data.device).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, input_data.size(0), self.hidden_dim, device=input_data.device).requires_grad_()

        # Make the model "understand" that we're using padding
        packed_input = pack_padded_sequence(input_data, lengths.cpu().numpy(), batch_first=True, enforce_sorted=False)

        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.fc(output)
        
        return output

In [59]:
def trim_sequence(input_song, true_song):
    all_source_inputs = []
    all_ground_truths = []

    for note_source, note_truth in zip(input_song, true_song):
        if note_source[1] > 64 * 24:
            break
            
        note_source = list(note_source)
        note_truth = list(note_truth)
        
        note_source[0] = scale_timings(note_source[0])
        note_source[1] = scale_timings(note_source[1])
        note_source[2] = scale_pitch(note_source[2])
        
        all_source_inputs.append(note_source)
        all_ground_truths.append(note_truth)

    return all_source_inputs, all_ground_truths

In [63]:
input_dim = 3
hidden_dim = 32
layer_dim = 2
output_dim = 2

# initialize the MLP
model = UNILSTM(input_dim, hidden_dim, layer_dim, output_dim)

model.load_state_dict(torch.load(HOME_PATH + '/src/models/lstm/one_direction_lstm.pt'))

<All keys matched successfully>

In [64]:
# Get generated output values from all songs
results = {}
ground_truth = {}

for input_path, true_path in zip(testing_X, testing_y):
    trimmed_notes = []
    trimmed_notes, trimmed_truth = trim_sequence(testing_X[input_path], testing_y[true_path])
    
    trimmed_notes = [trimmed_notes]
    trimmed_notes = torch.Tensor(trimmed_notes)
    length = [len(trimmed_notes[0])]
    length = torch.Tensor(length)

    pred = model(trimmed_notes, length)
    
    pred = pred.tolist()

    # popped_values = []
    # # remove nonsensical values
    # for index in range(len(pred)):
    #     if pred[0][index][1] < pred[0][index][0]:
    #         popped_values.append(index)

    # for index in reversed(popped_values):
    #     pred.pop(index)

    ground_truth[true_path] = trimmed_truth
    results[input_path] = pred

In [65]:
def MSE_error():
    MSE_results = []
    for generative_timings, truth in zip(results.values(), ground_truth.values()):
        predict_values = torch.Tensor(generative_timings)
        ground_truth_values = torch.Tensor(truth[0])
        MSE_value = loss(predict_values, ground_truth_values)
        MSE_results.append(MSE_value)

    return MSE_results

In [67]:
# Micro Average
def micro_MSE():
    MSE = [float(num) for num in MSE_error()]
    number_of_notes = [len(song) for song in ground_truth.values()]
    weighted_MSE = [mse_error * num_of_notes / sum(number_of_notes) for mse_error, num_of_notes in zip(MSE, number_of_notes)]
    print("Micro MSE: %.2f" % sum(weighted_MSE))
    print("Micro Root MSE: %.2f" % np.sqrt(sum(weighted_MSE)))

In [66]:
print(MSE_error())

[tensor(452.1805), tensor(2458.9905), tensor(1353.2759), tensor(1009.1328), tensor(1960.9547), tensor(1512.4719), tensor(2670.6404), tensor(844.3387), tensor(199.2810), tensor(249.4863)]


In [68]:
print(micro_MSE())

Micro MSE: 1573.78
Micro Root MSE: 39.67
None


In [13]:
# Synthesize
if not os.path.isdir(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

for notes_path, generated_results in results.items():
    name = '/' + notes_path.split('/')[-1]
    name = name.replace('.csv', '')
    generated_timings = [(note[0], note[1]) for note in generated_results[0]]
    synthesize_generated_output(notes_path, generated_timings, OUTPUT_PATH, name)

FluidSynth runtime version 2.2.5
Copyright (C) 2000-2022 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file '/home/wanninglu/violin-renderer/src/lstm-output/shunske-sato_bwv1005_mov2.wav'..
FluidSynth runtime version 2.2.5
Copyright (C) 2000-2022 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file '/home/wanninglu/violin-renderer/src/lstm-output/shunske-sato_bwv1001_mov1.wav'..
FluidSynth runtime version 2.2.5
Copyright (C) 2000-2022 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file '/home/wanninglu/violin-renderer/src/lstm-output/iris-kengen_bwv1001_mov2.wav'..
FluidSynth runtime version 2.2.5
Copyright (C) 2000-2022 Peter Hanappe and others.
Distributed under the LGPL license.
Sou