In [1]:
import torch
import os 
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
%run ~/violin-renderer/src/misc/parse.ipynb
# %run ~/violin-renderer/src/misc/randomizer.ipynb

In [2]:
HOME_DIR = os.path.expanduser("~")

In [3]:
# initialize GPU to move model/tensors onto
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

In [4]:
# load all the datasets
training_X, training_y, testing_X, testing_y = load_from_paths(HOME_DIR)

In [5]:
# this is the number of songs we have
print(len(training_X))

84


In [6]:
class LSTMMusicDataset(Dataset):
    def __init__(self, input_data, ground_truth, transform=None):
        self.input_data = input_data
        self.ground_truth = ground_truth
        self.transform = transform

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        input_sample = self.input_data[idx]
        ground_truth_sample = self.ground_truth[idx]

        if self.transform:
            input_sample = self.transform(input_sample)

        return input_sample, ground_truth_sample

In [7]:
# helper functions to scale data
#
# @param column The column to be scaled
# @returns the scaled column
def scale_data(column):
    min_val = np.min(column)
    max_val = np.max(column)
    scaled_column = (2 * (column - min_val) / (max_val - min_val)) - 1
    return scaled_column

def scale_pitch(column):
    return column / 128

In [8]:
# put them into an N (number of songs) x MAX_SEQUENCE_LENGTH x input_dim matrix

training_input = np.zeros((84, 1024, 3))
training_truth = np.zeros((84, 1024, 2))

for (i, input_notes, truth) in zip(range(84), training_X, training_y):
    input_array = np.zeros((1024, 3))
    truth_array = np.zeros((1024, 2))
    input_notes = np.array(input_notes)
    truth = np.array(truth)

    # trim down the sequence, or pad it with [0,0,0]
    if len(input_notes) > 1024:
        input_array = input_notes[:1024]
        truth_array = truth[:1024]
        
        input_array[:, 0] = scale_data(input_array[:, 0])
        input_array[:, 1] = scale_data(input_array[:, 1])
        input_array[:, 2] = scale_pitch(input_array[:, 2])
    else:
        input_array[:len(input_notes)] = input_notes
        truth_array[:len(truth)] = truth

        input_array[:len(input_notes), 0] = scale_data(input_array[:len(input_notes), 0])
        input_array[:len(input_notes), 1] = scale_data(input_array[:len(input_notes), 1])
        input_array[:len(input_notes), 2] = scale_pitch(input_array[:len(input_notes), 2])

    # just in case manipulating the original object manipulates everything
    training_input[i] = np.copy(input_array)
    training_truth[i] = np.copy(truth_array)

In [9]:
# first create the custom datasets, then create the data loaders
training_data = LSTMMusicDataset(input_data=torch.Tensor(training_input), ground_truth=torch.Tensor(training_truth))
# testing_data = MusicDataset(input_data=torch.Tensor(all_testing_input), ground_truth=torch.Tensor(all_testing_truth))

training_loader = DataLoader(training_data, batch_size=8, shuffle=True)
# testing_loader = DataLoader(testing_data, batch_size=100, shuffle=False)

In [12]:
# USING MLP DATASET METHOD ONLY, comment out to use LSTM dataset method

# first, compile all the data into one big matrix
all_input = []
all_truth = []
for (input_notes, truth) in zip(training_X, training_y):
    all_input.extend(input_notes)
    all_truth.extend(truth)

all_testing_input = []
all_testing_truth = []
for (input_notes, truth) in zip(testing_X, testing_y):
    all_testing_input.extend(input_notes)
    all_testing_truth.extend(truth)

# normalizing the input
all_input = np.array(all_input)

all_input[:, 0] = scale_data(all_input[:, 0])
all_input[:, 1] = scale_data(all_input[:, 1])
all_input[:, 2] = scale_pitch(all_input[:, 2])

training_data = LSTMMusicDataset(input_data=torch.Tensor(all_input), ground_truth=torch.Tensor(all_truth))
training_loader = DataLoader(training_data, batch_size=100, shuffle=True)

In [13]:
# reference: https://www.kaggle.com/code/kanncaa1/long-short-term-memory-with-pytorch
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True) # batch_first=True (batch_dim, seq_dim, feature_dim)

        # Output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device).requires_grad_()

        print(x.shape)

        out, _ = self.lstm(x, (h0.detach(), c0.detach()))
        
        # Reshape output to (batch_size * sequence_length, hidden_size)
        out = out.contiguous().view(-1, self.hidden_dim)

        # Flattens the output 
        out = self.fc(out) 
        
        out = out.view(x.size(0), -1, out.size(1))
        
        return out

In [14]:
input_dim = 3
hidden_dim = 50
layer_dim = 1
output_dim = 2

# initialize the MLP
model = LSTM(input_dim, hidden_dim, layer_dim, output_dim)

# transfer model to GPU
model.to(device)

LSTM(
  (lstm): LSTM(3, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=2, bias=True)
)

In [15]:
# Define our loss function (mean squared error) to be used in the grad descent step
loss = nn.MSELoss()

# Performs the gradient descent steps
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

In [16]:
# Trains the model inputted into the function.
#
# @param model The model object to be trained
# @param optimizer The optimizing equation to use to train the model
# @param input_notes The training input data
# @param truth The actual output for the corresponding input
# @param loss_module Equation for calculating the difference between generated and actual output
# @param num_epochs Number of cycles to train the model
def train_model_loop(model, optimizer, dataloader, loss_module):
    # Set model to train mode
    model.train()

    # Training loop   
    for batch, (input_notes, truth) in enumerate(dataloader):

        input_notes = input_notes.to(device)
        truth = truth.to(device)

        ## Run the model on the input data
        preds = model(input_notes)

        print(preds)

        ## Calculate loss
        loss = loss_module(preds, truth)

        ## Backpropagation (using loss value to reset weights). Reset gradients.
        optimizer.zero_grad()
        loss.backward()

        ## Update parameters
        optimizer.step()

        ## For every 50th batch, print out the current loss as well # of samples trained
        if batch % 50 == 0:
                    loss, current = loss.item(), batch * 100 + len(input_notes)
                    print(f"loss: {loss:>7f}  [{current:>5d}/{len(dataloader.dataset):>5d}]")

In [17]:
# trains the model using the dataloader
def train_model():
    epochs = 50
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_model_loop(model, optimizer, training_loader, loss)
    torch.save(model.state_dict(), 'lstm.pt')

In [18]:
train_model()

Epoch 1
-------------------------------
torch.Size([100, 3])


RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors