In [1]:
import torch
import torch.nn as nn
import torch.utils as utils
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [2]:
# midi_string = open('./source/essence_text.txt', 'r', encoding='utf-8').read()

In [3]:
# midi_string[:100]

In [4]:
# midi_string[:800].count('\n')

In [5]:
# chr_nums = list(map((lambda chr: ord(chr) - ord('\n')), midi_string))

In [6]:
# chr_nums[:50]

In [7]:
# full_dataset = torch.zeros(len(chr_nums), 129)
# full_dataset.data[range(len(chr_nums)), chr_nums] = 1
# full_dataset

In [8]:
# full_dataset.sum(dim=1)

In [9]:
# full_dataset.sum(dim=0)

In [10]:
# is_note = full_dataset[:, 0] != 1

In [11]:
# is_note

In [12]:
# full_dataset[is_note, :].sum(dim=0)

In [13]:
# transposed = full_dataset.clone()

In [14]:
# pitch_idx = full_dataset[is_note, :].argmax(dim=1)
# transposed_idx = pitch_idx + 1

In [15]:
# transposed.data[is_note, pitch_idx] = 0
# transposed.data[is_note, transposed_idx] = 1
# transposed.sum(dim=0)

In [16]:
# temp = nn.LSTM(10, 10)

In [17]:
# list(temp.parameters())

In [18]:
# for param in temp.parameters():
#     if len(param.shape) > 1:
#         print(param.shape)
#         nn.init.xavier_normal_(param, gain=5/3)
# list(temp.parameters())

In [19]:
# help(slice)
# print(slice(10))
# print(slice(10).indices(5))
# print([1,2,3,4,5][None:])

In [20]:
# class SliceTester(object):
#     def __getitem__(self, *indices):
#         return indices

# class SliceTester(object):
#     def __getitem__(self, indices):
#         return indices

In [21]:
# sl = SliceTester()
# print(sl[:5])
# # len(sl[:5])
# print(sl[:5, 5:10])
# print(len(sl[:5, 5:10]))
# print(sl[:,:,:,:])
# print(len(sl[:,:,:,:]))
# print(sl[5])
# print(sl[5, 10])
# list(sl[:].indices(10))

In [22]:
# torch.randint(low=-3, high=3, size=(), dtype=torch.uint8)

In [23]:
# df = torch.zeros(1, len(chr_nums), 129)
# df.data[0, range(len(chr_nums)), chr_nums] = 1

In [24]:
class MyDataset(Dataset):
    def __init__(self, sequence_length, total_batch=-1, step_interval=1, random_choice=True, transpose=0):

        # invalid argument handling ---------------------------------------------
        if not isinstance(sequence_length, int):
            raise TypeError("value for sequence_length must be an integer")
        if not isinstance(total_batch, int):
            raise TypeError("value for total_batch must be an integer")
        if not isinstance(step_interval, int):
            raise TypeError("value for skip_interval must be an integer")
        if not isinstance(random_choice, bool):
            raise TypeError("value for random_choice must be a boolean")
        if not isinstance(transpose, int) or transpose < 0:
            raise ValueError("value for transpose must be a non-negative integer")
        # -----------------------------------------------------------------------

        super(MyDataset, self).__init__()
        self.data_string = open('./source/essence_text.txt', 'r', encoding='utf-8').read()
        
        # sequence_length check--------------------------------------------------
        if sequence_length < 1:
            self.sequence_length = len(self.data_string) - 1
        else:
            self.sequence_length = min(sequence_length, len(self.data_string) - 1)
        # -----------------------------------------------------------------------

        # total_batch and step_interval check------------------------------------
        if total_batch > 0:
            self.total_batch = min(total_batch, len(self.data_string) - self.sequence_length)
            self.step_interval = int((len(self.data_string) - self.sequence_length) / self.total_batch)
        elif step_interval > 1:
            self.step_interval = step_interval
            if (len(self.data_string) - self.sequence_length) % self.step_interval == 0:
                self.total_batch = (len(self.data_string) - self.sequence_length) // self.step_interval
            else:
                self.total_batch = ((len(self.data_string) - self.sequence_length) // self.step_interval) + 1
        else:
            self.total_batch = len(self.data_string) - self.sequence_length
            self.step_interval = 1
        # -----------------------------------------------------------------------
            
        self.random_choice = random_choice
        self.transpose = transpose
        
    def __len__(self):
        return self.total_batch
    
    def __getitem__(self, batch_idx):
        
        # invalid index handling -------------------------------------------------
        if not isinstance(batch_idx, int):
            raise IndexError(f"this dataset only takes one integer value as the index, but {type(batch_idx)} was given")
        if not batch_idx < self.total_batch:
            raise IndexError(f"index out of bounds (index > len)")
        if batch_idx < 0:
            batch_idx = self.total_batch + batch_idx
            if batch_idx < 0:
                raise IndexError(f"index out of bounds (index < -len)")
        # ------------------------------------------------------------------------


        # index selection --------------------------------------------------------
        if self.step_interval == 1:
            idx = batch_idx
        else:
            if self.random_choice:
                if batch_idx == self.total_batch - 1:
                    # This last index might have a wider range than other indices
                    idx = torch.randint(low=(self.step_interval * batch_idx),
                                        high=(len(self.data_string) - self.sequence_length),
                                        size=(),
                                        dtype=torch.int).item()
                else:
                    idx = torch.randint(low=(self.step_interval * batch_idx),
                                        high=(self.step_interval * (batch_idx + 1)),
                                        size=(),
                                        dtype=torch.int).item()
            else:
                idx = self.step_interval * batch_idx


        string = self.data_string[idx : (idx + self.sequence_length + 1)]

        input_chr_nums = [ord(chr) - ord('\n') for chr in string[:-1]]
        target_chr_nums = [ord(chr) - ord('\n') for chr in string[1:]]
        # -----------------------------------------------------------------------

        # transpose -------------------------------------------------------------
        if self.transpose > 0:
            min_note = min([chr_num for chr_num in input_chr_nums if chr_num != 0])
            lower_margin = min_note - 1
            max_note = max([chr_num for chr_num in input_chr_nums if chr_num != 0])
            upper_margin = 129 - max_note

            # In case the transposed value might go out of bounds
            if lower_margin < self.transpose:
                if upper_margin < self.transpose:
                    low = -lower_margin
                    high = upper_margin
                else:
                    low = lower_margin
                    high = min(upper_margin, 2 * self.transpose - lower_margin)
            elif upper_margin < self.transpose:
                low = -min(upper_margin, 2 * self.transpose - lower_margin)
                high = upper_margin
            # Else, everything is alright!
            else:
                low = -self.transpose
                high = self.transpose
                
            transpose_val = torch.randint(low=low, high=high + 1, size=[], dtype=torch.int).item()
            input_chr_nums = [chr_num + transpose_val if chr_num != 0 else chr_num for chr_num in input_chr_nums]
            target_chr_nums = [chr_num + transpose_val if chr_num != 0 else chr_num for chr_num in target_chr_nums]
        # ------------------------------------------------------------------------

        input_tensor = torch.zeros(self.sequence_length, 129)
        input_tensor[range(self.sequence_length), input_chr_nums] = 1

        target_tensor = torch.tensor(target_chr_nums)

        return input_tensor, target_tensor

In [25]:
dataset = MyDataset(400, total_batch=-1, step_interval=1, transpose=0)
print(len(dataset))
# dataset[:50]
dataset[0]

457098


(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([73,  0, 54, 73,  0, 54, 73,  0, 54, 73,  0, 54, 73,  0, 54, 73,  0, 54,
         73,  0, 54, 73,  0, 54, 73,  0, 54, 73,  0, 54, 73,  0, 54, 73,  0, 54,
         73,  0, 54, 73,  0, 54,  0, 54, 85,  0, 54, 85,  0, 54, 85,  0, 54, 85,
          0, 54, 85,  0, 54, 85,  0, 54, 85,  0, 54, 85,  0, 54, 85,  0, 54, 85,
          0, 54, 85,  0, 54, 85,  0, 54, 85,  0,  0, 58, 80,  0, 58, 80,  0, 58,
         80,  0, 58, 80,  0, 58, 80,  0, 58, 80,  0, 58, 80,  0, 58, 80,  0, 58,
         80,  0, 58, 80,  0, 58, 80,  0, 58, 80,  0, 58, 80,  0, 58, 80,  0, 58,
          0, 58,  0, 58,  0, 58,  0, 58,  0, 58,  0, 58,  0, 58,  0, 58,  0, 58,
          0, 58,  0, 58,  0, 58,  0, 58,  0,  0, 57, 78,  0, 57, 78,  0, 57, 78,
         

In [26]:
loader = DataLoader(MyDataset(sequence_length=5, total_batch=-1, step_interval=1, transpose=5),
                    batch_size=2, shuffle=False, drop_last=False)
print(len(loader))
(lambda list_of_tensors: (list_of_tensors, list_of_tensors[0].shape, list_of_tensors[1].shape))(next(iter(loader)))

228747


([tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [1., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]],
  
          [[0., 0., 0.,  ..., 0., 0., 0.],
           [1., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [1., 0., 0.,  ..., 0., 0., 0.]]]), tensor([[73,  0, 54, 73,  0],
          [ 0, 49, 68,  0, 49]])], torch.Size([2, 5, 129]), torch.Size([2, 5]))

In [27]:
# def dataload(dataset, length, batch=1):
#     max_idx = (len(dataset) - 1) - ((len(dataset) - 1) % length)
#     ins = dataset[:max_idx].view(-1, length, 129)
#     tar = dataset[1:max_idx + 1].argmax(dim=2).view(-1, length)
#     for i in range(0, len(ins) // batch, batch):
#         indices = range(i * batch, (i + 1) * batch)
#         yield (ins[indices], tar[indices])

In [28]:
# list(dataload(dataset, 400, 2))[0]

In [29]:
class MyLSTM(nn.Module):
    def __init__(self):
        super(MyLSTM, self).__init__()
        self.lstm = nn.LSTM(129, 129, batch_first=True)
        self.out_f = nn.LogSoftmax(dim=2)
        
    def forward(self, input, hidden):
        output, hidden = self.lstm(input, hidden)
        output = self.out_f(output)
        
        return output, hidden
    
    def init_hidden(self, batch_size=1, cuda=False):
        h0 = torch.zeros(1, batch_size, 129)
        c0 = torch.zeros(1, batch_size, 129)
        
        if cuda:
            h0 = h0.cuda()
            c0 = c0.cuda()
        
        return (h0, c0)

In [30]:
# lstm = MyLSTM()

In [31]:
# inputs = [input_data for input_data, target_data in dataload(dataset, 500, batch=2)]
# targets = [target_data for input_data, target_data in dataload(dataset, 500, batch=2)]
# print(len(inputs), len(targets))
# print(inputs[0].shape, inputs[1].shape)
# print(targets[0].shape, targets[1].shape)

In [32]:
# hidden = lstm.init_hidden(batch_size=2)

In [33]:
# inputs[0].shape

In [34]:
# print(lstm.forward(inputs[0], hidden))
# print(lstm.forward(inputs[0], hidden)[0].shape)
# print(list(map(lambda tup: list(map(lambda t: t.shape, tup)), lstm.forward(inputs[0], hidden))))

In [35]:
# out = lstm.forward(inputs[0], hidden)[0]
# out.shape

In [36]:
# def manual_loss(output, target, avg_over_seq=True):
#     n_batch = output.shape[0]
#     l_sequence = output.shape[1]
#     total_loss = 0
#     if output.shape[:2] == target.shape[:2]:
#         for batch in range(n_batch):
#             for step in range(l_sequence):
#                 total_loss += nn.functional.nll_loss(output[batch, step, :].view(-1, 129).log(), target[batch, step].view(1))
#         if avg_over_seq:
#             avg_loss = total_loss / (n_batch * l_sequence)
#         else:
#             avg_loss = total_loss / n_batch
        
#         return avg_loss
#     else:
#         print("Dimension mismatch")
#         print(output.shape)
#         print(target.shape)

In [37]:
# manual_loss(lstm(inputs[0], hidden)[0], targets[0], avg_over_seq=True)

In [38]:
# print(nn.NLLLoss(reduction='none')(torch.log(lstm.forward(inputs[0], hidden)[0])[:, 0], targets[0][:, 0]))
# print(nn.NLLLoss(reduction='sum')(torch.log(lstm.forward(inputs[0], hidden)[0])[:, 0], targets[0][:, 0]))
# print(nn.NLLLoss(reduction='elementwise_mean')(torch.log(lstm.forward(inputs[0], hidden)[0])[:, 0], targets[0][:, 0]))
# print()
# print(nn.NLLLoss(reduction='none')(torch.log(lstm.forward(inputs[0], hidden)[0]).view(-1, 129), targets[0].view(-1)))
# print(nn.NLLLoss(reduction='sum')(torch.log(lstm.forward(inputs[0], hidden)[0]).view(-1, 129), targets[0].view(-1)))
# print(nn.NLLLoss(reduction='elementwise_mean')(torch.log(lstm.forward(inputs[0], hidden)[0]).view(-1, 129), targets[0].view(-1)))

In [39]:
# lstm.forward(inputs[0], hidden)[0].view(-1, 129).sum(dim=1)

In [40]:
class SkipLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0, batch_first=True):
        super(SkipLSTM, self).__init__()
        
        # Hyperparameters to be kept (Others are used only for initialization)
        self.hidden_size = hidden_size
        self.batch_first = batch_first
        
        
        self.layer_norm_0 = nn.LayerNorm(input_size)
        
        self.lstm_1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=batch_first)
        self.layer_norm_1 = nn.LayerNorm(hidden_size)
        self.dropout_1 = nn.Dropout(p=dropout)
        
        self.lstm_2 = nn.LSTM(input_size=hidden_size + input_size, hidden_size=hidden_size, batch_first=batch_first)
        self.layer_norm_2 = nn.LayerNorm(hidden_size)
        self.dropout_2 = nn.Dropout(p=dropout)
        
        self.lstm_3 = nn.LSTM(input_size=hidden_size + hidden_size, hidden_size=hidden_size, batch_first=batch_first)
        self.layer_norm_3 = nn.LayerNorm(hidden_size)
        self.dropout_3 = nn.Dropout(p=dropout)
        
        self.to_notes = nn.Linear(in_features=hidden_size + hidden_size, out_features=output_size)
#         self.activation = nn.Sigmoid()
        self.to_out = nn.LogSoftmax(dim=2)

#         for param in self.parameters():
#             if len(param.shape) == 2:
#                 nn.init.xavier_normal_(param)
 
    def forward(self, input, hiddens, temperature=1.0):
        assert temperature > 0
        
        input_norm = self.layer_norm_0(input)
        
        output_1, hidden_1 = self.lstm_1(input_norm, hiddens[0])
        output_1_norm = self.layer_norm_1(output_1)
        concat_1 = torch.cat((output_1_norm, input_norm), dim=2)
        dropped_1 = self.dropout_1(concat_1)
        
        output_2, hidden_2 = self.lstm_2(concat_1, hiddens[1])
        output_2_norm = self.layer_norm_2(output_2)
        concat_2 = torch.cat((output_2_norm, output_1_norm), dim=2)
        dropped_2 = self.dropout_2(concat_2)
        
        output_3, hidden_3 = self.lstm_3(concat_2, hiddens[2])
        output_3_norm = self.layer_norm_3(output_3)
        concat_3 = torch.cat((output_3_norm, output_2_norm), dim=2)
        dropped_3 = self.dropout_3(concat_3)
        
        output = self.to_notes(concat_3)
#         output = self.activation(output)
        output = output / temperature
        output = self.to_out(output)
        
        return output, (hidden_1, hidden_2, hidden_3)
    
    def init_hidden(self, batch_size, cuda=False):
        h0_1 = torch.zeros(1, batch_size, self.hidden_size)
        c0_1 = torch.zeros(1, batch_size, self.hidden_size)
        h0_2 = torch.zeros(1, batch_size, self.hidden_size)
        c0_2 = torch.zeros(1, batch_size, self.hidden_size)
        h0_3 = torch.zeros(1, batch_size, self.hidden_size)
        c0_3 = torch.zeros(1, batch_size, self.hidden_size)
        if cuda:
            h0_1 = h0_1.cuda()
            c0_1 = c0_1.cuda()
            h0_2 = h0_2.cuda()
            c0_2 = c0_2.cuda()
            h0_3 = h0_3.cuda()
            c0_3 = c0_3.cuda()

        return ((h0_1, c0_1), (h0_2, c0_2), (h0_3, c0_3))

In [41]:
class ProperLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0, batch_first=True):
        super(ProperLSTM, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, batch_first=batch_first)
        self.to_notes = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.activation = nn.Sigmoid()
        self.to_out = nn.LogSoftmax(dim=2)
        
        for param in self.parameters():
            if len(param.shape) == 2:
                nn.init.xavier_normal_(param)
    
    def forward(self, input, hidden):
        output, hidden = self.lstm(input, hidden)
        output = self.to_notes(output)
        output = self.activation(output)
        output = self.to_out(output)
        
        return output, hidden
    
    def init_hidden(self, batch_size, cuda=False):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        if cuda:
            h0 = h0.cuda()
            c0 = c0.cuda()
        
        return (h0, c0)

In [42]:
# my_lstm = ProperLSTM(input_size=129, hidden_size=300, output_size=129, num_layers=3, batch_first=True)
my_lstm = SkipLSTM(input_size=129, hidden_size=300, output_size=129, dropout=0.5, batch_first=True)

In [43]:
with torch.no_grad():
    print(nn.functional.nll_loss(my_lstm(MyDataset(1600)[0][0].unsqueeze(dim=0), my_lstm.init_hidden(1))[0].view(-1, 129).log(), MyDataset(1600)[0][1].view(-1)))
    # print(manual_loss(my_lstm(inputs[0], my_lstm.init_hidden(2))[0], targets[0]))

    print()
    temp = (torch.zeros(1,129, dtype=torch.float), torch.tensor([50]))
    temp[0].data[0, 50] = 1.
    print(nn.functional.nll_loss(temp[0].log(), temp[1]))

    print()
    temp[0].data[0, 50] = 0.
    temp[0].data[0, 51] = 1.
    print(nn.functional.nll_loss(temp[0].log(), temp[1]))

tensor(nan)

tensor(0.)

tensor(inf)


In [44]:
# optimizer = optim.SGD(my_lstm.parameters(), lr=1.0)
# optimizer = optim.SGD(my_lstm.parameters(), lr=0.01, momentum=0.9, weight_decay=1.0, nesterov=True)
# optimizer = optim.RMSprop(my_lstm.parameters(), lr=0.05, weight_decay=0, momentum=0.9, centered=False)
optimizer = optim.Adam(my_lstm.parameters(), lr=0.001, weight_decay=0.5, amsgrad=True)

In [45]:
epoch = 0

In [46]:
from time import time

In [47]:
loss_weights = torch.ones(129)
# loss_weights.data[0] = 0.1

In [83]:
my_lstm.train()
my_lstm.cuda()

loss_func = nn.NLLLoss(weight=loss_weights.cuda())

learning_rate = 0.2
optimizer.param_groups[0]['lr'] = learning_rate

batch_size = 16
sequence_length = 2000
transpose = 7

loader = DataLoader(MyDataset(sequence_length=sequence_length, total_batch=-1, step_interval=sequence_length, transpose=transpose),
                    batch_size=batch_size,
                    shuffle=True,
                    drop_last=True)
# short_data = MyDataset(sequence_length=1, total_batch=1, step_interval=1, random_choice=False, transpose=0)[0]
# print(short_data[0], short_data[1])

lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(loader) // 2)

# best_loss = {'epoch': epoch, 'loss': 9999, 'recall': 0.0, 'acc': 0.0, 'state_dict': None, 'opt_dict': None}
# best_recall = {'epoch': epoch, 'loss': 9999, 'recall': 0.0, 'acc': 0.0, 'state_dict': None, 'opt_dict': None}
# best_acc = {'epoch': epoch, 'loss': 9999, 'recall': 0.0, 'acc': 0.0, 'state_dict': None, 'opt_dict': None}
best_loss = torch.load('./3best_loss.pth', map_location='cuda')
best_recall = torch.load('./3best_recall.pth', map_location='cuda')
best_acc = torch.load('./3best_acc.pth', map_location='cuda')

# n_epoch = 500
# max_epoch = epoch + n_epoch

# while epoch < max_epoch:
#     for sequence, target in dataload(dataset, length=500, batch=batch_size):
#         my_lstm.zero_grad()
#         output, _ = my_lstm(sequence.cuda(), my_lstm.init_hidden(batch_size=batch_size, cuda=True))
#         loss = loss_func(output.view(-1, 129).log(), target.view(-1).cuda())
#         loss.backward()
#         optimizer.step()
#     print(f"Loss at epoch {epoch} : {loss}")
#     epoch += 1

train_duration = 3600 * 8 + 900 # in seconds
start_time = time()

while time() - start_time < train_duration:
    for sequence, target in loader:
#     for sequence, target in (short_data,):
#         sequence = sequence.view(1, sequence_length, 129)
#         target = target.view(1, sequence_length)
        sequence = sequence.cuda()
        target = target.cuda()
        my_lstm.zero_grad()
        output = my_lstm(sequence, my_lstm.init_hidden(batch_size=batch_size, cuda=True))[0]
        loss = loss_func(output.view(-1, 129), target.view(-1))
        if not torch.isfinite(loss):
            break
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    if not torch.isfinite(loss):
        print("Loss Exploded!")
        break
    overall_acc = (output.argmax(dim=2) == target).to(torch.float).mean().item()
    acc = ((output.argmax(dim=2) == target) & (target != 0)).to(torch.float).mean().item()
    with open('./train_log.txt', 'a') as log:
        log.write(f"Loss at epoch {epoch} : {loss}\n")
        log.write(f"Overall Accuracy : {overall_acc * 100:.3f}%\n")
        log.write(f"Note Accuracy (Except for timesteps) : {acc * 100:.3f}%\n\n")
    print(f"Loss at epoch {epoch} : {loss}")
    print(f"Overall Accuracy : {overall_acc * 100:.3f}%")
    print(f"Note Accuracy (Except for timesteps) : {acc * 100:.3f}%\n")
    
    if loss < best_loss['loss']:
        best_loss = {'epoch': epoch, 'loss': loss.item(), 'recall': acc, 'acc': overall_acc, 'state_dict': my_lstm.state_dict(), 'opt_dict': optimizer.state_dict}# , 'lr_dict': lr_scheduler.state_dict()}
        torch.save(best_loss, './3best_loss.pth')
    if acc > best_recall['recall']:
        best_recall = {'epoch': epoch, 'loss': loss.item(), 'recall': acc, 'acc': overall_acc, 'state_dict': my_lstm.state_dict(), 'opt_dict': optimizer.state_dict}# , 'lr_dict': lr_scheduler.state_dict()}
        torch.save(best_recall, './3best_recall.pth')
    if overall_acc > best_acc['acc']:
        best_acc = {'epoch': epoch, 'loss': loss.item(), 'recall': acc, 'acc': overall_acc, 'state_dict': my_lstm.state_dict(), 'opt_dict': optimizer.state_dict}# , 'lr_dict': lr_scheduler.state_dict()}
        torch.save(best_acc, './3best_acc.pth')
        
    epoch += 1


my_lstm.cpu()
torch.cuda.empty_cache()

Loss at epoch 6019 : 4.859124183654785
Overall Accuracy : 30.269%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6020 : 4.859155654907227
Overall Accuracy : 28.250%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6021 : 4.859206199645996
Overall Accuracy : 29.222%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6022 : 4.859200477600098
Overall Accuracy : 30.041%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6023 : 4.859216213226318
Overall Accuracy : 29.891%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6024 : 4.859172344207764
Overall Accuracy : 24.116%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6025 : 4.859299182891846
Overall Accuracy : 26.022%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6026 : 4.859194278717041
Overall Accuracy : 25.222%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6027 : 4.859189987182617
Overall Accuracy : 29.016%
Note Accuracy (Except for time

Loss at epoch 6092 : 4.859193801879883
Overall Accuracy : 32.247%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6093 : 4.859257698059082
Overall Accuracy : 29.769%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6094 : 4.859178066253662
Overall Accuracy : 25.141%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6095 : 4.8591742515563965
Overall Accuracy : 28.803%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6096 : 4.859167575836182
Overall Accuracy : 25.716%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6097 : 4.8592000007629395
Overall Accuracy : 26.119%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6098 : 4.859256744384766
Overall Accuracy : 28.569%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6099 : 4.85922384262085
Overall Accuracy : 29.966%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6100 : 4.8592400550842285
Overall Accuracy : 27.156%
Note Accuracy (Except for ti

Loss at epoch 6165 : 4.859234809875488
Overall Accuracy : 27.334%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6166 : 4.859205722808838
Overall Accuracy : 29.222%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6167 : 4.85921573638916
Overall Accuracy : 26.503%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6168 : 4.859196662902832
Overall Accuracy : 25.291%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6169 : 4.859262943267822
Overall Accuracy : 27.250%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6170 : 4.859163284301758
Overall Accuracy : 30.763%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6171 : 4.85922384262085
Overall Accuracy : 26.778%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6172 : 4.859165191650391
Overall Accuracy : 27.813%
Note Accuracy (Except for timesteps) : 0.000%

Loss at epoch 6173 : 4.859229564666748
Overall Accuracy : 24.719%
Note Accuracy (Except for timest

KeyboardInterrupt: 

In [81]:
torch.cuda.empty_cache()

In [68]:
best_loss

{'epoch': 5617,
 'loss': 3.8840205669403076,
 'recall': 0.633809506893158,
 'acc': 0.8883809447288513,
 'state_dict': OrderedDict([('layer_norm_0.weight',
               tensor([0.9376, 0.9211, 0.9197, 0.9209, 0.9194, 0.9191, 0.9204, 0.9202, 0.9197,
                       0.9203, 0.9197, 0.9197, 0.9213, 0.9192, 0.9197, 0.9198, 0.9201, 0.9186,
                       0.9201, 1.1771, 1.1676, 1.2862, 1.3071, 1.3103, 1.3289, 1.3573, 1.2834,
                       1.2631, 1.2652, 1.2184, 1.2577, 1.1763, 1.2060, 1.1810, 1.1863, 1.1479,
                       1.1786, 1.1539, 1.1146, 1.1040, 1.1189, 1.0869, 1.1486, 1.0739, 1.0943,
                       1.1018, 1.0869, 1.0967, 1.0863, 1.0649, 1.0706, 1.1000, 1.1254, 1.0878,
                       1.1235, 1.1177, 1.1070, 1.1432, 1.0984, 1.0547, 1.0867, 1.1031, 1.1141,
                       1.0679, 1.0898, 1.0910, 1.0698, 1.0645, 1.0540, 1.0756, 1.0822, 1.0532,
                       1.0463, 1.0586, 1.0357, 1.0992, 1.0544, 1.0512, 1.0561, 1.0807

In [69]:
best_recall

{'epoch': 5626,
 'loss': 3.8841054439544678,
 'recall': 0.6553428173065186,
 'acc': 0.8917142748832703,
 'state_dict': OrderedDict([('layer_norm_0.weight',
               tensor([0.9375, 0.9203, 0.9186, 0.9182, 0.9192, 0.9201, 0.9201, 0.9200, 0.9200,
                       0.9204, 0.9193, 0.9204, 0.9195, 0.9199, 0.9189, 0.9182, 0.9199, 0.9186,
                       0.9196, 1.1785, 1.1666, 1.2874, 1.3082, 1.3089, 1.3274, 1.3575, 1.2807,
                       1.2638, 1.2654, 1.2173, 1.2633, 1.1765, 1.2044, 1.1808, 1.1849, 1.1441,
                       1.1773, 1.1537, 1.1165, 1.1004, 1.1180, 1.0843, 1.1471, 1.0708, 1.0931,
                       1.1016, 1.0855, 1.0999, 1.0873, 1.0636, 1.0706, 1.1018, 1.1241, 1.0901,
                       1.1227, 1.1155, 1.1073, 1.1444, 1.0963, 1.0567, 1.0885, 1.1025, 1.1120,
                       1.0671, 1.0897, 1.0905, 1.0718, 1.0613, 1.0537, 1.0718, 1.0800, 1.0522,
                       1.0461, 1.0594, 1.0341, 1.0985, 1.0550, 1.0504, 1.0569, 1.080

In [70]:
best_acc

{'epoch': 5581,
 'loss': 3.8881218433380127,
 'recall': 0.6076475977897644,
 'acc': 0.868152379989624,
 'state_dict': OrderedDict([('layer_norm_0.weight',
               tensor([0.9389, 0.9213, 0.9208, 0.9209, 0.9198, 0.9202, 0.9207, 0.9199, 0.9201,
                       0.9204, 0.9199, 0.9204, 0.9205, 0.9196, 0.9199, 0.9200, 0.9205, 0.9199,
                       0.9216, 1.1737, 1.1679, 1.2877, 1.3077, 1.3104, 1.3245, 1.3574, 1.2822,
                       1.2615, 1.2682, 1.2247, 1.2582, 1.1765, 1.2059, 1.1812, 1.1833, 1.1510,
                       1.1820, 1.1552, 1.1147, 1.1038, 1.1176, 1.0862, 1.1501, 1.0750, 1.0937,
                       1.1020, 1.0895, 1.0995, 1.0853, 1.0648, 1.0727, 1.1059, 1.1244, 1.0899,
                       1.1263, 1.1164, 1.1094, 1.1444, 1.0988, 1.0563, 1.0852, 1.1046, 1.1151,
                       1.0668, 1.0891, 1.0897, 1.0712, 1.0643, 1.0564, 1.0768, 1.0830, 1.0512,
                       1.0524, 1.0610, 1.0351, 1.1017, 1.0557, 1.0550, 1.0632, 1.0833

In [71]:
my_lstm.cpu()
state = {'epoch': epoch, 'loss': loss.item(), 'recall': acc, 'acc': overall_acc, 'state_dict': my_lstm.state_dict(), 'opt_dict': optimizer.state_dict()}
state

{'epoch': 5852,
 'loss': 4.267449378967285,
 'recall': 0.11602499336004257,
 'acc': 0.30198749899864197,
 'state_dict': OrderedDict([('layer_norm_0.weight',
               tensor([0.1823, 0.1674, 0.1664, 0.1662, 0.1668, 0.1673, 0.1673, 0.1673, 0.1673,
                       0.1675, 0.1668, 0.1675, 0.1669, 0.1672, 0.1666, 0.1662, 0.1672, 0.1665,
                       0.1671, 0.3389, 0.3302, 0.4213, 0.4375, 0.4380, 0.4526, 0.4766, 0.4160,
                       0.4030, 0.4043, 0.3677, 0.4028, 0.3377, 0.3584, 0.3409, 0.3438, 0.3142,
                       0.3382, 0.3211, 0.2945, 0.2833, 0.2957, 0.2721, 0.3163, 0.2629, 0.2783,
                       0.2841, 0.2731, 0.2829, 0.2744, 0.2585, 0.2632, 0.2845, 0.3003, 0.2764,
                       0.2993, 0.2945, 0.2887, 0.3151, 0.2816, 0.2545, 0.2761, 0.2854, 0.2921,
                       0.2617, 0.2768, 0.2775, 0.2648, 0.2581, 0.2528, 0.2651, 0.2705, 0.2517,
                       0.2477, 0.2565, 0.2398, 0.2829, 0.2537, 0.2502, 0.2547, 0.27

In [72]:
torch.save(state, './3temp_checkpoint.pth')
# torch.save(state, './temp_checkpoint.pth')

In [50]:
my_lstm.cuda()
# state = torch.load('./temp_checkpoint.pth', map_location='cuda')
# state = torch.load('./3temp_checkpoint.pth', map_location='cuda')
# state = torch.load('./best.pth', map_location='cuda')
state = torch.load('./3best_recall.pth', map_location='cuda')
my_lstm.load_state_dict(state['state_dict'])
# optimizer.load_state_dict(state['opt_dict'])
epoch = state['epoch']
state

{'epoch': 5626,
 'loss': 3.8841054439544678,
 'recall': 0.6553428173065186,
 'acc': 0.8917142748832703,
 'state_dict': OrderedDict([('layer_norm_0.weight',
               tensor([0.9375, 0.9203, 0.9186, 0.9182, 0.9192, 0.9201, 0.9201, 0.9200, 0.9200,
                       0.9204, 0.9193, 0.9204, 0.9195, 0.9199, 0.9189, 0.9182, 0.9199, 0.9186,
                       0.9196, 1.1785, 1.1666, 1.2874, 1.3082, 1.3089, 1.3274, 1.3575, 1.2807,
                       1.2638, 1.2654, 1.2173, 1.2633, 1.1765, 1.2044, 1.1808, 1.1849, 1.1441,
                       1.1773, 1.1537, 1.1165, 1.1004, 1.1180, 1.0843, 1.1471, 1.0708, 1.0931,
                       1.1016, 1.0855, 1.0999, 1.0873, 1.0636, 1.0706, 1.1018, 1.1241, 1.0901,
                       1.1227, 1.1155, 1.1073, 1.1444, 1.0963, 1.0567, 1.0885, 1.1025, 1.1120,
                       1.0671, 1.0897, 1.0905, 1.0718, 1.0613, 1.0537, 1.0718, 1.0800, 1.0522,
                       1.0461, 1.0594, 1.0341, 1.0985, 1.0550, 1.0504, 1.0569, 1.080

In [102]:
generated_notes = [MyDataset(1, random_choice=False)[i][0].argmax().item() for i in range(500)]
# generated_notes = [70, 70, 70, 0, 70, 70, 70, 0, 69, 80]
start_len = len(generated_notes)

my_lstm.eval()
my_lstm.cuda()

hidden = my_lstm.init_hidden(batch_size=1, cuda=True)

with torch.no_grad():
    for _ in range(1000):
        input = torch.zeros(1, 1, 129, device='cuda')
        input.data[0, 0, generated_notes[-1]] = 1

        output, hidden = my_lstm(input, hidden, temperature=3.)
#         print(output)

        prediction = torch.multinomial(output.exp().view(1, 129), 1).item()
        generated_notes.append(prediction)
    torch.cuda.empty_cache()
print(generated_notes[start_len:])
# print(generated_notes)

[56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75, 87, 0, 56, 63, 75,

In [98]:
open('./generated3.txt', 'w', encoding='utf-8').write(''.join([chr(ord('\n') + num) for num in generated_notes[start_len:]]))

500

In [50]:
my_lstm.eval()
with torch.no_grad():
    my_lstm.cuda()
    predicted = []
    hidden = my_lstm.init_hidden(1, True)
    # count = 0
    process_length = 10000
    dataset_p = MyDataset(-1, total_batch=1, step_interval=1, random_choice=False)[0][0]
    for data in [dataset_p[i * process_length : (i + 1) * process_length] for i in range(0, torch.ceil(torch.tensor(len(dataset_p) / process_length)).to(torch.int).item(), 1)]:
        output, hidden = my_lstm(data.unsqueeze(0).cuda(), hidden)
        predicted.extend(output.argmax(dim=2).view(-1).cpu().tolist())
    #     count += 1
    #     if count == 100:
    #         break
    my_lstm.cpu()
    torch.cuda.empty_cache()
    print(predicted[:100])

[68, 0, 0, 0, 0, 0, 0, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 0, 54, 73, 54, 0, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 85, 0, 54, 61, 80, 0, 58, 80, 0, 58, 80, 0, 58, 80, 0, 58, 80, 0, 58, 80]


In [51]:
open('./predicted3.txt', 'w', encoding='utf-8').write(''.join([chr(ord('\n') + num) for num in predicted]))

457497

In [86]:
optimizer.state_dict()

{'state': {1540034079624: {'momentum_buffer': tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           ...,
           [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
          device='cuda:0')},
  1540034144368: {'momentum_buffer': tensor([[ 2.3477e-09, -1.0219e-08, -2.2433e-09,  ..., -1.3613e-08,
            -3.7946e-09, -8.5903e-09],
           [-3.0808e-09,  1.2355e-07,  5.8675e-08,  ...,  1.5693e-07,
             5.4997e-08,  1.0782e-07],
           [ 1.1489e-10,  5.5751e-08,  2.4654e-08,  ...,  7.2181e-08,
             2.4321e-08,  5.2192e-08],
           ...,
           [-4.2990e-09,  1.7401e-07,  8.7198e-08,  ...,  2.2825e-07,
             7.8105e-08,  

In [53]:
print(my_lstm)

SkipLSTM(
  (layer_norm_0): LayerNorm(torch.Size([129]), eps=1e-05, elementwise_affine=True)
  (lstm_1): LSTM(129, 300, batch_first=True)
  (layer_norm_1): LayerNorm(torch.Size([300]), eps=1e-05, elementwise_affine=True)
  (dropout_1): Dropout(p=0.3)
  (lstm_2): LSTM(429, 300, batch_first=True)
  (layer_norm_2): LayerNorm(torch.Size([300]), eps=1e-05, elementwise_affine=True)
  (dropout_2): Dropout(p=0.3)
  (lstm_3): LSTM(600, 300, batch_first=True)
  (layer_norm_3): LayerNorm(torch.Size([300]), eps=1e-05, elementwise_affine=True)
  (dropout_3): Dropout(p=0.3)
  (to_notes): Linear(in_features=600, out_features=129, bias=True)
  (activation): Sigmoid()
  (to_out): LogSoftmax()
)
