<a href="https://colab.research.google.com/github/the-crHack/email/blob/main/HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt install unzip

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unzip is already the newest version (6.0-26ubuntu3.2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
!pip install py_midicsv==4.1.2
!pip install midi_player==0.5.1

Collecting py_midicsv==4.1.2
  Downloading py_midicsv-4.1.2-py3-none-any.whl.metadata (4.7 kB)
Collecting rich-click<2.0.0,>=1.8.3 (from py_midicsv==4.1.2)
  Downloading rich_click-1.8.5-py3-none-any.whl.metadata (7.9 kB)
Downloading py_midicsv-4.1.2-py3-none-any.whl (16 kB)
Downloading rich_click-1.8.5-py3-none-any.whl (35 kB)
Installing collected packages: rich-click, py_midicsv
Successfully installed py_midicsv-4.1.2 rich-click-1.8.5
Collecting midi_player==0.5.1
  Downloading midi_player-0.5.1-py3-none-any.whl.metadata (2.2 kB)
Downloading midi_player-0.5.1-py3-none-any.whl (6.4 kB)
Installing collected packages: midi_player
Successfully installed midi_player-0.5.1


In [None]:
!unzip /content/sample_data/train-20241205T181153Z-001.zip -d /content/sample_data

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import glob
import numpy as np
import torch.nn.functional as F



In [None]:

class SongsDataset(Dataset):
    def __init__(self, files, context_window=64, max_samples_per_song=100, stride=4):
        """
        Parameters:
        - files: List of song file paths.
        - context_window: Number of previous events to include in each sample.
        - max_samples_per_song: Maximum number of samples to extract from each song.
        - stride: Step size for sliding the context window (reduces overlap).
        """
        self.context_window = context_window
        self.data = []
        self.labels = []

        for file in files:
            song_data = torch.load(file)

            # Calculate possible start indices with stride
            indices = range(0, len(song_data) - context_window, stride)

            # Randomly sample up to max_samples_per_song indices
            sampled_indices = random.sample(list(indices), min(max_samples_per_song, len(indices)))

            # Create context and label pairs
            for i in sampled_indices:
                self.data.append(song_data[i:i + context_window])
                self.labels.append(song_data[i + context_window])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


In [None]:
class NotePredictionModel(nn.Module):
    def __init__(self, context_window=64, note_dim=4):
        super(NotePredictionModel, self).__init__()
        self.lstm = nn.LSTM(input_size=note_dim, hidden_size=128, num_layers=2, batch_first=True)
        self.fc = nn.Linear(128, 134)  # Output: [µ_t, σ_t, µ_d, σ_d, log_probs_n, µ_v, σ_v]

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = hidden[-1]  # Use the last layer's hidden state
        output = self.fc(hidden)
        return output

In [None]:
def handle_nan_inf(tensor, default_value=0):
    tensor = torch.where(torch.isnan(tensor), torch.tensor(default_value, dtype=tensor.dtype), tensor)
    tensor = torch.where(torch.isinf(tensor), torch.tensor(default_value, dtype=tensor.dtype), tensor)
    return tensor

def negative_log_likelihood(output, target):
    # Unpack outputs
    mu_t, sigma_t, mu_d, sigma_d, logits_n, mu_v, sigma_v = output.split([1, 1, 1, 1, 128, 1, 1], dim=1)
    t, d, n, v = target.split([1, 1, 1, 1], dim=1)

    # Ensure sigma is positive by adding a small epsilon
    epsilon = 1e-6
    sigma_t = torch.clamp(sigma_t, min=epsilon)
    sigma_d = torch.clamp(sigma_d, min=epsilon)
    sigma_v = torch.clamp(sigma_v, min=epsilon)

    # Compute losses for t, d, v (Gaussian distribution)
    loss_t = 0.5 * torch.log(sigma_t ** 2) + ((t - mu_t) ** 2) / (2 * sigma_t ** 2)
    loss_d = 0.5 * torch.log(sigma_d ** 2) + ((d - mu_d) ** 2) / (2 * sigma_d ** 2)
    loss_v = 0.5 * torch.log(sigma_v ** 2) + ((v - mu_v) ** 2) / (2 * sigma_v ** 2)

    # Categorical cross-entropy for note value (logits for 128 categories)
    # Make sure the target `n` is of type Long (for cross entropy)
    loss_n = F.cross_entropy(logits_n, n.squeeze().long())  # Cast `n` to long

    return loss_t.mean() + loss_d.mean() + loss_v.mean() + loss_n.mean()


# Gradient Clipping to Avoid Exploding Gradients
def clip_gradients(model, max_norm=1.0):
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)


In [None]:
# Main Training Loop
def train(model, data_loader, num_epochs=10, learning_rate=1e-4, gradient_clip=1.0):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0
        for step, (context, target) in enumerate(data_loader):
            optimizer.zero_grad()

            # Handle NaN/Inf in the input data
            context = handle_nan_inf(context)
            target = handle_nan_inf(target)

            # Forward pass
            output = model(context.float())


            # Handle NaN/Inf in the output
            output = handle_nan_inf(output)

            # Calculate loss
            loss = negative_log_likelihood(output, target.float())
            total_loss += loss.item()

            # Backward pass
            loss.backward()

            # Handle NaN/Inf gradients
            for param in model.parameters():
                if param.grad is not None:
                    param.grad = handle_nan_inf(param.grad)

            # Clip gradients to avoid exploding gradients
            # clip_gradients(model, max_norm=gradient_clip)

            # Update weights
            optimizer.step()

            # Print loss every 100 steps
            if step % 100 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Step {step}/{len(data_loader)}, Loss: {loss.item()}")

        print(f"Epoch {epoch+1} completed, Total Loss: {total_loss / len(data_loader)}")


In [None]:
# Load training data
train_files = glob.glob("/content/sample_data/train/*/*/*/*.pt")
dataset = SongsDataset(train_files, context_window=64, max_samples_per_song=100, stride=3)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Instantiate model
model = NotePredictionModel()

# Start training
train(model, data_loader, num_epochs=10, learning_rate=1e-5)

In [None]:

# Helper function: Normalize inputs
def normalize(data):
    data = data.float()  # Ensure the input is in floating-point format
    mean = data.mean(dim=0)
    std = data.std(dim=0)

    # Handle cases where std is zero by replacing NaN values with 1
    std[std == 0] = 1.0  # Avoid division by zero
    normalized_data = (data - mean) / std

    # Replace any remaining NaNs with 0
    normalized_data = torch.where(torch.isnan(normalized_data), torch.tensor(0.0, dtype=data.dtype), normalized_data)

    return normalized_data


# Helper function: Sanitize tensors to replace NaNs or Inf
def sanitize_tensor(tensor, default_value=0.0):
    tensor = torch.where(torch.isnan(tensor), torch.tensor(default_value, dtype=tensor.dtype), tensor)
    tensor = torch.where(torch.isinf(tensor), torch.tensor(default_value, dtype=tensor.dtype), tensor)
    return tensor


class SongsDataset(Dataset):
    def __init__(self, files, context_window=64, max_samples_per_song=100, stride=4):
        self.context_window = context_window
        self.data = []
        self.labels = []

        for file in files:
            song_data = torch.load(file)
            indices = range(0, len(song_data) - context_window, stride)
            sampled_indices = random.sample(list(indices), min(max_samples_per_song, len(indices)))

            for i in sampled_indices:
                self.data.append(song_data[i:i + context_window])
                self.labels.append(song_data[i + context_window])

        # Calculate global min/max for scaling
        all_data = torch.cat(self.data, dim=0)
        self.min_values = all_data.min(dim=0).values
        self.max_values = all_data.max(dim=0).values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context = self.data[idx].float()
        target = self.labels[idx].float()

        # Normalize context and target using min-max scaling
        context = (context - self.min_values) / (self.max_values - self.min_values)
        target = (target - self.min_values) / (self.max_values - self.min_values)

        return context, target



class NotePredictionModel(nn.Module):
    def __init__(self, context_window=64, note_dim=4):
        super(NotePredictionModel, self).__init__()
        self.lstm = nn.LSTM(input_size=note_dim, hidden_size=128, num_layers=2, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(128, 134)  # Output: [µ_t, σ_t, µ_d, σ_d, logits_n, µ_v, σ_v]

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # Shape: [batch_size, sequence_length, hidden_size]
        lstm_out = torch.mean(lstm_out, dim=1)  # Mean pooling across time steps
        output = self.fc(lstm_out)  # Shape: [batch_size, 134]
        return output

def normal_nll_loss(output, target):
    # Unpack model outputs
    mu_t, sigma_t, mu_d, sigma_d, logits_n, mu_v, sigma_v = output.split([1, 1, 1, 1, 128, 1, 1], dim=1)

    # Unpack targets
    t, d, n, v = target.split([1, 1, 1, 1], dim=1)

    # Ensure sigma values are in a stable range
    epsilon = 1e-2
    max_value = 10.0
    sigma_t = torch.clamp(sigma_t, min=epsilon, max=max_value)
    sigma_d = torch.clamp(sigma_d, min=epsilon, max=max_value)
    sigma_v = torch.clamp(sigma_v, min=epsilon, max=max_value)

    # Time, duration, and volume losses
    loss_t = 0.5 * torch.log(sigma_t ** 2) + ((t - mu_t) ** 2) / (2 * sigma_t ** 2)
    loss_d = 0.5 * torch.log(sigma_d ** 2) + ((d - mu_d) ** 2) / (2 * sigma_d ** 2)
    loss_v = 0.5 * torch.log(sigma_v ** 2) + ((v - mu_v) ** 2) / (2 * sigma_v ** 2)

    # Note loss (Categorical Cross-Entropy)
    loss_n = F.cross_entropy(logits_n, n.squeeze().long())

    # Total loss
    return loss_t.mean() + loss_d.mean() + loss_v.mean() + loss_n.mean()


# Gradient Clipping to Avoid Exploding Gradients
def clip_gradients(model, max_norm=1.0):
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)


# # Initialize model weights
# def initialize_weights(m):
#     if isinstance(m, nn.Linear):
#         nn.init.xavier_uniform_(m.weight)
#         if m.bias is not None:
#             nn.init.zeros_(m.bias)


def train(model, data_loader, num_epochs=10, learning_rate=1e-6, gradient_clip=1.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.apply(initialize_weights)

    for epoch in range(num_epochs):
        total_loss = 0
        for step, (context, target) in enumerate(data_loader):
            optimizer.zero_grad()

            # Forward pass
            context = sanitize_tensor(context, default_value=0.0)
            target = sanitize_tensor(target, default_value=0.0)
            output = model(context.float())
            output = sanitize_tensor(output, default_value=0.0)

            if epoch == 0 and step == 0:
              print(f"Initial Predictions (mu_t): {output[:, 0].detach().flatten()[:5]}")
              print(f"Initial Predictions (sigma_t): {output[:, 1].detach().flatten()[:5]}")
              print(f"Targets (t): {target[:, 0].flatten()[:5]}")


            # Calculate loss
            loss = normal_nll_loss(output, target.float())
            total_loss += loss.item()

            # Backward pass
            loss.backward()

            # Clip gradients to avoid exploding gradients
            clip_gradients(model, max_norm=gradient_clip)

            # Update weights
            optimizer.step()

            # Print loss every 100 steps
            if step % 100 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Step {step}/{len(data_loader)}, Loss: {loss.item()}")

        print(f"Epoch {epoch+1} completed, Total Loss: {total_loss / len(data_loader)}")

# # Main script to run training
# if __name__ == "__main__":
    # Load training data
train_files = glob.glob("/content/sample_data/train/*/*/*/*.pt")
dataset = SongsDataset(train_files, context_window=64, max_samples_per_song=100, stride=4)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Instantiate model
model = NotePredictionModel()

# Start training
train(model, data_loader, num_epochs=10, learning_rate=1e-5)


In [None]:
first_batch = next(iter(data_loader))
inputs, labels = first_batch
print(inputs[0])
print(labels[0])


RuntimeError: Trying to override a python impl for DispatchKey.Meta on operator aten::broadcast_tensors

In [None]:

class SongsDataset(Dataset):
    def __init__(self, files, context_window=64, stride=4, max_samples_per_song=100):
        self.data = []  # List to store input sequences (X)
        self.labels = []  # List to store corresponding labels (Y)

        # Iterate over each song file
        for file in files:
            # Load the song data (assuming it's stored as a tensor)
            song_data = torch.load(file)  # song_data should be a tensor with shape (num_events, 4)

            # Create indices for sliding window
            indices = range(0, len(song_data) - context_window, stride)
            sampled_indices = random.sample(list(indices), min(max_samples_per_song, len(indices)))

            # Extract data slices and labels
            for i in sampled_indices:
                # Input sequence: slice of notes with size `context_window`
                self.data.append(song_data[i:i + context_window])  # Shape: (context_window, 4)
                # Label: the next note after the context window
                self.labels.append(song_data[i + context_window])  # Shape: (1, 4)

    def __len__(self):
        # Return the number of samples in the dataset
        return len(self.data)

    def __getitem__(self, idx):
        # Return the input (X) and label (Y) for the given index
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)


# --- Model Definition ---

class NotePredictionModel(nn.Module):
    def __init__(self):
        super(NotePredictionModel, self).__init__()

        # Define layers: A simple 2-layer LSTM followed by a fully connected layer
        self.lstm = nn.LSTM(input_size=4, hidden_size=128, batch_first=True)
        self.fc = nn.Linear(128, 134)  # We predict 134 values: [µ_t, σ_t, µ_d, σ_d, log(π0), ..., log(π127), µ_v, σ_v]

    def forward(self, x):
      # LSTM layer
      lstm_out, (h_n, c_n) = self.lstm(x)

      # Take the last LSTM output
      last_output = lstm_out[:, -1, :]  # (batch_size, hidden_size)

      # Fully connected layer to predict the required values
      output = self.fc(last_output)  # Shape: (batch_size, 134)

      # Split the output into the predicted values
      # Expecting 134 values for [µ_t, σ_t, µ_d, σ_d, log(π0), ..., log(π127), µ_v, σ_v]
      µ_t, σ_t, µ_d, σ_d, logits_n, µ_v, σ_v = output.split([1, 1, 1, 1, 128, 1, 1], dim=-1)


      # Return the predicted values as a tensor of shape (batch_size, 134)
      return torch.cat((µ_t, σ_t, µ_d, σ_d, logits_n, µ_v, σ_v), dim=-1)


def nll_loss_continuous(pred_mu, pred_sigma, target):
    # Avoid division by zero and log(0) errors
    epsilon = 1e-6
    pred_sigma = torch.max(pred_sigma, torch.tensor(epsilon))  # Prevent log(0)

    # Calculate the NLL for normal distribution
    loss = 0.5 * torch.log(2 * torch.tensor(torch.pi)) + torch.log(pred_sigma) + (target - pred_mu) ** 2 / (2 * pred_sigma ** 2)
    return loss.mean()


def train_model(model, data_loader, epochs, learning_rate):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for step, (context, target) in enumerate(data_loader):

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            output = model(context)

            # Split the output into predicted values (this should be 134 values)
            µ_t_pred, σ_t_pred, µ_d_pred, σ_d_pred, logits_n_pred, µ_v_pred, σ_v_pred = output.split([1, 1, 1, 1, 128, 1, 1], dim=-1)
            logits_n_pred = F.softmax(logits_n_pred, dim=-1)

            # Calculate the loss for continuous distributions using NLL
            t_loss = nll_loss_continuous(µ_t_pred, σ_t_pred, target[:, 0])
            d_loss = nll_loss_continuous(µ_d_pred, σ_d_pred, target[:, 1])
            v_loss = nll_loss_continuous(µ_v_pred, σ_v_pred, target[:, 2])

            # Calculate the categorical cross entropy for the note value logits
            nll_loss = nn.CrossEntropyLoss()(logits_n_pred, target[:, 3].long())

            # Total loss
            loss = t_loss + d_loss + v_loss + nll_loss

            # Print loss every 100 steps
            if step % 100 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Step {step}/{len(data_loader)}, Loss: {loss.item()}")


            # Backward pass
            loss.backward()

            # Optimize the model
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(data_loader)}")


# Example of file list with paths (you need to replace these with actual paths)
train_files = glob.glob("/content/sample_data/train/*/*/*/*.pt")  # Replace with your actual data files

# Initialize the dataset and dataloader
context_window = 64  # Size of the context window
stride = 4  # Step size
max_samples_per_song = 250  # Max samples per song

dataset = SongsDataset(train_files, context_window=context_window, stride=stride, max_samples_per_song=max_samples_per_song)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# --- Model Training ---

# Initialize the model
model = NotePredictionModel()

# Train the model
train_model(model, data_loader, epochs=10, learning_rate=0.01)


  song_data = torch.load(file)  # song_data should be a tensor with shape (num_events, 4)
  return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)


Epoch 1/10, Step 0/2988, Loss: 2.6093972445619814e+17
Epoch 1/10, Step 100/2988, Loss: 4107.1298828125
Epoch 1/10, Step 200/2988, Loss: 3699.735107421875
Epoch 1/10, Step 300/2988, Loss: 5917.50634765625
Epoch 1/10, Step 400/2988, Loss: 2125.960693359375
Epoch 1/10, Step 500/2988, Loss: 2472.532958984375
Epoch 1/10, Step 600/2988, Loss: 2478.284912109375
Epoch 1/10, Step 700/2988, Loss: 1541.404052734375
Epoch 1/10, Step 800/2988, Loss: 456.9059143066406
Epoch 1/10, Step 900/2988, Loss: 373.4855651855469
Epoch 1/10, Step 1000/2988, Loss: 630.6001586914062
Epoch 1/10, Step 1100/2988, Loss: 379.1097717285156
Epoch 1/10, Step 1200/2988, Loss: 589.2061767578125
Epoch 1/10, Step 1300/2988, Loss: 939.0614013671875
Epoch 1/10, Step 1400/2988, Loss: 1060.5928955078125
Epoch 1/10, Step 1500/2988, Loss: 1187.989990234375
Epoch 1/10, Step 1600/2988, Loss: 257.06390380859375
Epoch 1/10, Step 1700/2988, Loss: 273.1893005371094
Epoch 1/10, Step 1800/2988, Loss: 413.1756591796875
Epoch 1/10, Step 190