In [1]:
# To hold the python code for a blank CNN model

In [4]:
# Runs the transformer from simple_transformer


# Make sure we can import on the path we are on
import sys
sys.path.append('/workspace/fourth_year_project/MusicGen')

from MyAudioDataset import MyAudioDataset
from AudioCodesDataset import AudioCodesDataset
from audiocraft.models import CompressionModel
from audiocraft.models.encodec import InterleaveStereoCompressionModel




# Compression model, shortens 10secs to 8,500
model = CompressionModel.get_pretrained('facebook/encodec_32khz')
comp_model = InterleaveStereoCompressionModel(model).cuda()

#mydataset = MyAudioDataset('/workspace/small_model_data3', 'recording_01_')
audio_codes_dataset = AudioCodesDataset(comp_model)
audio_codes_dataset.load_data('90_degree_compress_tensors_10sec_augmented.pkl')
#audio_codes_dataset.set_audio_dataset(mydataset)

assert len(audio_codes_dataset) == 5130, "Dataset is not the right size"



In [85]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

class AudioCNN(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super(AudioCNN, self).__init__()
        # self.conv1 = nn.Conv1d(8, 16, kernel_size=3, stride=1, padding=1)
        # self.bn1 = nn.BatchNorm1d(16)
        # self.relu1 = nn.ReLU()
        # self.dropout1 = nn.Dropout(dropout_rate)
        
        # self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        # self.bn2 = nn.BatchNorm1d(32)
        # self.relu2 = nn.ReLU()
        # self.dropout2 = nn.Dropout(dropout_rate)
        
        # self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        # self.bn3 = nn.BatchNorm1d(64)
        # self.relu3 = nn.ReLU()
        # self.dropout3 = nn.Dropout(dropout_rate)
        
        # self.conv4 = nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1)
        # self.bn4 = nn.BatchNorm1d(128)
        # self.relu4 = nn.ReLU()
        # self.dropout4 = nn.Dropout(dropout_rate)
        
        # self.conv5 = nn.Conv1d(128, 8, kernel_size=1, stride=1, padding=0)
        # self.bn5 = nn.BatchNorm1d(8)
        # self.relu5 = nn.ReLU() 
        # self.dropout5 = nn.Dropout(dropout_rate)
        self.conv1 = nn.Conv1d(8, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(16)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)
        
        self.conv4 = nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm1d(128)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(dropout_rate)
        
        self.conv5 = nn.Conv1d(128, 256, kernel_size=3, stride=1, padding=1)  # Increased filters
        self.bn5 = nn.BatchNorm1d(256)  # Increased filters
        self.relu5 = nn.ReLU()
        self.dropout5 = nn.Dropout(dropout_rate)

        

        
        # self.fc1 = nn.Linear(, 1024)  # Adjust this
        # self.fc2 = nn.Linear(1024, 8*500)  # Output size is [8, 500]
        self.fc1 = nn.Linear(256*500, 1024)  # Added fully connected layer
        self.fc2 = nn.Linear(1024, 8*500)  # Added fully connected layer

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.dropout4(x)
        
        x = self.conv5(x)
        x = self.bn5(x)
        x = self.relu5(x)
        x = self.dropout5(x)

        #print(x.size())

        x = x.view(x.size(0), -1)  # Flatten the tensor
        #print(x.size())  # Print the size of the tensor here
        x = self.fc1(x)
        x = self.fc2(x)
        #print(x.size())
        x = x.view(-1, 8, 500)
        #print(x.size())
        
        
        return x
    def train_loop(self, dataset, batch_size, epochs, lr):
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.Adam(self.parameters(), lr=lr)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
        criterion = nn.MSELoss()

        for epoch in range(epochs):
            for i, (_, targets, _, inputs, angle, sr) in enumerate(dataloader):
                #inputs, targets = batch
                inputs, targets = inputs.cuda(), targets.cuda()

                optimizer.zero_grad()
                outputs = self(inputs)
                #print(outputs.shape, targets.shape)
                #print(outputs[0,i,:10], targets[0,i,:10])
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
            
            scheduler.step()

            if epoch % 5 == 0:
                print(f'Epoch: {epoch}, Loss: {loss.item()}, LR: {scheduler.get_last_lr()[0]}')
            
            # Save the model every 10 epochs
            if epoch % 10 == 0:
                torch.save(self.state_dict(), f'model_epoch_{epoch}.pt')

In [1]:
# What does our data look like?

#audio_codes_dataset.data_map[0]

In [88]:
# Create transformer
myTransformer = AudioCNN(dropout_rate=0.1).cuda()
myTransformer.train()


AudioCNN(
  (conv1): Conv1d(8, 16, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.1, inplace=False)
  (conv2): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.1, inplace=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (dropout3): Dropout(p=0.1, inplace=False)
  (conv4): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu4): ReLU()
  (dropout4): Dropout(p=0.1, inplace=False)
  (conv5): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn5): BatchNorm1d(256, ep

In [89]:
# Runs our training function
myTransformer.train_loop(dataset=audio_codes_dataset, batch_size=4, epochs=600, lr=0.01)

Epoch: 0, Loss: 19.983251571655273, LR: 0.1
Epoch: 10, Loss: 0.6055828928947449, LR: 0.1
Epoch: 20, Loss: 0.5853830575942993, LR: 0.1
Epoch: 30, Loss: 0.33640792965888977, LR: 0.010000000000000002
Epoch: 40, Loss: 0.49098488688468933, LR: 0.010000000000000002
Epoch: 50, Loss: 0.4325709939002991, LR: 0.010000000000000002
Epoch: 60, Loss: 0.4615749716758728, LR: 0.0010000000000000002
Epoch: 70, Loss: 0.6343490481376648, LR: 0.0010000000000000002
Epoch: 80, Loss: 0.7377737760543823, LR: 0.0010000000000000002
Epoch: 90, Loss: 0.7034205794334412, LR: 0.00010000000000000003
Epoch: 100, Loss: 0.5588409900665283, LR: 0.00010000000000000003
Epoch: 110, Loss: 0.627485990524292, LR: 0.00010000000000000003
Epoch: 120, Loss: 0.4313850402832031, LR: 1.0000000000000004e-05
Epoch: 130, Loss: 0.46716687083244324, LR: 1.0000000000000004e-05
Epoch: 140, Loss: 0.5009002685546875, LR: 1.0000000000000004e-05
Epoch: 150, Loss: 0.36022865772247314, LR: 1.0000000000000004e-06
Epoch: 160, Loss: 0.43968313932418

KeyboardInterrupt: 