In [22]:
import pandas as pd
import os 
import numpy as np  
import librosa 
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import torch  
import torch.nn as nn   
from typing import List
import pytorch_lightning as pl  
import torch.nn.functional as F 

In [23]:
# Label table for the audio clips; later cells read its 'file_name' and
# 'root_note' columns and drop the remaining annotation columns.
audio_dataframe = pd.read_csv(filepath_or_buffer='label_dataframe.csv')

In [43]:
class CustomDataset(Dataset):
    """Dataset of pre-computed chromagrams stored as ``.npy`` files.

    Each item is a ``(data, file_name, root_note)`` triple where ``data`` is the
    chromagram loaded from ``<root_dir>/<stem>_chromagram.npy`` and ``stem`` is the
    part of ``file_name`` that precedes ``'.wav'``.

    Args:
        dataframe: Table providing a ``'file_name'`` and a ``'root_note'`` column.
        root_dir: Directory that contains the ``*_chromagram.npy`` files.
    """

    def __init__(self, dataframe, root_dir):
        self.root_dir = root_dir
        # Despite its name this attribute holds only the file-name column.
        self.dataframe = dataframe['file_name']
        self.root_note = dataframe['root_note']

    def __len__(self):
        """Return the number of file names held by the dataset."""
        return len(self.dataframe.index)

    def __getitem__(self, idx):
        """Load the chromagram at position ``idx``.

        Returns:
            Tuple ``(data, file_name, root_note)``; ``data`` is a ``torch.Tensor``.

        Raises:
            FileNotFoundError: The chromagram file does not exist.
            Exception: Any other loading error is reported and re-raised.
        """
        # Positional lookups: the frame's index labels are arbitrary after splitting.
        wav_name = self.dataframe.iloc[idx]
        note = self.root_note.iloc[idx]

        stem = wav_name.split('.wav')[0]
        chromagram_path = os.path.join(self.root_dir, stem + '_chromagram.npy')

        try:
            chroma_array = np.load(chromagram_path)
            data = torch.Tensor(chroma_array)
        except FileNotFoundError as e:
            print(f"Error: File not found: {chromagram_path}")
            raise e
        except Exception as e:
            print(f"An error occurred while loading the file: {chromagram_path}")
            raise e

        return data, wav_name, note
    # def __len__(self):
    #     return len(self.dataframe)

    # def __getitem__(self, idx):
    #     # chromagram_path = os.path.join(self.root_dir, self.dataframe.iloc[idx][0].split('.wav')[0] + '_chromagram.npy')
    #     # return np.load(chromagram_path)
    #     file_name = self.dataframe.iat[idx, 0]
    #     chromagram_path = os.path.join(self.root_dir, file_name.split('.wav')[0] + '_chromagram.npy')
    #     return np.load(chromagram_path)

In [44]:
# Annotation columns that are dropped; 'file_name' and 'root_note' stay in X
# because CustomDataset reads both of them.
columns_to_drop = ['label', 'instrument', 'type_of_sound', 'chord_type']

X = audio_dataframe.drop(columns=columns_to_drop)
y = audio_dataframe['root_note']

# 60 / 20 / 20 split: hold out 20% for testing, then take 25% of the remaining
# 80% as the validation set. Fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Sanity check: every entry of the first column must be a string from which a
# chromagram path can be built. `.iloc[:, 0]` selects that column by position
# (the original `row[0]` relied on deprecated positional access via a label indexer).
for index, file_name in X_train.iloc[:, 0].items():
    try:
        chroma_path = os.path.join('IDMT-SMT-CHORDS/chromagrams', file_name.split('.wav')[0] + '_chromagram.npy')
    except AttributeError:
        # Non-string values (e.g. NaN) have no .split(); report the offending row.
        print(f'Row {index}: cannot build a chromagram path from {file_name!r}')


# NOTE: despite their names these are Dataset objects, not DataLoaders.
train_dataloader = CustomDataset(dataframe=X_train, root_dir='IDMT-SMT-CHORDS/chromagrams')
test_dataloader = CustomDataset(dataframe=X_test, root_dir='IDMT-SMT-CHORDS/chromagrams')
val_dataloader = CustomDataset(dataframe=X_val, root_dir='IDMT-SMT-CHORDS/chromagrams')

  chroma_oath = os.path.join('IDMT-SMT-CHORDS/chromagrams', row[0].split('.wav')[0] + '_chromagram.npy')


In [45]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

class Encoder(nn.Module):
    """Fully connected encoder.

    Each entry of ``hidden_dims`` adds one ``Linear -> ReLU -> BatchNorm1d`` stage;
    a final ``Linear`` layer projects onto ``encoded_dim`` features.

    Args:
        input_dim: Number of input features.
        hidden_dims: Widths of the hidden stages, in order.
        encoded_dim: Number of output (latent) features.
    """

    def __init__(self, input_dim, hidden_dims, encoded_dim):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) sizes of each stage.
        widths = [input_dim, *hidden_dims]
        stages = [
            nn.Sequential(
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
                nn.BatchNorm1d(fan_out),
            )
            for fan_in, fan_out in zip(widths, widths[1:])
        ]
        stages.append(nn.Linear(widths[-1], encoded_dim))
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked stages and return the latent code."""
        latent = self.network(x)
        return latent


class Decoder(nn.Module):
    """Fully connected decoder that mirrors an encoder's hidden widths.

    ``hidden_dims`` is given in *encoder* order; the decoder walks it in reverse,
    adding one ``Linear -> ReLU -> BatchNorm1d`` stage per entry, and finishes
    with a ``Linear`` layer that produces ``output_dim`` features.

    Args:
        encoded_dim: Number of latent features fed into the decoder.
        hidden_dims: Hidden widths in encoder order (any sequence); not modified.
        output_dim: Number of reconstructed output features.
    """

    def __init__(self, encoded_dim, hidden_dims, output_dim):
        super(Decoder, self).__init__()
        layers = []
        # Walk a reversed *copy*: calling hidden_dims.reverse() here would flip the
        # caller's list in place, so a list shared with the encoder (or reused by
        # a later cell) would silently change order.
        for h_dim in reversed(list(hidden_dims)):
            layers.append(nn.Sequential(
                nn.Linear(encoded_dim, h_dim),
                nn.ReLU(),
                nn.BatchNorm1d(h_dim)
            ))
            encoded_dim = h_dim
        layers.append(nn.Linear(encoded_dim, output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map latent codes ``x`` back to ``output_dim`` features."""
        return self.network(x)


class Autoencoder(pl.LightningModule):
    """Fully connected autoencoder trained with an MSE reconstruction loss.

    Batches are expected to be 3-tuples whose first element is the input tensor
    (the remaining two elements are ignored during training/validation). Inputs
    are flattened to ``(batch, features)`` before being passed to the network.

    Args:
        input_dim: Number of features per flattened input.
        hidden_dims: Hidden widths of the encoder; the decoder mirrors them.
            Defaults to ``[128, 64, 32]``. The passed sequence is not modified.
        encoded_dim: Size of the latent code.
        learning_rate: Learning rate for the Adam optimizer.
    """

    def __init__(self, input_dim=12, hidden_dims=None, encoded_dim=12, learning_rate=1e-4):
        super(Autoencoder, self).__init__()
        if hidden_dims is None:
            hidden_dims = [128, 64, 32]
        # Hand each sub-network its own copy so that neither of them can alter
        # the caller's list (or each other's view of it).
        self.encoder = Encoder(input_dim, list(hidden_dims), encoded_dim)
        self.decoder = Decoder(encoded_dim, list(hidden_dims), input_dim)
        self.learning_rate = learning_rate

    def forward(self, x):
        """Encode then decode ``x`` and return the reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def _reconstruction_loss(self, batch):
        """Return the MSE between the flattened inputs of ``batch`` and their reconstruction."""
        inputs, _, _ = batch
        inputs = inputs.view(inputs.size(0), -1)
        outputs = self(inputs)
        return F.mse_loss(outputs, inputs)

    def training_step(self, batch, batch_idx):
        """Compute and log the reconstruction loss as ``train_loss``."""
        loss = self._reconstruction_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the reconstruction loss as ``val_loss``."""
        loss = self._reconstruction_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def encode(self, x):
        """Return the latent code for ``x`` (expects already-flattened input)."""
        return self.encoder(x)

In [46]:
# Hyperparameters
# Features per flattened chromagram fed to the model.
# NOTE(review): presumably the 12 pitch classes — confirm against the .npy shape.
input_dim = 12
hidden_dims = [128, 64, 32]
encoded_dim = 12
learning_rate = 1e-4
batch_size = 64
num_epochs = 10

# Directory holding the pre-computed '<stem>_chromagram.npy' files.
CHROMAGRAM_DIR = 'IDMT-SMT-CHORDS/chromagrams'

# Annotation columns that are dropped; 'file_name' and 'root_note' stay in X
# because CustomDataset reads both of them.
columns_to_drop = ['label', 'instrument', 'type_of_sound', 'chord_type']

X = audio_dataframe.drop(columns=columns_to_drop)
y = audio_dataframe['root_note']

# 60 / 20 / 20 split: hold out 20% for testing, then take 25% of the remaining
# 80% as the validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Only the training loader shuffles, so each epoch sees the batches in a new
# order; evaluation and export loaders keep a stable order.
train_dataset = CustomDataset(dataframe=X_train, root_dir=CHROMAGRAM_DIR)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(dataframe=X_test, root_dir=CHROMAGRAM_DIR)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

val_dataset = CustomDataset(dataframe=X_val, root_dir=CHROMAGRAM_DIR)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Unsplit data in its original row order, used to export encodings for every file.
full_dataset = CustomDataset(dataframe=X, root_dir=CHROMAGRAM_DIR)
full_dataloader = DataLoader(full_dataset, batch_size=batch_size, shuffle=False)


Training step

In [47]:

# Seed the Python, NumPy and PyTorch RNGs so that weight initialisation (and any
# data shuffling) is reproducible across kernel restarts.
pl.seed_everything(42, workers=True)

autoencoder = Autoencoder(
    input_dim=input_dim,
    hidden_dims=hidden_dims,
    encoded_dim=encoded_dim,
    learning_rate=learning_rate,
)

# Training
trainer = pl.Trainer(max_epochs=num_epochs)
trainer.fit(autoencoder, train_dataloader, val_dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 12.8 K
1 | decoder | Decoder | 12.8 K
------------------------------------
25.7 K    Trainable params
0         Non-trainable params
25.7 K    Total params
0.103     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/homebrew/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/opt/homebrew/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [48]:
# Inference to get encoded data
autoencoder.eval()  # Evaluation mode: BatchNorm layers use their running statistics

encoded_data_list = []
file_names_list = []
root_note_list = []

# The model may still live on an accelerator after training; inputs have to be
# moved to the same device before they can be encoded.
model_device = autoencoder.device

# Iterate over DataLoader (no gradients are needed for inference)
with torch.no_grad():
    for batch_data, file_names, root_notes in full_dataloader:
        # Flatten each item to a feature vector, as done during training.
        batch_data = batch_data.view(batch_data.size(0), -1).float().to(model_device)
        encoded_batch = autoencoder.encode(batch_data)
        encoded_data_list.append(encoded_batch.cpu().numpy())
        file_names_list.extend(file_names)
        root_note_list.extend(root_notes)

# Concatenate all encoded data
encoded_data = np.concatenate(encoded_data_list, axis=0)

# Save to DataFrame with filenames; column count follows the actual encoding width
encoded_df = pd.DataFrame(
    encoded_data,
    columns=[f'encoded_{i}' for i in range(encoded_data.shape[1])],
)
encoded_df.insert(0, 'file_name', file_names_list)
encoded_df.insert(1, 'root_note', root_note_list)
print(encoded_df.head())  # Display the first few rows of the DataFrame

# Save the DataFrame to a CSV file
encoded_df.to_csv('encoded_data.csv', index=False)

  file_name root_note  encoded_0  encoded_1  encoded_2  encoded_3  encoded_4  \
0  0000.wav         C   0.618062   0.891722   0.282112   0.146800   0.478934   
1  0001.wav         C  -0.322870   0.314766   0.694106  -0.305864  -0.163472   
2  0002.wav         C   0.635033   0.434493   1.508799  -0.677667   0.232426   
3  0003.wav        C#  -0.569648  -0.249012   0.913353   0.658161   1.618873   
4  0004.wav        C#   0.937316   0.078704   3.566514  -0.076888   2.564118   

   encoded_5  encoded_6  encoded_7  encoded_8  encoded_9  encoded_10  \
0  -1.229736   0.094583  -0.302654  -1.085481  -1.637345    0.959388   
1   0.502251  -0.084436  -0.158671  -0.008823  -0.672769    0.965743   
2  -1.076412   0.082173  -0.665262  -0.818527  -1.500026    0.785322   
3   0.392390   0.976987  -0.953056   0.643535   0.577639   -0.267624   
4   1.069315  -0.471510   0.013139  -0.240945   2.792057   -1.551140   

   encoded_11  
0   -0.334417  
1   -0.136161  
2   -1.133469  
3   -0.066476  
4   -0

In [None]:

# # Encoding
# encoded_data = autoencoder.encoder(sample_data)
# print("Encoded data:", encoded_data)

# # Decoding
# decoded_data = autoencoder.decoder(encoded_data)
# print("Decoded data:", decoded_data)

Saving the weights of the autoencoder

In [11]:
# Persist only the learned parameters (the state dict), not the module object.
trained_weights = autoencoder.state_dict()
torch.save(trained_weights, 'autoencoder.pth')