In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import matplotlib.pyplot as plt
import librosa
import librosa.display
from tqdm import tqdm
import json
import os
from sklearn.model_selection import train_test_split

In [2]:
seed = 1
torch.manual_seed(seed)

<torch._C.Generator at 0x129552150>

In [3]:
class ChordDetector(Dataset):
    def __init__(self, train:bool, chord_template:dict = json.load(open('./chord_templates.json')), data_location:str = './data/', sr = 44100, hop = 256):
        super(ChordDetector, self).__init__()
        self.chord_template = chord_template
        self.data = []
        self.sr = sr
        self.hop = hop
        for file in os.listdir(data_location):
            chord_true = torch.Tensor(self.chord_template[self._extract_chord_name(file)])
            y, sr = librosa.load(data_location+file, sr = sr)
            chroma = torch.Tensor(librosa.feature.chroma_cens(y=y, sr = sr, hop_length=hop)).T
            self.data.append((chroma, chord_true))
        X_train, X_test, _, _ = train_test_split(self.data, self.data, test_size=0.2, random_state=seed)
        if train:
            self.data = X_train
        else:
            self.data = X_test

    def _extract_chord_name(self, file):
        main = file[:file.index('-')]
        if file[file.index('-')+2]=='i':
            return main+'m'
        return main
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [4]:
train_data = ChordDetector(train=True)
test_data = ChordDetector(train=False)

In [5]:
train_loader = DataLoader(
    train_data, 
    batch_size=64,
    shuffle=True
)
test_loader = DataLoader(
    test_data, 
    batch_size=64,
    shuffle=True
)

In [6]:
class GRU(nn.Module):
    def __init__(self, input_size = 12, hidden_size = 100, num_layers = 1, num_classes = 12, bidirectional = True) -> None:
        super(GRU, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, bidirectional=bidirectional)
        if(bidirectional):
            self.fc = nn.Linear(hidden_size*2, num_classes)
        else:
            self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        if(self.bidirectional):
            h0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size)
        else:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.gru(x, h0)
        out = out[:,-1,:] # Since we only want the output of the last cell
        out = self.fc(out)
        return(out)

In [7]:
model = GRU()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, 0.995)

In [8]:
EPOCHS = 100
device = torch.device('cpu')

In [9]:
for epoch in range(EPOCHS):
    for i, (chroma, labels) in enumerate(train_loader):
        chroma = chroma.to(device)
        labels = labels.to(device)

        outputs = model(chroma)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
    print(f'Epoch {epoch+1}/{EPOCHS}; Loss: {loss.item():.4f}')

Epoch 1/100; Loss: 7.5137
Epoch 2/100; Loss: 7.4233
Epoch 3/100; Loss: 7.1956
Epoch 4/100; Loss: 6.9799
Epoch 5/100; Loss: 6.7120
Epoch 6/100; Loss: 6.7776
Epoch 7/100; Loss: 6.7541
Epoch 8/100; Loss: 6.7944
Epoch 9/100; Loss: 6.3630
Epoch 10/100; Loss: 6.7973
Epoch 11/100; Loss: 6.3375
Epoch 12/100; Loss: 5.4183
Epoch 13/100; Loss: 5.3071
Epoch 14/100; Loss: 4.7477
Epoch 15/100; Loss: 4.4834
Epoch 16/100; Loss: 4.3284
Epoch 17/100; Loss: 4.0348
Epoch 18/100; Loss: 4.1537
Epoch 19/100; Loss: 3.7965
Epoch 20/100; Loss: 3.9470
Epoch 21/100; Loss: 3.8282
Epoch 22/100; Loss: 3.7731
Epoch 23/100; Loss: 3.6572
Epoch 24/100; Loss: 3.6196
Epoch 25/100; Loss: 3.4905
Epoch 26/100; Loss: 3.4854
Epoch 27/100; Loss: 3.4821
Epoch 28/100; Loss: 3.4250
Epoch 29/100; Loss: 3.3878
Epoch 30/100; Loss: 3.3859
Epoch 31/100; Loss: 3.3904
Epoch 32/100; Loss: 3.3512
Epoch 33/100; Loss: 4.1166
Epoch 34/100; Loss: 3.4996
Epoch 35/100; Loss: 3.5634
Epoch 36/100; Loss: 3.5258
Epoch 37/100; Loss: 3.4822
Epoch 38/1

In [10]:
def predict(model, audio, chroma_req = True, chord_templates:dict = json.load(open('./chord_templates.json')), sr = 44100, hop = 256):
    if chroma_req:
        chroma = torch.Tensor(librosa.feature.chroma_cens(y=audio, sr = sr, hop_length=hop)).T.unsqueeze(0)
    else:
        chroma = audio
    with torch.no_grad():
        outputs = nn.functional.softmax(model(chroma), 1)[0]
    min_val = 120
    min_key = ''
    for key, val in chord_templates.items():
        out = torch.norm(torch.Tensor(val) - outputs)
        if min_val >= out:
            min_val = out
            min_key = key
    return min_key

In [11]:
y = librosa.load('./data/F#-Minor-9.wav')[0]
predict(model, y)

'F#m'

In [12]:
def extract_chord_name(file):
    main = file[:file.index('-')]
    if file[file.index('-')+2]=='i':
        return main+'m'
    return main

In [13]:
chord_templates:dict = json.load(open('./chord_templates.json'))

In [14]:
count = 0
for y in tqdm(test_data):
    pred = predict(model, y[0].unsqueeze(0), False)
    min_val = 120
    min_key = ''
    for key, val in chord_templates.items():
        out = torch.norm(torch.Tensor(val) - y[1])
        if min_val >= out:
            min_val = out
            min_key = key
    count+=(pred==min_key)
print(f"Accuracy = {100*count/len(test_data):.2f}%")

  0%|          | 0/58 [00:00<?, ?it/s]

100%|██████████| 58/58 [00:01<00:00, 29.95it/s]

Accuracy = 94.83%





In [15]:
torch.save(model.state_dict(), './models/chord_detector.pth')