In [1]:
# Import the packages we will use after
import os
import copy
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import ast
import numpy as np

from zipfile import ZipFile
from google.colab import drive

import librosa
import librosa.display

plt.rcParams['figure.figsize'] = (17, 5)

from torch.utils.data import TensorDataset, DataLoader, Dataset
import torchvision.transforms as transforms
from torchvision import models
import torch.nn.functional as F
import torchaudio
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Load data from the google drive
drive.mount("/content/gdrive")
audio_data_path = "/content/gdrive/MyDrive/fma_small.zip"
metadata_path = "/content/gdrive/MyDrive/fma_metadata.zip"

with ZipFile(audio_data_path, 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

with ZipFile(metadata_path, 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

AUDIO_DIR = "fma_small"
METADATA_DIR = "fma_metadata"

def load(filepath):

    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))
            
        return tracks

Mounted at /content/gdrive


In [3]:
tracks = load(METADATA_DIR + os.sep + 'tracks.csv')
genres = load(METADATA_DIR + os.sep + 'genres.csv')
features = load(METADATA_DIR + os.sep + 'features.csv')
echonest = load(METADATA_DIR + os.sep + 'echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

In [65]:
# files with very short audio length
bad_file_indexes = [98565, 98567, 98569, 99134, 108925, 133297] 
print("tracks.shape: ", tracks.shape)
for idx in bad_file_indexes:
  tracks = tracks.drop(idx)

tracks.shape:  (106574, 52)


In [66]:
# Get small subset from meta data
small = tracks[tracks['set', 'subset'] <= 'small']
audio_idx = small.index
audio_labels = small[('track','genre_top')]

In [5]:
def get_audio_path(audio_dir, track_id):
    """
    Return the path to the mp3 given the directory where the audio is stored
    and the track ID.

    Examples
    --------
    >>> get_audio_path(AUDIO_DIR, 2)
    '../data/fma_small/000/000002.mp3'

    """
    tid_str = '{:06d}'.format(track_id)
    return os.path.join(audio_dir, tid_str[:3], tid_str + '.mp3')

In [7]:
hash_table = {'Hip-Hop':0, 'Pop':1, 'Folk':2, 'Rock':3, 'Experimental':4, 'International':5, 'Electronic':6, 'Instrumental':7}
def numeric_labels(audio_labels):
  for i in hash_table:
    audio_labels = audio_labels.replace(i, hash_table[i])
  return audio_labels

In [93]:
# Question 1
# Define a class in order to load data from the FMA database
# It will return a sample with a waveform and a label
class CustomAudioDataset(Dataset):
  def __init__(self, audio_dir, audio_labels):
    self.audio_dir = audio_dir
    self.audio_labels = audio_labels

  def __len__(self):
    return len(self.audio_labels)
  
  def __getitem__(self, idx):
    track_id = self.audio_labels.iloc[idx][0]
    label = self.audio_labels.iloc[idx][1]
    
    audio_path = get_audio_path(self.audio_dir, track_id)
    x, sample_rate = librosa.load(audio_path, sr = None, mono = True) # -> use librosa instead
    waveform = torch.tensor(x[None,:])
    transform = transforms.Compose([torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=4096, hop_length=4096//4), transforms.Resize((128, 1200))]) 
    melspectro = transform(waveform)

    return melspectro, label

In [94]:
# Question 2
# Create the train and test subsets from the small dataset
train_set = small[small['set','split'] == 'training'].reset_index()
test_set = small[small['set','split'] == 'test'].reset_index()

# Convert the genre name from string to digits
train_set_label = numeric_labels(train_set.loc[:, [('track_id', ''), ('track','genre_top')]])
test_set_label = numeric_labels(train_set.loc[:, [('track_id', ''), ('track','genre_top')]])

In [95]:
# Question 3
# Create datasets (train and test)
train_dataset = CustomAudioDataset(audio_dir=AUDIO_DIR, audio_labels=train_set_label)
test_dataset = CustomAudioDataset(audio_dir=AUDIO_DIR, audio_labels=test_set_label)

# Create dataloaders(train and test)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

"""
# Iterate through the DataLoader
train_features, train_label = next(iter(train_dataloader))
print(f"Feature batch shape:{train_features.size()}")
print(f"Label batch shape:{train_label.size()}")
"""

'\n# Iterate through the DataLoader\ntrain_features, train_label = next(iter(train_dataloader))\nprint(f"Feature batch shape:{train_features.size()}")\nprint(f"Label batch shape:{train_label.size()}")\n'

In [105]:
# Question 4
# 4.1 Define the model
class CNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.flatten = nn.Flatten()
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.conv2 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3)
    self.linear1 = nn.Linear(in_features=8940, out_features=64, bias=True)
    self.linear2 = nn.Linear(in_features=64, out_features=32, bias=True)
    self.linear3 = nn.Linear(in_features=32, out_features=16, bias=True)
    self.linear4 = nn.Linear(in_features=16, out_features=8, bias=True)
    self.softmax = nn.Softmax(dim=1)
  
  def forward(self, x):
    x = self.conv1(x)
    x = self.pool( F.relu(x) )
    x = self.conv2(x)
    x = self.pool( F.relu(x) )
    x = self.flatten(x)
    x = self.linear1(x)
    x = self.linear2(x)
    x = self.linear3(x)
    x = self.linear4(x)
    x = self.softmax(x)

    return x

net = CNN()

In [106]:
# 4.2 Test the methode forward with a sample to make sure it works
input,label = train_dataset.__getitem__(2)
predict = net.forward(input)
predict



tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.9836e-17, 3.0829e-44,
         1.0000e+00, 0.0000e+00]], grad_fn=<SoftmaxBackward0>)

In [107]:
# Question 5
# Define the train_optim and train loop
def train_optim(model, epochs, log_frequency, device):
  # We assume that the test set plays the role of a validation set

  model.to(device) # we make sure the model is on the proper device

  # Multiclass classification setting, we use cross-entropy
  # note that this implementation requires the logits as input 
  # logits: values prior softmax transformation 
  loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')

  learning_rate = 1e-4

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  
  for t in range(epochs):

      model.train() # we specify that we are training the model

      # At each epoch, the training set will be processed as a set of batches
      for batch_id,  batch in enumerate(train_dataloader) : 

        images, labels  = batch

        # we put the data on the same device
        images, labels = images.to(device), labels.to(device)  
        
        y_pred = model(images) # forward pass output=logits

        loss = loss_fn(y_pred, labels)

        if batch_id % log_frequency == 0:
            print("epoch: {:03d}, batch: {:03d}, loss: {:.3f} ".format(t+1, batch_id+1, loss.item()))

        optimizer.zero_grad() # clear the gradient before backward
        loss.backward()       # update the gradient

        optimizer.step() # update the model parameters using the gradient

      # Model evaluation after each step computing the accuracy
      model.eval()
      total = 0
      correct = 0
      for batch_id, batch in enumerate(test_dataloader):
        images , labels = batch
        images , labels = images.to(device), labels.to(device)
        y_pred = model(images) # forward computes the logits
        sf_y_pred = torch.nn.Softmax(dim=1)(y_pred) # softmax to obtain the probability distribution
        _, predicted = torch.max(sf_y_pred , 1)     # decision rule, we select the max
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
      
      print("[validation] accuracy: {:.3f}%\n".format(100 * correct / total))

In [None]:
# Question 6
# Start the train and test loops
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
train_optim(net, epochs=3, log_frequency=60, device=device)

In [117]:
# Question 7
# Make some predictions on some examples
input,label_real = train_dataset.__getitem__(3)
predict = net.forward(input)
print("The predict is: ")
print(predict)
print("Real label is: " + str(label_real))



The predict is: 
tensor([[1.4656e-38, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00]], grad_fn=<SoftmaxBackward0>)
Real label is: 2
