## Audio Embedding

In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LinearVAE(nn.Module):
    """Fully-connected variational autoencoder.

    The encoder emits ``2 * latent_dim`` values per sample, which are read as a
    mean vector and a log-variance vector. The decoder maps a latent sample
    back to feature space. The training objective is ``MSE + beta * KL``, where
    the KL term is taken against a zero-mean, unit-variance Gaussian.
    """

    def __init__(self, latent_dim=150, beta=0):
        super().__init__()
        self.latent_dim = latent_dim
        # Sub-modules are registered in this order on purpose (encoder, decoder,
        # criterion) so that children()/state_dict ordering stays stable.
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)
        self.criterion = torch.nn.MSELoss()
        self.beta = beta  # weight of the KL term in loss()

    def reparameterize(self, mu, log_var):
        """Sample ``mu + eps * sigma`` with ``eps ~ N(0, I)`` and ``sigma = exp(log_var / 2)``."""
        sigma = (0.5 * log_var).exp()
        noise = torch.randn_like(sigma)  # same shape/dtype/device as sigma
        return mu + (noise * sigma)

    def forward(self, x):
        """Encode ``x``, decode the sampled latent, and return ``(reconstruction, mu, log_var)``."""
        latent, mu, log_var = self.encode(x)
        return self.decoder(latent), mu, log_var

    def encode(self, x):
        """Return ``(z, mu, log_var)`` for ``x``; ``z`` is a fresh random sample on every call."""
        stats = self.encoder(x).reshape(-1, 2, self.latent_dim)
        # Index 0 along dim 1 holds the means, index 1 the log-variances.
        mu, log_var = stats.unbind(dim=1)
        return self.reparameterize(mu, log_var), mu, log_var

    def loss(self, x, rec, mu, log_var):
        """Compute the total loss and its parts.

        Returns:
            ``(total, parts)`` where ``total = rec_loss + beta * kl_loss`` is a
            tensor and ``parts`` is a dict with NumPy copies of ``rec_loss``
            and ``kl_loss`` for logging.
        """
        rec_loss = self.criterion(x, rec)

        # KL against N(0, I): kl_divergence works on log-sigma, i.e. log_var / 2.
        log_sigma = 0.5 * log_var
        prior_mu = torch.zeros_like(mu)
        prior_log_sigma = torch.zeros_like(log_sigma)
        kl_per_dim = kl_divergence(mu, log_sigma, prior_mu, prior_log_sigma)
        # Sum over latent dimensions, then average over the batch -> scalar.
        kl_loss = kl_per_dim.sum(dim=1).mean()

        parts = {
            'rec_loss': rec_loss.cpu().detach().numpy(),
            'kl_loss': kl_loss.cpu().detach().numpy(),
        }
        return rec_loss + self.beta * kl_loss, parts

class Encoder(nn.Module):
    """MLP that maps a feature vector to ``2 * latent_dim`` outputs.

    Args:
        latent_dim: size of the latent space; the network emits twice this
            many values per sample.
        input_dim: number of input features (default 11, the previously
            hard-coded width).
        hidden_dim: width of the two hidden layers (default 100, the
            previously hard-coded width).

    The layers live in ``self.net`` at the same indices as before, so
    checkpoints saved with the default sizes keep loading unchanged.
    """

    def __init__(self, latent_dim=150, input_dim=11, hidden_dim=100):
        super().__init__()
        self.latent_dim = latent_dim
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.net = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(in_features=hidden_dim, out_features=latent_dim * 2),
        )

    def forward(self, x):
        """Return the raw ``(batch, 2 * latent_dim)`` output of the MLP."""
        return self.net(x)
    
 
class Decoder(nn.Module):
    """MLP that maps a latent vector back to feature space.

    Args:
        latent_dim: size of the latent input.
        output_dim: number of reconstructed features (default 11, the
            previously hard-coded width).
        hidden_dim: width of the two hidden layers (default 100, the
            previously hard-coded width).

    The final ``Sigmoid`` keeps every output in ``[0, 1]``. The layers live in
    ``self.net`` at the same indices as before, so checkpoints saved with the
    default sizes keep loading unchanged.
    """

    def __init__(self, latent_dim=150, output_dim=11, hidden_dim=100):
        super().__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.net = nn.Sequential(
            nn.Linear(in_features=latent_dim, out_features=hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(in_features=hidden_dim, out_features=output_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the ``(batch, output_dim)`` reconstruction for latent input ``x``."""
        return self.net(x)
    
def kl_divergence(mu1, log_sigma1, mu2, log_sigma2):
    """Element-wise KL[p||q] for Gaussians p = (mu1, log_sigma1) and q = (mu2, log_sigma2).

    Inputs are means and log standard deviations; no reduction is applied, so
    the result has the broadcast shape of the arguments.
    """
    var_p = torch.exp(log_sigma1) ** 2
    var_q = torch.exp(log_sigma2) ** 2
    squared_mean_gap = (mu1 - mu2) ** 2
    log_sigma_ratio = log_sigma2 - log_sigma1
    return log_sigma_ratio + (var_p + squared_mean_gap) / (2 * var_q) - 0.5


### Train VAE

In [2]:
# Columns of the audio-feature tables that are fed to the models, in input order.
feature_columns = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

def load_data(filepaths):
    """Read tab-separated files and return them as one de-duplicated DataFrame.

    Args:
        filepaths: iterable of paths to tab-separated files. A single path
            passed as a ``str`` is treated as a one-element list instead of
            being iterated character by character.

    Returns:
        pandas.DataFrame with the rows of all files concatenated (each file's
        own row index is kept) and exact duplicate rows dropped. An empty
        DataFrame is returned when no paths are given.
    """
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    frames = []
    for filepath in filepaths:
        print(f'loading {filepath}..')
        frames.append(pd.read_csv(filepath, sep='\t'))

    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames).drop_duplicates()

def preprocessing_features(data):
    """Min-max scale the ``feature_columns`` of ``data`` to ``[0, 1]`` per column.

    Args:
        data: DataFrame containing every column named in ``feature_columns``.

    Returns:
        float ndarray of shape ``(len(data), len(feature_columns))``. The
        minimum and range are computed from ``data`` itself. A column whose
        values are all equal has range 0 and is mapped to all zeros rather
        than NaN.
    """
    feats = np.array(data[feature_columns], dtype=float)
    col_min = np.min(feats, axis=0)
    col_range = np.ptp(feats, axis=0)
    # Guard against 0/0 for constant columns: dividing by 1 leaves them at 0.
    col_range = np.where(col_range == 0, 1.0, col_range)
    feats = (feats - col_min) / col_range
    print(feats.shape, feats.dtype)
    return feats

In [3]:
# Tab-separated audio-feature files used to train the VAE.
filepaths = [
    'audio_features.txt',
    'audio_features2.txt',
    'audio_features3.txt',
]
# Directory that holds the saved model weights.
model_path = 'models/'

In [3]:
# Train the VAE on the min-max scaled audio features.
feats_df = load_data(filepaths)
feats = preprocessing_features(feats_df)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

batch_size = 64
epoch = 20      # number of passes over the data
beta = 0.001    # weight of the KL term in LinearVAE.loss

data_loader = torch.utils.data.DataLoader(feats, batch_size=batch_size, shuffle=True, num_workers=4)
model = LinearVAE(latent_dim=150, beta=beta).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

train_it = 0  # global step counter across epochs
for ep in range(epoch):
    print(f'Epoch {ep+1}/{epoch}')
    for batch in data_loader:
        # Cast and move the batch once; it is used for both the forward pass and the loss.
        batch = batch.float().to(device)

        opt.zero_grad()
        # Call the module (not .forward) so registered hooks are honoured.
        rec, mu, log_var = model(batch)
        total_loss, losses = model.loss(batch, rec, mu, log_var)
        total_loss.backward()
        opt.step()

        if train_it % 100 == 0:
            print("It {}: Total Loss: {}, \t Rec Loss: {},\t KL Loss: {}"
                  .format(train_it, total_loss, losses['rec_loss'], losses['kl_loss']))
        train_it += 1
print("Done!")

loading audio_features.txt..
loading audio_features2.txt..
loading audio_features3.txt..
(174506, 11) float64
cuda:0


  cpuset_checked))


Epoch 1/20
It 0: Total Loss: 0.11547081172466278, 	 Rec Loss: 0.11462929099798203,	 KL Loss: 0.8415201902389526
It 100: Total Loss: 0.04320327937602997, 	 Rec Loss: 0.03847368806600571,	 KL Loss: 4.729591369628906
It 200: Total Loss: 0.02804698795080185, 	 Rec Loss: 0.019029593095183372,	 KL Loss: 9.01739501953125
It 300: Total Loss: 0.026251042261719704, 	 Rec Loss: 0.017772559076547623,	 KL Loss: 8.478483200073242
It 400: Total Loss: 0.0231500044465065, 	 Rec Loss: 0.015054198913276196,	 KL Loss: 8.095804214477539
It 500: Total Loss: 0.024560749530792236, 	 Rec Loss: 0.016720639541745186,	 KL Loss: 7.840109825134277
It 600: Total Loss: 0.024955380707979202, 	 Rec Loss: 0.016731757670640945,	 KL Loss: 8.223621368408203
It 700: Total Loss: 0.02462039887905121, 	 Rec Loss: 0.016472183167934418,	 KL Loss: 8.148216247558594
It 800: Total Loss: 0.02466500923037529, 	 Rec Loss: 0.015490460209548473,	 KL Loss: 9.17454719543457
It 900: Total Loss: 0.02359658107161522, 	 Rec Loss: 0.0146433524

FileNotFoundError: ignored

In [5]:
# Make sure the target directory exists before writing the checkpoint;
# saving into a missing directory fails with FileNotFoundError.
os.makedirs(model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_path, 'vae.p'))

### Saving embeddings

In [14]:
# Read and rescale the audio features of the MPD_Large tracks.
data_path = 'Data/MPD_Large/'
mpd_feature_file = data_path + '/audio_features.txt'
feats_df = load_data([mpd_feature_file])
# NOTE(review): preprocessing_features rescales with this table's own min/range,
# not the statistics of the files the VAE was trained on — confirm that is intended.
feats = preprocessing_features(feats_df)
feats

loading Data/MPD_Large//audio_features.txt..
(70229, 11) float64


array([[0.62246964, 0.11411411, 0.72727273, ..., 0.16616617, 0.40423387,
        0.42732022],
       [0.39473684, 0.31031031, 0.45454545, ..., 0.37937938, 0.58870968,
        0.74964421],
       [0.5111336 , 0.25825826, 1.        , ..., 0.12012012, 0.64616935,
        0.59545437],
       ...,
       [0.67307692, 0.83783784, 1.        , ..., 0.41941942, 0.57056452,
        0.35915692],
       [0.55060729, 0.86086086, 0.36363636, ..., 0.37637638, 0.16229839,
        0.49420817],
       [0.66396761, 0.11311311, 0.18181818, ..., 0.07717718, 0.31350806,
        0.33188485]])

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# beta only enters LinearVAE.loss, which is not called here.
model = LinearVAE(latent_dim=150, beta=0.01).to(device)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_path+'/vae.p', map_location=device))
model.eval()

# Inference only: disable autograd so no graph is built for the whole feature table
# and the resulting tensors do not require grad.
with torch.no_grad():
    z, mu, log_var = model.encode(torch.Tensor(feats).to(device))

cuda:0


In [16]:
# Use a context manager so the file handle is flushed and closed deterministically.
# NOTE(review): z is pickled as-is; if it lives on the GPU, unpickling it will
# presumably need CUDA too — consider saving z.detach().cpu() if consumers are CPU-only.
with open(data_path + '/audio_embeddings.p', 'wb') as fh:
    pickle.dump(z, fh)

### Fine-tuning with genre classification

In [9]:
from genre_utils import *

# Inputs for the genre fine-tuning stage; both are read as tab-separated files.
# NOTE: this rebinds data_path, which the embedding cells above used for a directory.
features_path = 'audio_features.txt'  # per-track audio features
data_path = 'data.txt'                # track table with `spotify_id` and `genre` columns

In [10]:
class EncoderFC(nn.Module):
    """VAE encoder followed by a small multi-label classification head.

    Args:
        latent_dim: latent size of the wrapped ``Encoder``.
        n_class: number of output classes (one sigmoid score per class).
        encoder_weights: optional ``state_dict`` to initialise the encoder from.
        finetune: when True (default) the encoder is trained together with the
            head. When False the encoder parameters are frozen
            (``requires_grad=False``) so only the classifier head is updated.
            Previously this flag only toggled train/eval mode, which left the
            encoder weights trainable.
    """

    def __init__(self, latent_dim, n_class, encoder_weights=None, finetune=True):
        super().__init__()
        self.latent_dim = latent_dim
        self.encoder = Encoder(latent_dim)

        if encoder_weights:
            self.encoder.load_state_dict(encoder_weights)

        if finetune:
            self.encoder.train()
        else:
            self.encoder.eval()
            # eval() alone does not stop gradient updates; freeze the weights explicitly.
            self.encoder.requires_grad_(False)

        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(latent_dim, 100),
            nn.LeakyReLU(),
            nn.Linear(100, n_class)
        )
        self.sigm = nn.Sigmoid()

    def reparameterize(self, mu, log_var):
        """Sample ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(log_var / 2)``."""
        std = torch.exp(0.5*log_var)  # standard deviation
        eps = torch.randn_like(std)   # noise with the same shape as std
        sample = mu + (eps * std)
        return sample

    def encode(self, x):
        """Return a random latent sample ``z`` for ``x`` (a new draw on every call)."""
        x = self.encoder(x)
        x = x.reshape(-1, 2, self.latent_dim)
        mu = x[:, 0, :]       # index 0 along dim 1 holds the means
        log_var = x[:, 1, :]  # index 1 along dim 1 holds the log-variances
        z = self.reparameterize(mu, log_var)
        return z

    def forward(self, x):
        """Return per-class sigmoid scores of shape ``(batch, n_class)``."""
        z = self.encode(x)
        logits = self.classifier(z)
        return self.sigm(logits)

In [11]:
# Load the track table and the audio-feature table (both tab-separated).
data = pd.read_csv(data_path, sep='\t')
assert 'spotify_id' in data.columns
assert 'genre' in data.columns, 'need genre column where each song genre is in a|b|c|d format'
# astype returns a new Series; assign it back, otherwise the cast is silently discarded.
data['spotify_id'] = data['spotify_id'].astype(str)
feats_df = pd.read_csv(features_path, sep='\t')
print(len(data), len(feats_df))

Load data...


In [12]:
# Preprocess features and genres.
# Keep only the id and model input columns, rename the id to match `data`, and cast it
# to str. The cast is chained so its result is actually kept (a bare .astype(str)
# call returns a new Series and changes nothing).
feats_df = (
    feats_df[['id'] + feature_columns]
    .rename(columns={'id': 'spotify_id'})
    .astype({'spotify_id': str})
)
data = merge_df(data, feats_df)
feats = preprocessing_features(data)

# Split the '|'-separated genre strings and build the multi-label targets.
genres = get_genre_from_df(data, sep='|')
genre_list = get_genre_list(genres)
genre_onehot = create_multilabel_onehot(genres, genre_list)
# TODO(review): consider enabling this sanity check once confirmed to hold:
# assert len(feats) == len(genre_onehot), f'{len(feats)} != {len(genre_onehot)}'


merged length: 89156
(89156, 11) float64
{'unknown': 0, 'rock': 1, 'metal': 2, 'pop': 3, 'punk': 4, 'alternative': 5, 'post': 6, 'folk': 7, 'jazz': 8, 'rap': 9, 'electro': 10, 'soul': 11, 'melodic': 12, 'experimental': 13, 'death': 14, 'funk': 15, 'hop': 16, 'christian': 17, 'industrial': 18, 'indie': 19, 'hardcore': 20, 'house': 21, 'power': 22, 'noise': 23, 'new': 24, 'art': 25, 'progressive': 26, 'trap': 27, 'dance': 28, 'music': 29, 'hip': 30}
(89156, 31)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def calculate_metrics(pred, target, threshold=0.5):
    """Binarise ``pred`` at ``threshold`` and score it against ``target``.

    Args:
        pred: array of scores; entries ``>= threshold`` become 1.0, others 0.0.
        target: array of ground-truth labels, passed to ``accuracy_score`` as ``y_true``.
        threshold: decision threshold applied element-wise to ``pred``.

    Returns:
        dict with a single key ``'accuracy'`` holding the value returned by
        ``accuracy_score``.
        NOTE(review): for multi-label inputs sklearn documents this as
        exact-match (subset) accuracy — confirm that is the intended metric.
    """
    above_threshold = pred >= threshold
    hard_pred = np.array(above_threshold, dtype=float)
    score = accuracy_score(y_true=target, y_pred=hard_pred)
    return {'accuracy': score}

In [15]:
# Fine-tune the pretrained VAE encoder plus a classification head on genre labels.
batch_size = 64
epoch = 5
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Load model
# Load VAE encoder weights
print('Load model...')
model_vae = LinearVAE(latent_dim=150, beta=beta).to(device)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model_vae.load_state_dict(torch.load(model_path+'/vae.p', map_location=device))
model_ft = EncoderFC(150, len(genre_list), model_vae.encoder.state_dict())
model_ft.to(device)
model_ft.train()

dataset_ft = torch.utils.data.TensorDataset(torch.Tensor(feats), torch.Tensor(genre_onehot))
data_loader_ft = torch.utils.data.DataLoader(dataset_ft, batch_size=batch_size, shuffle=True, num_workers=1)
opt = torch.optim.Adam(model_ft.parameters(), lr=1e-3)
# BCELoss expects probabilities; EncoderFC.forward already applies a sigmoid.
criterion = torch.nn.BCELoss()

train_it = 0   # global step counter across epochs
results = []   # one metrics dict per epoch
for ep in range(epoch):
    print(f'Epoch {ep+1}/{epoch}')
    preds = []
    trues = []
    for batch_x, batch_y in data_loader_ft:
        opt.zero_grad()
        # Call the module (not .forward) so registered hooks are honoured.
        output = model_ft(batch_x.float().to(device))
        bce_loss = criterion(output, batch_y.float().to(device))
        bce_loss.backward()
        opt.step()

        # Collect this epoch's training-time predictions and targets for the metric below.
        preds.extend(output.cpu().detach().numpy())
        trues.extend(batch_y.cpu().detach().numpy())

        if train_it % 1000 == 0:
            print("It {}: Loss: {}".format(train_it, bce_loss))
        train_it += 1

    result = calculate_metrics(np.array(preds), np.array(trues))
    results.append(result)  # previously `results` was created but never filled
    print(result['accuracy'])
print("Done!")


cuda:0
Load model...
Epoch 1/5
It 0: Loss: 0.7155431509017944
It 1000: Loss: 0.15980269014835358
0.31477410381802684
Epoch 2/5
It 2000: Loss: 0.13862667977809906
0.3210215801516443
Epoch 3/5
It 3000: Loss: 0.1741984784603119
It 4000: Loss: 0.129915252327919
0.32249091480102293
Epoch 4/5
It 5000: Loss: 0.15029728412628174
0.32201983040961907
Epoch 5/5
It 6000: Loss: 0.14682887494564056
0.32360132800933195
Done!


In [16]:
# Make sure the target directory exists before writing the checkpoint;
# saving into a missing directory fails with FileNotFoundError.
os.makedirs(model_path, exist_ok=True)
torch.save(model_ft.state_dict(), os.path.join(model_path, 'audio_ft.p'))