# Жанровая классификация аудио

In [None]:
import json
import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import librosa

import torch
import torch.mps
import tensorflow as tf
import keras

import torchsummary as ts

from functools import reduce
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from keras.backend import clear_session
from keras.callbacks import TensorBoard, ModelCheckpoint, CSVLogger, BackupAndRestore

from utils import label2vec

%load_ext autoreload
%autoreload 2

Проверим, что GPU доступен для вычислений

In [None]:
# Report the devices TensorFlow can see and whether PyTorch's MPS backend is usable.
print(tf.config.list_physical_devices())
mps_backend = torch.backends.mps
print(f'MPS is available: {mps_backend.is_available()}')
print(f'MPS is built: {mps_backend.is_built()}')

## Фильтрация метаданных

In [None]:
from pathlib import Path

DATA_DIR = './data/fma_small'
METADATA_DIR = './data/fma_metadata/'

# The glob expects clips laid out as <DATA_DIR>/<subfolder>/<name>.mp3.
mp3_files = glob.glob(DATA_DIR + '/*/*.mp3')
# Path.stem drops the directory and the extension for any OS path separator
# (splitting on '/' fails for Windows-style paths returned by glob).
mp3_names = [np.int64(Path(f).stem) for f in mp3_files]

# Keep only metadata rows for tracks whose audio file is present on disk.
raw_tracks = pd.read_csv(METADATA_DIR + 'raw_tracks.csv')
tracks = raw_tracks[raw_tracks['track_id'].isin(mp3_names)]

## Сбор признаков, полученных с помощью `librosa`

В качестве метаданных для аудио будем использовать уже собранный набор данных

In [None]:
# Precomputed feature table: the first column is the row index and the first
# three header rows form a three-level column index.
features_df = pd.read_csv(
    METADATA_DIR + 'features.csv',
    index_col=0,
    header=[0, 1, 2],
)
has_audio = features_df.index.isin(mp3_names)
features_df = features_df[has_audio]

# The first level of every column key; np.unique also sorts the names.
features = np.unique([column[0] for column in features_df.columns])

print(f"Features available: {features}")
print(f"Total: {len(features)}")

features_df

## Отбор признаков

Рассмотрим всю имеющуюся информацию о треках

In [None]:
# List every metadata column available for the selected tracks.
tracks.columns

Оценим число непустых значений тегов

In [None]:
# Treat the literal string '[]' as "no tags", then count tagged vs. untagged rows.
tags = tracks['tags']
tags.where(tags != '[]').notnull().value_counts()

Подсчитаем число уникальных тегов

In [None]:
import ast

# Each 'tags' cell is parsed as a Python literal. literal_eval only accepts
# literals, so nothing stored in the CSV can be executed (unlike eval).
unique_tags = set()
for raw_tags in tracks['tags']:
  unique_tags.update(ast.literal_eval(raw_tags))
print(len(unique_tags))

Оставим предположительно полезную информацию из набора данных. Убедимся
в её необходимости позже.

In [None]:
# Metadata columns kept for the first pass over the data.
to_keep = [
  'track_id', 'album_id', 'artist_id', 'track_duration',
  'track_genres', 'track_instrumental', 'track_interest', 'track_listens',
]

# copy(): later cells assign into filtered_tracks; an independent frame keeps
# those writes from targeting a slice of `tracks` (SettingWithCopyWarning).
filtered_tracks = tracks[to_keep].copy()
filtered_tracks

Преобразуем время в секунды

In [None]:
def duration_to_int(t):
  """Convert a colon-separated duration string to whole seconds.

  Every field is treated as a base-60 digit, so "MM:SS" and "HH:MM:SS"
  are both converted correctly (a lone "SS" is accepted as well).

  Args:
    t: duration string such as "02:48" or "1:02:03".

  Returns:
    The duration in seconds as an int.

  Raises:
    ValueError: if any field is not an integer.
  """
  seconds = 0
  for field in t.split(":"):
    seconds = seconds * 60 + int(field)

  return seconds

# Rebuild the frame instead of writing through .loc: that avoids chained
# assignment on a derived frame and gives the column a proper integer dtype.
filtered_tracks = filtered_tracks.assign(
  track_duration=filtered_tracks['track_duration'].map(duration_to_int)
)
filtered_tracks

Узнаем количество жанров для треков

In [None]:
import ast

# The cells hold single-quoted list-of-dict literals. Parsing them as Python
# literals is robust to apostrophes inside values, which the previous
# quote-swap + json.loads approach was not.
genres = filtered_tracks['track_genres'].map(ast.literal_eval)
genre_ids = genres.map(lambda entries: [entry['genre_id'] for entry in entries])
# Distribution of the number of genres attached to a track.
genre_ids.map(len).value_counts()

Определим базовые жанры для каждого трека

In [None]:
all_genres = pd.read_csv(METADATA_DIR + 'genres.csv')

# One genre_id -> top_level lookup table instead of filtering the whole genres
# frame for every track. drop_duplicates keeps the first match, like iloc[0].
top_level_by_id = (
  all_genres.drop_duplicates('genre_id').set_index('genre_id')['top_level']
)

# The first genre listed for a track determines its base (top-level) genre.
base_genres = genre_ids.map(lambda ids: top_level_by_id.loc[int(ids[0])])

# assign() returns a new frame, so no chained assignment on a derived frame.
filtered_tracks = filtered_tracks.assign(track_genres=base_genres)
filtered_tracks

In [None]:
# Number of tracks per base genre.
base_genres.value_counts()

Получили 8 сбалансированных классов

In [None]:
def display_corr(df):
  """Draw the correlation matrix of `df` as a heatmap.

  Cells on and above the diagonal are masked, so only the strictly lower
  triangle is shown.
  """
  correlations = df.corr()
  hidden_cells = np.triu(np.ones_like(correlations, dtype=bool))
  palette = sns.diverging_palette(230, 20, as_cmap=True)
  sns.heatmap(correlations, cmap=palette, mask=hidden_cells)

display_corr(filtered_tracks)

Жанр трека очень плохо коррелирует с его длительностью, поэтому исключим
этот признак из рассмотрения

In [None]:
# Remove the duration column from the metadata used further on.
filtered_tracks = filtered_tracks.drop('track_duration', axis=1)

Теперь добавим значения, предпосчитанные с помощью `librosa`

In [None]:
# Inner-join the precomputed feature table with the track metadata on track_id.
merged = features_df.merge(filtered_tracks, how='inner', on='track_id')

# Heatmap of the pairwise correlations of the merged table.
display_corr(merged)

Конечно, признаков слишком много. Из всех возьмем признаки с наибольшей по
модулю корреляцией.

Для этого отсортируем признаки по степени корреляции

In [None]:
correlation = merged.corr()

# Rank every column by the absolute value of its correlation with the genre
# label, strongest first.
genre_column = correlation['track_genres']
genres_corr = genre_column.sort_values(key=np.abs, ascending=False)
genres_corr

Изобразим распределение значений корреляции

In [None]:
# Histogram of the per-column correlations with the genre label.
sns.histplot(genres_corr)

Видно, что большинство признаков имеет почти нулевую корреляцию с целевым.
В связи с этим оставим только наиболее информативные из них

In [None]:
boundary = 0.2

# Labels of the columns whose |correlation| with the genre exceeds the boundary.
informative = genres_corr[genres_corr.abs() > boundary].index
# track_id becomes the index below, so it must be kept even when its own
# correlation with the genre happens to fall under the boundary.
if 'track_id' not in informative:
  informative = informative.insert(0, 'track_id')

# copy(): later cells modify `selected` in place; an independent frame keeps
# those writes from targeting a slice of `merged`.
selected = merged[informative].copy()
selected = selected.set_index('track_id')

Кроме того, удалим сильно коррелирующие друг с другом нецелевые признаки,
оставив среди таких пар те, что больше коррелируют с целевым

In [None]:
c = selected.corr()

to_be_excluded = set()

boundary = 0.9

# Positional access keeps the loop independent of the column label type.
column_names = list(c.columns)
pairwise = c.abs().to_numpy()
# Informativeness is the *absolute* correlation with the target, matching how
# features were selected earlier; comparing signed values would wrongly
# discard strongly negatively correlated features.
target_strength = c['track_genres'].abs().to_numpy()

feature_positions = [
  pos for pos, name in enumerate(column_names) if name != 'track_genres'
]

# Visit each unordered pair once. On a tie only one feature of the pair is
# dropped (visiting both orders used to drop both).
for k, first in enumerate(feature_positions):
  for second in feature_positions[k + 1:]:
    if pairwise[first, second] > boundary:
      weaker = first if target_strength[first] < target_strength[second] else second
      to_be_excluded.add(column_names[weaker])

to_be_excluded

In [None]:
# Drop the redundant feature columns collected in to_be_excluded.
selected = selected.drop(columns=list(to_be_excluded))

Перекодируем метки классов

In [None]:
# Map the genre identifiers onto consecutive integer class labels.
genre_le = LabelEncoder()
encoded_genres = genre_le.fit_transform(selected['track_genres'])

selected['track_genres'] = encoded_genres
selected

In [None]:
# Convert every column label to its string form.
selected.columns = selected.columns.map(str)

In [None]:
# Standardize every feature column independently; the target is left as is.
# NOTE(review): the scalers are fitted on all rows before the train/test
# split, so test-set statistics leak into the features - confirm this is acceptable.
feature_columns = [name for name in selected.columns if name != 'track_genres']
for name in feature_columns:
  column_values = selected[name].to_numpy().reshape(-1, 1)
  selected[name] = StandardScaler().fit_transform(column_values)

Убедимся, что `StandardScaler` отработал корректно

In [None]:
# Summary statistics per column, used here to inspect the scaling result.
selected.describe()

In [None]:
# Persist the prepared table; the CNN section reads this file back.
selected.to_csv('data/selected.csv')

Разделим данные на обучающую, валидационную и тестовую выборки (`train/valid/test`)

In [None]:
x = selected.drop('track_genres', axis=1).to_numpy()
y = selected['track_genres'].to_numpy()

# Both sizes are fractions of the full dataset.
test_size = 0.2
valid_size = 0.1

# Carve out the test split first (uses test_size rather than a repeated
# literal, so changing the constant actually changes the split).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=69, stratify=y)

# The validation split is taken from the remainder, so its share is rescaled
# to stay valid_size of the full dataset.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=valid_size / (1 - test_size),
    random_state=69, stratify=y_train)

n_classes = np.max(y) + 1

In [None]:
# (estimator, framework tag) pairs collected for the final comparison.
models = []

### K-Nearest Neighbours

In [None]:
# Labels are 0-based, so the class count is the largest label plus one.
n_classes = np.max(y) + 1
# Candidate neighbour counts: 1, 6, 11, ... below 300.
list_of_neighbours = list(range(1, 300, 5))

Опишем функцию, которая будет отображать результаты экспериментов

In [None]:
def plot_score(n, scores, names):
    """Plot `scores` against the hyper-parameter values `n` as a line chart.

    Args:
        n: hyper-parameter values (x axis).
        scores: one score per value in `n` (y axis).
        names: name of the hyper-parameter, used as the x-axis column name.
    """
    frame = pd.DataFrame({names: n, 'score': scores})

    sns.set(style='darkgrid')
    sns.lineplot(data=frame, x=names, y='score')

Вычислим значения `accuracy` для моделей с разным числом соседей

In [None]:
best_n = -1
best_score = -1
scores = []
for n in tqdm(list_of_neighbours):
    knn = KNeighborsClassifier(p=1, n_neighbors=n)
    knn.fit(X_train, y_train)
    # score() predicts internally; the separate predict() call whose result
    # was never used only doubled the inference time.
    score = knn.score(X_test, y_test)
    scores.append(score)

    # Strict '>' keeps the smallest n among equally good candidates.
    if score > best_score:
        best_score = score
        best_n = n

plot_score(list_of_neighbours, scores, 'neighbors')
print(f'Лучшая модель: {best_n} соседей, точность: {best_score}')

Конечно, такой метод оценки качества модели не является надежным, лучше воспользоваться
оценкой методом кросс-валидации — `cross_val_score`.

В дальнейшем будем использовать именно этот метод.

In [None]:
best_n = -1
best_score = -1
scores = []

for n in tqdm(list_of_neighbours):
    knn = KNeighborsClassifier(p=2, n_neighbors=n)

    # Cross-validate on the training split only: the chosen model is later
    # scored on X_test, so tuning n on the full data would leak the test rows
    # into model selection.
    score = cross_val_score(knn, X_train, y_train, cv=5).mean()
    scores.append(score)

    # Strict '>' keeps the smallest n among equally good candidates.
    if score > best_score:
        best_score = score
        best_n = n

plot_score(list_of_neighbours, scores, 'neighbors')
print(f'Лучшая модель: {best_n} соседей, точность: {best_score}')

# Register an unfitted estimator with the best n for the final comparison.
models.append((KNeighborsClassifier(p=2, n_neighbors=best_n), 'sklearn'))

In [None]:
# Log-loss on the test split for every candidate neighbour count.
scores = []
for n in tqdm(list_of_neighbours):
    knn = KNeighborsClassifier(p=2, n_neighbors=n).fit(X_train, y_train)
    scores.append(log_loss(y_test, knn.predict_proba(X_test)))

plot_score(list_of_neighbours, scores, 'neighbors')

### SVC

#### C-svc

##### форма функции решения: one vs one

In [None]:
from sklearn import svm


scores = []
ps = list(range(1, 10))
for p in tqdm(ps):
    # `degree` only affects the polynomial kernel; with the default kernel it
    # is ignored and every iteration would train the same model.
    svc = svm.SVC(kernel='poly', degree=p, decision_function_shape='ovo')
    svc.fit(X_train, y_train)
    scores.append(svc.score(X_test, y_test))

plot_score(ps, scores, 'degree')

In [None]:
# Test accuracy of SVC (one-vs-one decision function) for C = 1..99.
Cs = list(range(1, 100))

scores = [
    svm.SVC(C=c, decision_function_shape='ovo')
       .fit(X_train, y_train)
       .score(X_test, y_test)
    for c in tqdm(Cs)
]

plot_score(Cs, scores, 'C-argument')

In [None]:
# Mean 5-fold cross-validation accuracy of the same SVC for every C.
scores = [
    cross_val_score(svm.SVC(C=c, decision_function_shape='ovo'), x, y, cv=5).mean()
    for c in tqdm(Cs)
]

plot_score(Cs, scores, 'C-argument')

In [None]:
from sklearn.multiclass import OneVsOneClassifier


# Same C sweep, with the one-vs-one scheme built by the explicit wrapper.
Cs = list(range(1, 100))

scores = []
for c in tqdm(Cs):
    pairwise_svc = OneVsOneClassifier(svm.SVC(C=c)).fit(X_train, y_train)
    scores.append(pairwise_svc.score(X_test, y_test))

plot_score(Cs, scores, 'C-argument')

In [None]:
# Mean 5-fold cross-validation accuracy of the wrapped one-vs-one SVC per C.
scores = [
    cross_val_score(OneVsOneClassifier(svm.SVC(C=c)), x, y, cv=5).mean()
    for c in tqdm(Cs)
]
plot_score(Cs, scores, 'C-argument')

##### форма функции решения: one vs rest

In [None]:
# Test accuracy per C for SVC with its default decision-function shape and
# tie breaking enabled.
scores = [
    svm.SVC(C=c, break_ties=True).fit(X_train, y_train).score(X_test, y_test)
    for c in tqdm(Cs)
]

plot_score(Cs, scores, 'C-argument')

In [None]:
from sklearn.multiclass import OneVsRestClassifier


# Test accuracy per C with the one-vs-rest scheme built by the explicit wrapper.
scores = []
for c in tqdm(Cs):
    rest_svc = OneVsRestClassifier(svm.SVC(C=c)).fit(X_train, y_train)
    scores.append(rest_svc.score(X_test, y_test))

plot_score(Cs, scores, 'C-argument')

In [None]:
# Cross-validated one-vs-rest sweep over a shorter grid, C = 1..49.
Cs = list(range(1, 50))
scores = [
    cross_val_score(OneVsRestClassifier(svm.SVC(C=c)), x, y, cv=5).mean()
    for c in tqdm(Cs)
]

plot_score(Cs, scores, 'C-argument')

#### $\nu$-svc

In [None]:
# Test accuracy of NuSVC for nu from 0.01 up to (but excluding) 0.5.
Nus = np.arange(0.01, 0.5, 0.01)
scores = [
    svm.NuSVC(nu=nu, decision_function_shape='ovo')
       .fit(X_train, y_train)
       .score(X_test, y_test)
    for nu in tqdm(Nus)
]

plot_score(Nus, scores, 'Nu-argument')

### Нейронные сети

#### Базовая модель

Возьмем в качестве модели многослойный перцептрон; для борьбы с переобучением
воспользуемся слоями `Dropout`. Кроме того, добавим между слоями
пакетную нормализацию (`BatchNormalization`), чтобы стабилизировать процесс обучения.

В качестве функции активации скрытых слоёв выберем `leaky_relu` (в первом слое — `relu`).

In [None]:
from keras.layers import Input, Dropout, Dense, BatchNormalization

# Drop any graph/state left over from a previous run of this cell.
clear_session()

lr = 0.001

model = keras.Sequential()
# `shape` must be a tuple of per-sample dimensions; a bare int is not
# accepted by every Keras version.
model.add(Input(shape=(X_train.shape[1],)))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(.3))

model.add(Dense(128, activation='leaky_relu'))
model.add(Dropout(.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='leaky_relu'))
# One output unit per class, derived from the data instead of a hard-coded 8.
model.add(Dense(int(n_classes), activation='softmax'))


# categorical_crossentropy expects one-hot encoded targets.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])

model.summary()

In [None]:
# Distinct class labels present in the training split.
np.unique(y_train)

In [None]:
def _encode_labels(labels):
  """Apply the encoder returned by label2vec(n_classes) to every label."""
  to_vector = label2vec(n_classes)
  return np.array([to_vector(label) for label in labels])

# Targets for the Keras model, one encoded vector per sample.
nn_y_train = _encode_labels(y_train)
nn_y_test = _encode_labels(y_test)
nn_y_valid = _encode_labels(y_valid)

In [None]:
# Name of the run; used as the directory prefix for the checkpoint.
run = input()

callbacks = [
  TensorBoard(),
  # Select the best weights on the *validation* metric; monitoring the
  # training accuracy would simply keep the most overfitted epoch.
  ModelCheckpoint(f'{run}/checkpoint/', save_best_only=True, save_weights_only=True,
                  monitor='val_categorical_accuracy', verbose=1),
  CSVLogger("logs.csv")
]

In [None]:
# Train for 200 epochs in batches of 128, validating on the validation split each epoch.
model.fit(X_train, nn_y_train, validation_data=(X_valid, nn_y_valid), epochs=200, callbacks=callbacks, batch_size=128)

In [None]:
# Restore the weights written by the ModelCheckpoint callback of this run,
# then evaluate them on the test split.
model.load_weights(f'{run}/checkpoint/')
model.evaluate(X_test, nn_y_test)

In [None]:
# Register the trained network for the final comparison.
models.append((model, 'keras'))

### Сравнение моделей, обученных на мета-данных

In [None]:
# scikit-learn estimators are stored unfitted, so they are fitted here before
# scoring; Keras models were already trained and are only evaluated.
for model, model_type in models:
  if model_type != 'sklearn':
    results = model.evaluate(X_test, nn_y_test)
  else:
    model.fit(X_train, y_train)
    results = model.score(X_test, y_test)
  print(f'Результат для {model.__class__.__name__}: {results}')

### Сверточная нейронная сеть

In [None]:
from loaders import AudioDataset
from torch.utils.data import DataLoader

In [None]:
selected = pd.read_csv('data/selected.csv')

train_size = 0.8
valid_size = 0.1

# Collect the per-class slices and concatenate once at the end: repeated
# concat onto an initially empty frame is quadratic and can degrade column
# dtypes to object.
train_parts, valid_parts, test_parts = [], [], []

# Classes are taken from the data instead of assuming exactly 8 labels.
for genre in np.sort(selected['track_genres'].unique()):
  cur = selected[selected['track_genres'] == genre]

  n = len(cur)
  train_end = int(train_size * n)
  valid_end = int((train_size + valid_size) * n)

  # Rows are split in file order (no shuffling), with the same proportions
  # inside every class.
  train_parts.append(cur.iloc[:train_end])
  valid_parts.append(cur.iloc[train_end:valid_end])
  test_parts.append(cur.iloc[valid_end:])

train = pd.concat(train_parts)
valid = pd.concat(valid_parts)
test = pd.concat(test_parts)


In [None]:
# Every split is built with the same audio settings; only the rows and the
# suffix differ.
dataset_kwargs = dict(sr=44100, win_length=1380, hop_length=345, data_dir=DATA_DIR)

test_dataset = AudioDataset(test, suffix='test', **dataset_kwargs)
val_dataset = AudioDataset(valid, suffix='val', **dataset_kwargs)
train_dataset = AudioDataset(train, suffix='train', **dataset_kwargs)


In [None]:
batch_size = 64
# All three loaders use the same batch size and shuffle their data.
loader_kwargs = dict(batch_size=batch_size, shuffle=True)

test_loader = DataLoader(test_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
train_loader = DataLoader(train_dataset, **loader_kwargs)

In [None]:
# Prefer CUDA, then Apple MPS, then fall back to the CPU. `empty_cache` is the
# matching cache-release function, or None on the CPU.
if torch.cuda.is_available():
  device, empty_cache = torch.device('cuda'), torch.cuda.empty_cache
elif torch.backends.mps.is_available():
  device, empty_cache = torch.device("mps"), torch.mps.empty_cache
else:
  device, empty_cache = torch.device("cpu"), None

print(f"Selected device: \"{device}\"")

In [None]:
from torchvision.models import resnet18

# Seed *before* building the network so the randomly initialised backbone,
# not only the new head, is reproducible.
torch.manual_seed(69)

model = resnet18()

# Width of the features entering the original classifier layer.
backbone_features = model.fc.in_features

# Replace the stock classifier with a small MLP head that ends in 8 outputs.
model.fc = torch.nn.Sequential(
  torch.nn.Dropout(p=0.2, inplace=True),

  torch.nn.Linear(in_features=backbone_features,
                  out_features=256),
  torch.nn.Dropout(p=0.2, inplace=True),
  torch.nn.ReLU(inplace=True),

  torch.nn.BatchNorm1d(256),

  torch.nn.Linear(in_features=256,
                  out_features=128),
  torch.nn.Dropout(p=0.2, inplace=True),
  torch.nn.ReLU(inplace=True),

  torch.nn.BatchNorm1d(128),

  torch.nn.Linear(in_features=128,
                  out_features=32),
  torch.nn.Dropout(p=0.2, inplace=True),

  torch.nn.Linear(in_features=32,
                  out_features=8))

ts.summary(model, (3, 640, 480))


In [None]:
# Name of the run; used below as the directory prefix for saved weights and logs.
run = input()

In [None]:
from utils import save_state, restore_state, DecayingCosineAnnealingLR, \
                  Logger

if device.type != 'cpu':
  empty_cache()

initial_lr = 1e-6
crit = torch.nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=initial_lr)

# Resume a previous run with this name, if restore_state finds one.
model, opt, last_epoch, loss, acc, best_loss, best_acc, train_progress = \
    restore_state(model, opt, run)

model.to(device)

lr_scheduler = DecayingCosineAnnealingLR(opt, T_max=5,
                                         eta_min=0.0001,
                                         last_epoch=last_epoch)

epochs = 100
n_batches = len(train_loader)
writer = Logger(f"logs/{run}")

for epoch in range(last_epoch + 1, epochs):
  model.train()

  print(f'Epoch {epoch + 1}/{epochs}')
  pbar = tf.keras.utils.Progbar(target=n_batches)

  lr = lr_scheduler.get_last_lr()[0]
  correct = 0
  samples = 0
  train_loss_sum = 0.0
  for i, (inputs, targets) in enumerate(train_loader):
    inputs = inputs.to(device)

    # Targets arrive as per-class score vectors; CrossEntropyLoss is fed class indices.
    targets = targets.argmax(dim=1).to(device)

    opt.zero_grad()
    outputs = model(inputs)

    train_loss = crit(outputs, targets)
    train_loss.backward()

    batch_loss = train_loss.item()
    correct += int(torch.sum(outputs.argmax(dim=1) == targets))
    samples += len(targets)
    train_loss_sum += batch_loss * len(targets)

    opt.step()
    pbar.update(i, values=[("loss", batch_loss),
                           ("acc", correct / samples),
                           ("lr", lr)])

    train_loader.dataset.unload()

  # NOTE(review): the scheduler is never stepped, so lr stays at its initial value.
  # lr_scheduler.step()
  model.eval()

  train_acc = correct / samples
  # Sample-weighted epoch mean instead of the loss of the last batch only.
  epoch_train_loss = train_loss_sum / samples

  with torch.no_grad():
    correct = 0
    samples = 0
    val_loss_sum = 0.0
    for inputs, targets in val_loader:
      inputs = inputs.to(device)

      targets = targets.argmax(dim=1)
      targets = targets.to(device)

      outputs = model(inputs)
      val_loss_sum += crit(outputs, targets).item() * len(targets)

      correct += int(torch.sum(outputs.argmax(dim=1) == targets))
      samples += len(targets)

  val_acc = correct / samples
  # Mean over the whole validation set; the val loader is shuffled, so the
  # last-batch loss was a random, noisy estimate.
  val_loss = val_loss_sum / samples

  writer.add(train_progress, loss=epoch_train_loss, acc=train_acc,
                             val_loss=val_loss, val_acc=val_acc,
                             epoch=epoch, lr=lr)

  # Compare against the best values seen so far. The previous code compared
  # against `loss`/`acc`, which were never updated, so "best" checkpoints were
  # overwritten by any epoch beating the restored values.
  improved_loss = best_loss is None or val_loss < best_loss
  improved_acc = best_acc is None or val_acc > best_acc
  if improved_loss:
    best_loss = val_loss
  if improved_acc:
    best_acc = val_acc
  # NOTE(review): assumes `loss`/`acc` in the saved state are the latest
  # validation metrics - confirm against utils.save_state.
  loss, acc = val_loss, val_acc

  save_state(model, opt, epoch, train_progress, loss, acc, best_loss, best_acc, run)

  if improved_loss:
    torch.save(model.state_dict(), f'{run}/best_loss.pt')

  if improved_acc:
    torch.save(model.state_dict(), f'{run}/best_acc.pt')

  pbar.update(n_batches, values=[("val_loss", val_loss),
                                 ("val_acc", val_acc)])

# Export both best checkpoints as TorchScript modules.
# load_state_dict needs the loaded dict (not a path), and scripted modules
# must be written with torch.jit.save so torch.jit.load can read them back.
model.eval()
for criterion_name in ('acc', 'loss'):
  state = torch.load(f'{run}/best_{criterion_name}.pt', map_location=device)
  model.load_state_dict(state)
  torch.jit.save(torch.jit.script(model), f'{run}/best_model_{criterion_name}.pt')

In [None]:
from loaders import retrieve_image, get_audio_by_id

In [None]:
model = torch.jit.load(f'{run}/best_model_loss.pt', map_location=device)
model.to(device)
model.eval()
batch_size = 256

n_rows = selected.shape[0]
# `selected` was re-read from CSV, so its index is a plain row counter; the
# audio must be looked up by the track_id column when it exists.
has_track_id_column = 'track_id' in selected.columns

batch_probs = []
failed_tracks = []

for start in range(0, n_rows, batch_size):
  print(f"Started processing batch {start // batch_size} of {n_rows // batch_size}")

  chunk = selected.iloc[start: start + batch_size]

  batch = []
  for index, row in tqdm(chunk.iterrows(), total=len(chunk)):
    track_id = int(row['track_id']) if has_track_id_column else index
    try:
      audio = get_audio_by_id(DATA_DIR, track_id)
      image = retrieve_image(audio, sr=44100, win_length=1380, hop_length=345,
                                    n_fft=2048, fmin=50, fmax=14000)
    except Exception as err:
      # Best effort: keep the row aligned with a blank image, but report the
      # failure instead of swallowing it silently.
      failed_tracks.append(track_id)
      print(f"Could not load track {track_id}: {err!r}; using a blank image")
      image = np.zeros((480, 640, 3))

    # Swap the first and last axes: (480, 640, 3) -> (3, 640, 480).
    batch.append(np.swapaxes(image, 0, 2))

  batch = np.array(batch, dtype=np.float32)
  with torch.no_grad():
    inputs = torch.tensor(batch, device=device)
    # Softmax over the class dimension (dim given explicitly).
    result = torch.softmax(model(inputs), dim=1)
  # One device-to-host transfer per batch. Works for the last, shorter batch
  # too, where indexing up to batch_size used to raise IndexError.
  batch_probs.append(result.cpu().numpy())

if failed_tracks:
  print(f"{len(failed_tracks)} track(s) fell back to a blank image")

# probs[i] holds the probability of class i for every row of `selected`.
probs = np.concatenate(batch_probs).T

for i in range(probs.shape[0]):
  selected[str(i)] = probs[i]