# Жанровая классификация

In [83]:
import numpy as np
import pandas as pd
import seaborn as sns
import keras
import matplotlib.pyplot as plt
import librosa
import tensorflow as tf

In [84]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

## Фильтрация метаданных

In [85]:
import glob


DATA_DIR = './data/fma_small/'
METADATA_DIR = './data/fma_metadata/'

mp3_files = glob.glob(DATA_DIR + '*/*.mp3')
mp3_names = list(map(lambda f: np.int64(f.split('/')[-1].split('.')[0]), mp3_files))

raw_tracks = pd.read_csv(METADATA_DIR + 'raw_tracks.csv')
tracks = raw_tracks[raw_tracks['track_id'].isin(mp3_names)]

tracks

Unnamed: 0,track_id,album_id,album_title,album_url,artist_id,artist_name,artist_url,artist_website,license_image_file,license_image_file_large,...,track_information,track_instrumental,track_interest,track_language_code,track_listens,track_lyricist,track_number,track_publisher,track_title,track_url
0,2,1.0,AWOL - A Way Of Life,http://freemusicarchive.org/music/AWOL/AWOL_-_...,1,AWOL,http://freemusicarchive.org/music/AWOL/,http://www.AzillionRecords.blogspot.com,http://i.creativecommons.org/l/by-nc-sa/3.0/us...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,4656,en,1293,,3,,Food,http://freemusicarchive.org/music/AWOL/AWOL_-_...
2,5,1.0,AWOL - A Way Of Life,http://freemusicarchive.org/music/AWOL/AWOL_-_...,1,AWOL,http://freemusicarchive.org/music/AWOL/,http://www.AzillionRecords.blogspot.com,http://i.creativecommons.org/l/by-nc-sa/3.0/us...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,1933,en,1151,,6,,This World,http://freemusicarchive.org/music/AWOL/AWOL_-_...
3,10,6.0,Constant Hitmaker,http://freemusicarchive.org/music/Kurt_Vile/Co...,6,Kurt Vile,http://freemusicarchive.org/music/Kurt_Vile/,http://kurtvile.com,http://i.creativecommons.org/l/by-nc-nd/3.0/88...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,54881,en,50135,,1,,Freeway,http://freemusicarchive.org/music/Kurt_Vile/Co...
15,140,61.0,The Blind Spot,http://freemusicarchive.org/music/Alec_K_Redfe...,54,Alec K. Redfearn & the Eyesores,http://freemusicarchive.org/music/Alec_K_Redfe...,http://www.aleckredfearn.com,http://i.creativecommons.org/l/by-nc-nd/3.0/us...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,1593,en,1299,,2,,Queen Of The Wires,http://freemusicarchive.org/music/Alec_K_Redfe...
16,141,60.0,Every Man For Himself,http://freemusicarchive.org/music/Alec_K_Redfe...,54,Alec K. Redfearn & the Eyesores,http://freemusicarchive.org/music/Alec_K_Redfe...,http://www.aleckredfearn.com,http://i.creativecommons.org/l/by-nc-nd/3.0/us...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,839,en,725,,4,,Ohio,http://freemusicarchive.org/music/Alec_K_Redfe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108843,154308,22780.0,Journey,http://freemusicarchive.org/music/Fleslit/Jour...,23208,Fleslit,http://freemusicarchive.org/music/Fleslit/,https://soundcloud.com/fleslit,http://i.creativecommons.org/l/by/4.0/88x31.png,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,3371,,2705,,17,,MIA,http://freemusicarchive.org/music/Fleslit/Jour...
108844,154309,22780.0,Journey,http://freemusicarchive.org/music/Fleslit/Jour...,23208,Fleslit,http://freemusicarchive.org/music/Fleslit/,https://soundcloud.com/fleslit,http://i.creativecommons.org/l/by/4.0/88x31.png,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,4525,,3589,,18,,A1 Symphony,http://freemusicarchive.org/music/Fleslit/Jour...
108945,154413,22789.0,Live at WFMU for Dark Night of the Soul wtih J...,http://freemusicarchive.org/music/Tasseomancy/...,24252,Tasseomancy,http://freemusicarchive.org/music/Tasseomancy/,https://tasseomancy.bandcamp.com/,http://i.creativecommons.org/l/by-nc-nd/4.0/88...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,809,,676,,9,,Do Easy,http://freemusicarchive.org/music/Tasseomancy/...
108946,154414,22789.0,Live at WFMU for Dark Night of the Soul wtih J...,http://freemusicarchive.org/music/Tasseomancy/...,24252,Tasseomancy,http://freemusicarchive.org/music/Tasseomancy/,https://tasseomancy.bandcamp.com/,http://i.creativecommons.org/l/by-nc-nd/4.0/88...,http://fma-files.s3.amazonaws.com/resources/im...,...,,0,851,,788,,10,,Dead Can Dance (uncensored),http://freemusicarchive.org/music/Tasseomancy/...


## Сбор признаков, полученных с помощью `librosa`

In [86]:
features_df = pd.read_csv(METADATA_DIR + 'features.csv', index_col=0, header=[0, 1, 2])
features_df = features_df[features_df.index.isin(mp3_names)]

features = np.unique(list(map(lambda x: x[0], list(features_df.columns))))

print(f"Features available: {features}")
print(f"Total: {len(features)}")

features_df

Features available: ['chroma_cens' 'chroma_cqt' 'chroma_stft' 'mfcc' 'rmse'
 'spectral_bandwidth' 'spectral_centroid' 'spectral_contrast'
 'spectral_rolloff' 'tonnetz' 'zcr']
Total: 11


feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
140,0.533579,-0.623885,-1.086205,-1.081079,-0.765151,-0.072282,-0.882913,-0.582376,-0.884749,-0.645214,...,0.157683,0.028070,0.025946,11.052547,0.379395,0.052379,0.036621,0.001953,3.143968,0.057712
141,0.172898,-0.284804,-1.169662,-1.062855,-0.706868,-0.708281,-0.204884,0.023624,-0.642770,-0.786291,...,0.145994,0.024342,0.032111,32.994659,0.415527,0.040267,0.034668,0.002930,4.204097,0.028665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154308,-0.677709,-0.830608,-0.686122,0.185158,2.854806,-1.131797,-1.161222,-1.110237,-0.818111,-1.388484,...,0.117287,0.021906,0.021999,53.807804,0.769043,0.042997,0.022949,0.001953,6.928385,0.084298
154309,-0.585059,-0.635075,-0.194742,-0.434809,-1.357310,-0.257745,-0.639082,-0.572640,-1.225242,5.548715,...,0.180397,0.029303,0.023771,12.597441,0.768555,0.082979,0.037109,0.008789,3.441251,0.129480
154413,-0.214509,-1.130469,0.718534,-0.368448,-0.147830,-0.099409,-1.325709,-0.105248,-1.363881,1.229534,...,0.156587,0.036926,0.038113,25.368595,0.323242,0.024532,0.018066,0.000977,3.736646,0.023821
154414,-0.487371,-0.923754,-0.283099,-0.435221,-1.137329,-0.798039,-0.258168,1.004049,-0.499121,0.746973,...,0.181294,0.026723,0.030980,21.276468,0.511230,0.046116,0.033691,0.003418,3.997052,0.045733


## Отбор признаков

Рассмотрим всю имеющуюся информацию о треках

In [87]:
tracks.columns

Index(['track_id', 'album_id', 'album_title', 'album_url', 'artist_id',
       'artist_name', 'artist_url', 'artist_website', 'license_image_file',
       'license_image_file_large', 'license_parent_id', 'license_title',
       'license_url', 'tags', 'track_bit_rate', 'track_comments',
       'track_composer', 'track_copyright_c', 'track_copyright_p',
       'track_date_created', 'track_date_recorded', 'track_disc_number',
       'track_duration', 'track_explicit', 'track_explicit_notes',
       'track_favorites', 'track_file', 'track_genres', 'track_image_file',
       'track_information', 'track_instrumental', 'track_interest',
       'track_language_code', 'track_listens', 'track_lyricist',
       'track_number', 'track_publisher', 'track_title', 'track_url'],
      dtype='object')

Оценим число непустых значений тегов

In [88]:
tracks['tags'].map(lambda x: None if x == '[]' else x).notnull().value_counts()

False    6639
True     1361
Name: tags, dtype: int64

Подсчитаем число уникальных тегов

In [89]:
from functools import reduce


unique_tags = reduce(lambda tags, l: tags.union(eval(l)), tracks['tags'], set())  
print(len(unique_tags))

1191


Оставим предположительно полезную информацию из набора данных. Убедимся
в её необходимости позже.

In [90]:
to_keep = [
  'track_id', "album_id", "artist_id", "track_duration", 
  "track_genres", "track_instrumental", "track_interest", "track_listens",
]

filtered_tracks = tracks[to_keep]
filtered_tracks

Unnamed: 0,track_id,album_id,artist_id,track_duration,track_genres,track_instrumental,track_interest,track_listens
0,2,1.0,1,02:48,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,4656,1293
2,5,1.0,1,03:26,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,1933,1151
3,10,6.0,6,02:41,"[{'genre_id': '10', 'genre_title': 'Pop', 'gen...",0,54881,50135
15,140,61.0,54,04:13,"[{'genre_id': '17', 'genre_title': 'Folk', 'ge...",0,1593,1299
16,141,60.0,54,03:02,"[{'genre_id': '17', 'genre_title': 'Folk', 'ge...",0,839,725
...,...,...,...,...,...,...,...,...
108843,154308,22780.0,23208,03:14,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,3371,2705
108844,154309,22780.0,23208,02:42,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,4525,3589
108945,154413,22789.0,24252,05:49,"[{'genre_id': '76', 'genre_title': 'Experiment...",0,809,676
108946,154414,22789.0,24252,05:46,"[{'genre_id': '76', 'genre_title': 'Experiment...",0,851,788


Преобразуем время в секунды

In [91]:
def duration_to_int(t):
  splitted = t.split(":")
  
  return int(splitted[0]) * 60 + int(splitted[1])

filtered_tracks.loc[:,'track_duration'] = filtered_tracks.track_duration.apply(duration_to_int)
filtered_tracks

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tracks.loc[:,'track_duration'] = filtered_tracks.track_duration.apply(duration_to_int)
  filtered_tracks.loc[:,'track_duration'] = filtered_tracks.track_duration.apply(duration_to_int)


Unnamed: 0,track_id,album_id,artist_id,track_duration,track_genres,track_instrumental,track_interest,track_listens
0,2,1.0,1,168,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,4656,1293
2,5,1.0,1,206,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,1933,1151
3,10,6.0,6,161,"[{'genre_id': '10', 'genre_title': 'Pop', 'gen...",0,54881,50135
15,140,61.0,54,253,"[{'genre_id': '17', 'genre_title': 'Folk', 'ge...",0,1593,1299
16,141,60.0,54,182,"[{'genre_id': '17', 'genre_title': 'Folk', 'ge...",0,839,725
...,...,...,...,...,...,...,...,...
108843,154308,22780.0,23208,194,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,3371,2705
108844,154309,22780.0,23208,162,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ...",0,4525,3589
108945,154413,22789.0,24252,349,"[{'genre_id': '76', 'genre_title': 'Experiment...",0,809,676
108946,154414,22789.0,24252,346,"[{'genre_id': '76', 'genre_title': 'Experiment...",0,851,788


Узнаем количество жанров для треков

In [92]:
import json


genres = filtered_tracks['track_genres'].map(lambda x: json.loads(x.replace("'", "\"")))
genre_ids = genres.map(lambda x: list(map(lambda y: y['genre_id'], x)))
genre_ids.map(lambda x: len(x)).value_counts()

1    4256
2    2362
3    1304
4      41
5      32
6       5
Name: track_genres, dtype: int64

Определим базовые жанры для каждого трека

In [93]:
all_genres = pd.read_csv(METADATA_DIR + 'genres.csv')

base_genres = genre_ids.map(lambda x: all_genres[all_genres.genre_id == int(x[0])].iloc[0].top_level)

filtered_tracks['track_genres'] = base_genres
filtered_tracks

In [None]:
base_genres.value_counts()

Получили 8 сбалансированных классов

In [None]:
import seaborn as sns

def display_corr(df):
  corr = df.corr()
  cmap = sns.diverging_palette(230, 20, as_cmap=True)
  mask = np.triu(np.ones_like(corr, dtype=bool))
  sns.heatmap(corr, mask=mask, cmap=cmap)
  
display_corr(filtered_tracks)

Жанр трека очень плохо коррелирует с его длительностью, поэтому исключим
этот признак из рассмотрения

In [None]:
filtered_tracks = filtered_tracks.drop('track_duration', axis=1)

Теперь добавим значения, предпосчитанные с помощью `librosa`

In [None]:
merged = features_df.merge(filtered_tracks, how='inner', on='track_id')

display_corr(merged)

Конечно, признаков слишком много. Из всех возьмем признаки с наибольшей по
модулю корреляцией.

Для этого отсортируем признаки по степени корреляции

In [None]:
correlation = merged.corr()

genres_corr = correlation['track_genres'].sort_values(key=lambda x: np.abs(x), ascending=False)
genres_corr

Изобразим распределение значений корреляции

In [None]:
sns.histplot(genres_corr)

Видно, что наибольшее число признаков имеют почти нулевую корреляцию.
В связи с этим выберем наиболее информативные из них

In [None]:
BOUNDARY = 0.2

selected = merged[genres_corr[abs(genres_corr) > BOUNDARY].reset_index()['index']]
selected

Перекодируем метки классов

In [None]:
from sklearn.preprocessing import LabelEncoder


genre_le = LabelEncoder()

selected.track_genres = genre_le.fit_transform(selected.track_genres)
selected

In [None]:
selected.columns = selected.columns.map(str)

In [None]:
from sklearn.preprocessing import StandardScaler

for column in selected.columns:
  if column == 'track_genres':
    continue
  selected[column] = StandardScaler().fit_transform(selected[column].to_numpy().reshape(-1, 1))

Убедимся, что `StandardScaler` отработал корректно

In [None]:
selected.describe()

## KNN

In [None]:
from sklearn.model_selection import train_test_split


X = selected.drop('track_genres', axis=1)
y = selected['track_genres']

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=69)

In [None]:
def plot_score(n, scores):
    d = {'neighbors': n, 'score': scores}
    df = pd.DataFrame(d)

    sns.set(style='darkgrid')
    sns.lineplot(x='neighbors', y='score', data=df)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm


ns = list(map(int, range(1, 100, 1)))
scores = []
for n in tqdm(ns):
    knn = KNeighborsClassifier(p=1, n_neighbors=n)
    knn.fit(X_train, y_train)
    Y_pred = knn.predict(X_test)
    scores.append(knn.score(X_test, y_test))

plot_score(ns, scores)

In [None]:
from sklearn.model_selection import cross_val_score


scores = []

for n in tqdm(ns):
    knn = KNeighborsClassifier(p=1, n_neighbors=n)
    scores.append(cross_val_score(knn, X, y, cv=5).mean())

plot_score(ns, scores)

In [None]:
from sklearn.metrics import log_loss


scores = []
for n in tqdm(ns):
    knn = KNeighborsClassifier(p=1, n_neighbors=n)
    knn.fit(X_train, y_train)
    d2_pred = knn.predict_proba(X_test)
    max_ind = np.argmax(d2_pred, axis=1)
    d2_pred.fill(0)
    for ind in range(0, len(max_ind)):
        d2_pred[ind][max_ind[ind]] = 1
    logl = log_loss(y_test, d2_pred, eps= 1e-10)
    # print(logl)
    scores.append(logl)

plot_score(ns, scores)

In [None]:
scores = []
for n in tqdm(ns):
    knn = KNeighborsClassifier(p=1, n_neighbors=n)
    cvp = cross_val_score(knn, X, y, cv=5, scoring='neg_log_loss')
    # print(cvp)
    scores.append(cvp.mean()) 
    # print(scores)
    
plot_score(ns, scores)