In [1]:
import kagglehub

# Fetch the latest version of the Kaggle dataset; the returned value is the
# local directory holding the downloaded files.
path = kagglehub.dataset_download(
    "maltegrosse/8-m-spotify-tracks-genre-audio-features"
)

print(f"Path to dataset files: {path}")

Path to dataset files: /Users/ilya/.cache/kagglehub/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features/versions/1


In [2]:
import pandas as pd
import sqlite3

# Location of the SQLite database inside the downloaded dataset directory.
db_path = f"{path}/spotify.sqlite"

In [3]:
def _decode_utf8_lossy(raw):
    """Decode raw bytes as UTF-8, silently dropping bytes that cannot be decoded."""
    return raw.decode(encoding='utf-8', errors='ignore')


con = sqlite3.connect(db_path)
# Presumably some text values in the database are not valid UTF-8, hence the
# lossy decoder — TODO confirm.
con.text_factory = _decode_utf8_lossy
cur = con.cursor()

# List every table the database contains.
table_rows = cur.execute("SELECT name FROM sqlite_master  WHERE type='table';").fetchall()
print(table_rows)

[('albums',), ('artists',), ('audio_features',), ('genres',), ('r_albums_artists',), ('r_albums_tracks',), ('r_artist_genre',), ('r_track_artist',), ('tracks',)]


### Загрузка данных из базы Kaggle. Можно пропустить, если есть сохраненный файл.

In [None]:
def _read_table(query):
    """Run ``query`` on the open SQLite connection and return the result as a DataFrame."""
    return pd.read_sql_query(query, con)


# Relation tables (album<->track, album<->artist) and the entity tables they link.
rAlbumsTracks = _read_table("SELECT * FROM r_albums_tracks")
albums = _read_table("SELECT * from albums")
artists = _read_table("SELECT * from artists")
audio_features = _read_table("SELECT * from audio_features")
# NOTE(review): `tracks` is not referenced by the cells visible in this section —
# confirm it is needed before paying for the load.
tracks = _read_table("SELECT * from tracks")
rAlbumsArtists = _read_table("SELECT * from r_albums_artists")

In [None]:
# Most popular albums first.
albums = albums.sort_values(by='popularity', ascending=False)

# Album <-> artist pairs enriched with album attributes; after sorting by
# popularity, keep one row per (album name, artist) pair — the first one.
albums_merge_artists = (
    rAlbumsArtists
    .merge(albums, left_on='album_id', right_on='id')
    .drop(columns=['id'])
    .sort_values(by='popularity', ascending=False)
    .rename(columns={'name': 'albumName'})
    .drop_duplicates(subset=['albumName', 'artist_id'])
)

# Singles are excluded before tracks are attached.
is_not_single = albums_merge_artists['album_type'] != 'single'

# Track rows joined with album info (on the columns the two frames share),
# audio features and the artist name, ordered by popularity.
MergedData = (
    rAlbumsTracks
    .merge(albums_merge_artists[is_not_single])
    .merge(audio_features, left_on='track_id', right_on='id')
    .drop(columns=['id'])
    .merge(artists[['name', 'id']], left_on='artist_id', right_on='id')
    .drop(columns=['id'])
    .rename(columns={'name': 'artistName'})
    .sort_values(by=['popularity'], ascending=False)
)

In [None]:
# Lower bound for release_date in epoch milliseconds (the unit used by the
# conversion below); -3e12 ms is roughly the end of 1874. Earlier values are
# presumably placeholder/garbage dates — TODO confirm the intended cutoff.
MIN_RELEASE_DATE_MS = -3_000_000_000_000

# .copy() detaches the filtered rows from the original frame so the column
# assignment below is unambiguous and does not trigger SettingWithCopyWarning.
MergedData = MergedData[MergedData['release_date'] > MIN_RELEASE_DATE_MS].copy()
MergedData['release_year'] = pd.to_datetime(MergedData['release_date'], unit='ms').dt.year

In [None]:
import pickle

def storeData(df):
    """Pickle ``df`` to the file ``MergedData`` in the current working directory.

    The file is opened in write mode, so every call replaces the previous
    snapshot. In append mode repeated calls would stack several pickles in
    one file, and a single ``pickle.load`` only returns the first (oldest) one.

    Args:
        df: the object to persist (any picklable value).
    """
    # The context manager closes the handle even if pickling raises.
    with open('MergedData', 'wb') as dbfile:
        pickle.dump(df, dbfile)
# Save the merged frame to disk via storeData.
storeData(MergedData)

### Загрузка данных из сохраненного файла.

In [42]:
import pickle
    
def loadData():
    """Load and return the object pickled in the file ``MergedData``.

    The file is looked up in the current working directory; only the first
    pickled object stored in it is read.

    NOTE(review): unpickling can execute arbitrary code — only load a file
    that this notebook produced itself.

    Returns:
        The unpickled object.
    """
    # Binary mode is required for pickle; ``with`` closes the file even when
    # unpickling fails.
    with open('MergedData', 'rb') as dbfile:
        return pickle.load(dbfile)

In [43]:
# Restore the merged frame from the saved pickle file.
MergedData = loadData()

In [44]:
# Sanity check: (rows, columns) of the loaded frame.
MergedData.shape

(9182239, 24)

### Кодируем категориальные признаки и пробуем порекомендовать похожие треки

Мой ноут долго подгружает все-все данные, поэтому для эксперимента ниже захвачу только пару миллионов строк.

In [45]:
# Work on a reproducible random subset (fixed random_state) to keep the
# experiment fast. The size is clamped so the cell also works when fewer rows
# are available instead of raising.
SAMPLE_SIZE = int(2e6)
MergedData = MergedData.sample(min(SAMPLE_SIZE, len(MergedData)), random_state=42)

In [46]:
# Sanity check: (rows, columns) after sampling.
MergedData.shape

(2000000, 24)

In [47]:
# Artist -> genre relation table.
rArtistGenre = pd.read_sql_query("SELECT * FROM r_artist_genre", con)

# Left join keeps every sampled row, including artists without any genre;
# a row is repeated once per genre its artist has.
merged_generes_data = MergedData.merge(rArtistGenre, how="left", on="artist_id")
# Coarse genre buckets, checked in this order (the first match wins).
known_genres = ['pop', 'rock', 'metal', 'rap', 'hip hop', 'classic', 'alt', 'jazz', 'indie']


def assign_main_genre(row):
    """Map a row's ``genre_id`` to the first known genre it contains.

    Matching is a plain substring test, so e.g. ``'trap'`` maps to ``'rap'``
    and ``'classic rock'`` maps to ``'rock'`` (``'rock'`` is checked first).

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``'genre_id'`` entry.

    Returns:
        One of ``known_genres``, or ``'others'`` when ``genre_id`` is not a
        string or contains none of the known genres.
    """
    if not isinstance(row['genre_id'], str):
        return 'others'
    return next(
        (candidate for candidate in known_genres if candidate in row['genre_id']),
        'others',
    )

# assign_main_genre only reads row['genre_id'], so map over that single column
# instead of DataFrame.apply(axis=1), which builds a Series for every row and
# is far slower on millions of rows. Missing genre ids still reach the function
# and come back as 'others'.
merged_generes_data['main_genre'] = merged_generes_data['genre_id'].map(
    lambda genre_id: assign_main_genre({'genre_id': genre_id})
)

In [48]:
# Sanity check: (rows, columns) of MergedData after the genre merge cell.
MergedData.shape

(2000000, 24)

In [49]:
# Number of distinct values per column, to gauge the cardinality of ids and genres.
merged_generes_data.nunique()

album_id             484394
track_id            1898677
artist_id            283802
albumName            430304
album_group               1
album_type                2
release_date          16867
popularity               99
acousticness           5398
analysis_url        1898677
danceability           1375
duration             296554
energy                 3608
instrumentalness       5402
key                      12
liveness               1828
loudness              41728
mode                      2
speechiness            1658
tempo                149910
time_signature            5
valence                2491
artistName           278734
release_year            118
genre_id               5458
main_genre               10
dtype: int64

Просто обалдеть, как много айди артистов и жанров. В первой версии закодируем артистов так, будто все артисты одного жанра образуют один «плейлист», — понятно, что это далеко не лучшая идея, но именно она позволяет в первом приближении протестировать бейзлайн. В дальнейшем нужно выгрузить из Спотифая много плейлистов и посмотреть на состав артистов в них. То же применимо и к жанрам из расширенного списка.

In [50]:
# One "playlist" per genre_id: the distinct artists tagged with that genre.
# Rows without a genre_id form no group (groupby skips missing keys by default).
artist_playlists = [
    list(genre_rows['artist_id'].unique())
    for _genre, genre_rows in merged_generes_data.groupby('genre_id')
]


In [51]:
from gensim.models import Word2Vec
# Embedding dimensionality; also used later when naming the embedding columns.
vector_size = 200

# Training settings; per the gensim documentation sg=1 selects skip-gram and
# min_count=1 keeps tokens (artist ids) that occur only once.
word2vec_params = dict(
    vector_size=vector_size,
    window=5,
    min_count=1,
    sg=1,
)
# Each genre playlist is treated as a sentence and each artist id as a word.
artist_model = Word2Vec(sentences=artist_playlists, **word2vec_params)

In [52]:
# Spot check: show the name of the artist whose embedding is closest to the query artist.
query_artist_id = '1uNFoZAHBGtllmzznpCI3s'  # Justin Bieber
nearest_artist_id = artist_model.wv.most_similar(query_artist_id)[0][0]
merged_generes_data.loc[
    merged_generes_data['artist_id'] == nearest_artist_id, 'artistName'
].head(1)

144101    Markus Johansson
Name: artistName, dtype: object

Размер вектора надо еще крутить, но пока попробуем так.

In [53]:
import numpy as np

# Look each artist up in the Word2Vec model once per *unique* artist and
# broadcast the vectors back to the rows. Building a pd.Series per row (as
# Series.apply with a Series-returning lambda does) is prohibitively slow for
# millions of rows.
artist_codes, unique_artists = pd.factorize(MergedData['artist_id'])

# Artists unknown to the model keep an all-NaN vector.
unique_vectors = np.full((len(unique_artists), vector_size), np.nan, dtype=np.float32)
for position, artist_key in enumerate(unique_artists):
    if artist_key in artist_model.wv:
        unique_vectors[position] = artist_model.wv[artist_key]

# factorize marks missing artist ids with code -1; those rows stay all-NaN.
embedding_values = np.full((len(artist_codes), vector_size), np.nan, dtype=np.float32)
has_artist = artist_codes >= 0
embedding_values[has_artist] = unique_vectors[artist_codes[has_artist]]

# Same index as MergedData so the frames can be concatenated column-wise.
artist_embedding = pd.DataFrame(
    embedding_values,
    index=MergedData.index,
    columns=[f'artist_embedding_{e}' for e in range(vector_size)],
)

In [54]:
# Preview the embedding frame; all-NaN rows are artists without a vector in the model.
artist_embedding.head()

Unnamed: 0,artist_embedding_0,artist_embedding_1,artist_embedding_2,artist_embedding_3,artist_embedding_4,artist_embedding_5,artist_embedding_6,artist_embedding_7,artist_embedding_8,artist_embedding_9,...,artist_embedding_190,artist_embedding_191,artist_embedding_192,artist_embedding_193,artist_embedding_194,artist_embedding_195,artist_embedding_196,artist_embedding_197,artist_embedding_198,artist_embedding_199
6465512,,,,,,,,,,,...,,,,,,,,,,
7648496,,,,,,,,,,,...,,,,,,,,,,
463458,,,,,,,,,,,...,,,,,,,,,,
4955776,,,,,,,,,,,...,,,,,,,,,,
7011397,-0.004242,0.000487,-0.003103,-0.001952,-0.000713,-0.002131,0.000788,0.001003,0.002345,-0.004067,...,0.00467,-0.004072,0.003893,0.003603,0.004268,-0.001558,-0.001455,0.004816,-0.00277,-0.004483


In [55]:
# Column order of the resulting frame: every known genre plus the fallback bucket.
genre_columns = known_genres + ['others']

# One-hot encode main_genre per row, then collapse to one row per track: a
# flag is 1 if any row of that track carries the genre. This replaces a Python
# loop over every track group with vectorised pandas operations.
genre_df = (
    pd.get_dummies(merged_generes_data['main_genre'])
    .groupby(merged_generes_data['track_id'])
    .max()
    .reindex(columns=genre_columns, fill_value=0)  # genres absent from the data -> 0
    .astype(int)
    .rename_axis(index=None)  # unnamed index, so reset_index() yields an 'index' column
)

# Nested mapping {track_id: {genre: 0/1}}, kept in case other cells still use it.
genres = genre_df.to_dict(orient='index')

In [56]:
# Preview the per-track genre flags (index: track id, one 0/1 column per genre bucket).
genre_df.head()

Unnamed: 0,pop,rock,metal,rap,hip hop,classic,alt,jazz,indie,others
0000pHgyPFaekcPYkSiZ5T,0,0,0,0,0,0,0,0,0,1
0000uJA4xCdxThagdLkkLR,0,0,0,0,0,0,0,0,0,1
0002UmhGH9V9BiMN9lyMHW,0,0,0,0,0,0,0,0,0,1
0006bYz7i78jMS4hiuGeJe,0,0,0,0,0,0,0,0,0,1
00075E86IyrXpuZcdibi60,0,0,0,0,0,0,0,0,0,1


In [57]:
# Per-track genre flags re-aligned to MergedData's own index: the index is
# moved into an 'index' column for the merge and restored afterwards.
genre_flags = (
    MergedData.reset_index()
    .merge(
        genre_df.reset_index().rename({'index': 'track_id'}, axis=1),
        on=['track_id'],
        how='left',
    )
    .set_index('index')[genre_df.columns]
)

# 0/1 indicator columns for the small categorical audio attributes.
categorical_dummies = [
    pd.get_dummies(MergedData[column], prefix=column).astype(int)
    for column in ('time_signature', 'key', 'mode')
]

# Join all feature groups column-wise; dropna() then removes every row with a
# missing value in any column, e.g. rows whose artist has no embedding vector.
MergedData = pd.concat(
    [MergedData, artist_embedding, genre_flags, *categorical_dummies],
    axis=1,
).dropna()

In [58]:
# Sanity check: (rows, columns) after adding the feature columns and dropping incomplete rows.
MergedData.shape

(1298012, 253)

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def content_based_recommendation(data, track_id, n=10):
    """Return up to ``n`` rows of ``data`` that are most similar to ``track_id``.

    Every column except the identifier/metadata columns dropped below is used
    as a feature. Features are standardised with ``StandardScaler`` and the
    candidates are ranked by cosine similarity to the query track.

    Args:
        data: DataFrame with a ``track_id`` column, the metadata columns that
            are dropped below, and the feature columns. It is not modified.
        track_id: id of the query track; must occur in ``data['track_id']``.
            If it occurs on several rows, the first one is used as the query.
        n: maximum number of recommendations to return.

    Returns:
        DataFrame with the selected rows of ``data``, most similar first.
        The feature columns hold the standardised values. Rows belonging to
        the query track itself are never returned.

    Raises:
        ValueError: if ``track_id`` does not occur in ``data``.
    """
    # Rows of the query track. A track id can appear on more than one row, so
    # the query is pinned to a single row and all of its rows are excluded from
    # the candidates, instead of assuming the query is the single best match.
    query_mask = (data['track_id'] == track_id).to_numpy()
    if not query_mask.any():
        raise ValueError(f"track_id {track_id!r} not found in data")

    # Drop the columns that are not used as features
    features = data.drop(
        ['album_id', 'track_id', 'artist_id', 'albumName', 'album_group',
         'release_date', 'analysis_url', 'artistName', 'album_type',
         'key', 'mode', 'time_signature',
        ],
        axis=1,
    ).columns

    # Standardise only the feature columns, without touching the caller's frame
    scaled = StandardScaler().fit_transform(data[features])

    # Cosine similarity between the query row and every row
    query_vector = scaled[query_mask][:1]
    similarities = cosine_similarity(query_vector, scaled).ravel()

    # Rank all non-query rows, best first, and keep the top n
    candidate_positions = (~query_mask).nonzero()[0]
    ranking = similarities[candidate_positions].argsort()[::-1][:max(n, 0)]
    similar_indices = candidate_positions[ranking]

    # Return the standardised feature values for the selected rows
    similar_tracks = data.iloc[similar_indices].copy()
    similar_tracks[features] = scaled[similar_indices]

    return similar_tracks


Возьмем в качестве примера какой-нибудь трек какого-нибудь популярного артиста, чтобы легко было угадать рекомендации. Попробуем The Weeknd https://open.spotify.com/track/6kWxIqQDsKFYCJGbU4AjCX.

In [60]:
# Raise the display row limit so the transposed view below is not truncated.
pd.set_option('display.max_rows', 500)
# Show the query track's row(s), transposed so each column becomes a line.
MergedData[MergedData['track_id'] == '6kWxIqQDsKFYCJGbU4AjCX'].T

Unnamed: 0,2114509
album_id,4yP0hdKOZPNshxUOjY0cZj
track_id,6kWxIqQDsKFYCJGbU4AjCX
artist_id,1Xyo4u8uXC1ZmMpatF05PJ
albumName,After Hours
album_group,
album_type,album
release_date,1584662400000
popularity,97
acousticness,0.0909
analysis_url,https://api.spotify.com/v1/audio-analysis/6kWx...


In [61]:
# Ask for the 100 nearest tracks to the query; a copy is passed so MergedData
# itself stays untouched regardless of what the function does to its input.
similar_tracks = content_based_recommendation(MergedData.copy(), '6kWxIqQDsKFYCJGbU4AjCX', 100) 

In [62]:
# Show only the id and artist name of each recommended track.
similar_tracks[['track_id', 'artistName']]

Unnamed: 0,track_id,artistName
2114513,6bnF93Rx87YqUBLSgjiMU8,The Weeknd
2114510,4BGZF4oLbTL0pWm7C18pbv,The Weeknd
2114518,40U8d12pC5UHqmHwXjHjjl,The Weeknd
2114470,1gZADNt16Oh23jWyMYRk4p,The Weeknd
2114472,7MXVkk9YMctZqd1Srtv4MB,The Weeknd
2114489,3dhjNA0jGA8vHBQ1VdD6vV,The Weeknd
2114506,5oAOK7xCJD8hzp9PuxjULL,The Weeknd
2114448,7eZ7ODAt9cALYnWLO8F6Fd,The Weeknd
2114458,03j354P848KtNU2FVSwkDG,The Weeknd
2114476,36YCdzT57us0LhDmCYtrNE,The Weeknd


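Мне кажется, в данном случае получилось очень даже ничего, если нас пока устраивает, что в ответ на запрос поп/рэп/диско нам предлагается поп/рэп/диско. При этом все показанные рекомендации — треки того же The Weeknd: вероятно, потому что 200 из примерно 240 признаков приходятся на эмбеддинг артиста, и он доминирует в косинусной близости.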