In [2]:
import pandas as pd
import sqlite3
import pickle
import numpy as np

In [3]:
db_path = "/Users/akovel/Documents/HSE/Music-Predictor/data" + "/MergedData"

In [4]:
def loadData():
    # for reading also binary mode is important
    dbfile = open(db_path, 'rb')
    db = pickle.load(dbfile)
    dbfile.close()
    return db

MergedData = loadData()

In [5]:
MergedData.shape


(19308111, 25)

In [6]:
known_genres = ['pop', 'rock', 'metal', 'rap', 'hip hop', 'classic', 'alt', 'jazz', 'indie']
def assign_main_genre(row):
    if isinstance(row['genre_id'], str):
        for genre in known_genres:
            if genre in row['genre_id']:
                return genre
    return 'others'

MergedData['main_genre'] = MergedData.apply(assign_main_genre, axis=1)

In [7]:
MergedData.nunique()

album_id             513919
track_id            7906982
artist_id            300929
albumName            454545
album_group               1
album_type                2
release_date          17169
popularity               99
acousticness           5398
analysis_url        7906982
danceability           1427
duration             497552
energy                 4036
instrumentalness       5402
key                      12
liveness               1897
loudness              47351
mode                      2
speechiness            1660
tempo                170659
time_signature            5
valence                2906
genre_id               5461
artistName           295312
release_year            118
main_genre               10
dtype: int64

In [8]:
MergedData.select_dtypes(include=['number'])


Unnamed: 0,release_date,popularity,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,release_year
5128108,1616112000000,100,0.127000,0.595,174406,0.543,0.000000,11,0.0975,-8.149000,1,0.0380,99.928001,4,0.109,2021
5128126,1616112000000,100,0.010400,0.658,170813,0.634,0.000000,1,0.1200,-6.068000,0,0.0431,140.001999,4,0.302,2021
5128129,1616112000000,100,0.000284,0.551,179415,0.756,0.000095,2,0.1180,-7.679000,1,0.0450,75.990997,4,0.359,2021
5128130,1616112000000,100,0.000284,0.551,179415,0.756,0.000095,2,0.1180,-7.679000,1,0.0450,75.990997,4,0.359,2021
5128131,1616112000000,100,0.185000,0.601,153190,0.741,0.000029,2,0.4150,-5.569000,1,0.0478,153.960007,4,0.441,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12821374,1391040000000,0,0.940000,0.631,189307,0.157,0.000144,6,0.2270,-12.882000,1,0.2750,140.503006,4,0.476,2014
12821373,1391040000000,0,0.940000,0.631,189307,0.157,0.000144,6,0.2270,-12.882000,1,0.2750,140.503006,4,0.476,2014
12821372,1391040000000,0,0.940000,0.631,189307,0.157,0.000144,6,0.2270,-12.882000,1,0.2750,140.503006,4,0.476,2014
12821371,1391040000000,0,0.200000,0.765,302387,0.347,0.166000,9,0.1020,-16.474001,1,0.2650,120.860001,4,0.586,2014


# Найдем просто так 10 ближайших

In [12]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [17]:
feature_columns = MergedData.select_dtypes(include=['number']).columns
features = MergedData[feature_columns]

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

features_with_ids = pd.DataFrame(features_scaled, columns=feature_columns)
features_with_ids['artistName'] = MergedData['artistName']
print(features_with_ids.head())


   release_date  popularity  acousticness  danceability  duration    energy  \
0      1.126801    4.448192     -0.904818      0.458528 -0.410392  0.122939   
1      1.126801    4.448192     -1.206978      0.785093 -0.432916  0.433073   
2      1.126801    4.448192     -1.233192      0.230451 -0.378991  0.848856   
3      1.126801    4.448192     -1.233192      0.230451 -0.378991  0.848856   
4      1.126801    4.448192     -0.754515      0.489630 -0.543392  0.797735   

   instrumentalness       key  liveness  loudness      mode  speechiness  \
0         -0.693441  1.640752 -0.612527  0.547619  0.696270    -0.407061   
1         -0.693441 -1.196331 -0.498481  0.850418 -1.436225    -0.367354   
2         -0.693182 -0.912622 -0.508618  0.616007  0.696270    -0.352561   
3         -0.693182 -0.912622 -0.508618  0.616007  0.696270    -0.352561   
4         -0.693362 -0.912622  0.996790  0.923025  0.696270    -0.330761   

      tempo  time_signature   valence  release_year   artistName  
0

In [20]:
target_artist_id = "Metallica"

target_artist_features = features_with_ids[features_with_ids['artistName'] == target_artist_id].drop(columns=['artistName'])

nbrs = NearestNeighbors(n_neighbors=11)
nbrs.fit(features_scaled)

distances, indices = nbrs.kneighbors(target_artist_features)

similar_artist_ids = features_with_ids.iloc[indices[0][1:]]['artistName'].values

similar_artists = MergedData[MergedData['artistName'].isin(similar_artist_ids)]

print("Top 10 most similar artists:")
print(similar_artists[['artistName', 'popularity', 'energy', 'danceability']])



Top 10 most similar artists:
           artistName   artistName  popularity  energy  danceability
95201     Chuck Berry  Chuck Berry          72   0.141         0.435
95200     Chuck Berry  Chuck Berry          72   0.141         0.435
95108     Chuck Berry  Chuck Berry          72   0.645         0.528
95110     Chuck Berry  Chuck Berry          72   0.645         0.528
95111     Chuck Berry  Chuck Berry          72   0.645         0.528
...               ...          ...         ...     ...           ...
12820642  Chuck Berry  Chuck Berry           0   0.472         0.357
12820641  Chuck Berry  Chuck Berry           0   0.315         0.639
12820640  Chuck Berry  Chuck Berry           0   0.315         0.639
12820639  Chuck Berry  Chuck Berry           0   0.315         0.639
12820638  Chuck Berry  Chuck Berry           0   0.315         0.639

[11318 rows x 5 columns]


## Ну... Тут ничего не попишешь. Давайте искать внутри жанра

In [24]:
target_artist_name = "Metallica"
target_artist_data = MergedData[MergedData['artistName'] == target_artist_name]

target_genre = target_artist_data['main_genre'].values[0]
print(set(target_artist_data['main_genre'].values))

genre_artists_data = MergedData[MergedData['main_genre'] == target_genre]

genre_features = genre_artists_data[feature_columns]

scaler = StandardScaler()
genre_features_scaled = scaler.fit_transform(genre_features)

nbrs = NearestNeighbors(n_neighbors=11)  
nbrs.fit(genre_features_scaled)

target_artist_features = target_artist_data[feature_columns]
target_artist_scaled = scaler.transform(target_artist_features)

distances, indices = nbrs.kneighbors(target_artist_scaled)

similar_artists_indices = indices[0][1:]  
similar_artists = genre_artists_data.iloc[similar_artists_indices]

print(f"Top 10 most similar artists to {target_artist_name} in genre '{target_genre}':")
print(similar_artists[['artist_id', 'artistName', 'popularity', 'energy', 'danceability']])

{'others', 'rock', 'metal'}
Top 10 most similar artists to Metallica in genre 'others':
                      artist_id    artistName  popularity  energy  \
4865920  2ye2Wgw4gimLv2eAKyk1NB     Metallica          82   0.798   
404355   51Blml2LZPmy7TTiAg47vQ            U2          74   0.843   
452362   0EdvGhlC1FkGItLOWQzG4J       Sublime          78   0.769   
452361   0EdvGhlC1FkGItLOWQzG4J       Sublime          78   0.769   
452360   0EdvGhlC1FkGItLOWQzG4J       Sublime          78   0.769   
362517   0kObWap02DEg9EAJ3PBxzf      Starship          71   0.802   
362518   0kObWap02DEg9EAJ3PBxzf      Starship          71   0.802   
362516   0kObWap02DEg9EAJ3PBxzf      Starship          71   0.802   
431956   3jOstUTkEu2JkjvRdBA5Gu        Weezer          76   0.729   
340414   6hN9F0iuULZYWXppob22Aj  Simple Minds          73   0.860   

         danceability  
4865920         0.536  
404355          0.635  
452362          0.664  
452361          0.664  
452360          0.664  
362517  

### Забавный факт номер три. Теперь металлика не металл. Давайте сделаем жетсе

In [26]:
target_artist_name = "Metallica"
target_artist_data = MergedData[MergedData['artistName'] == target_artist_name]

target_genre = "metal"

genre_artists_data = MergedData[MergedData['main_genre'] == target_genre]

genre_features = genre_artists_data[feature_columns]

scaler = StandardScaler()
genre_features_scaled = scaler.fit_transform(genre_features)

nbrs = NearestNeighbors(n_neighbors=11)  
nbrs.fit(genre_features_scaled)

target_artist_features = target_artist_data[feature_columns]
target_artist_scaled = scaler.transform(target_artist_features)

distances, indices = nbrs.kneighbors(target_artist_scaled)

similar_artists_indices = indices[0][1:]  
similar_artists = genre_artists_data.iloc[similar_artists_indices]

print(f"Top 10 most similar artists to {target_artist_name} in genre '{target_genre}':")
print(similar_artists[['artist_id', 'artistName', 'popularity', 'energy', 'danceability', "albumName"]])

Top 10 most similar artists to Metallica in genre 'metal':
                      artist_id                artistName  popularity  energy  \
4865879  2ye2Wgw4gimLv2eAKyk1NB                 Metallica          82   0.846   
411820   2d0hyoQ5ynDBnkvAbJKORj  Rage Against The Machine          76   0.869   
411817   2d0hyoQ5ynDBnkvAbJKORj  Rage Against The Machine          76   0.869   
411823   2d0hyoQ5ynDBnkvAbJKORj  Rage Against The Machine          76   0.869   
411821   2d0hyoQ5ynDBnkvAbJKORj  Rage Against The Machine          76   0.869   
4865919  2ye2Wgw4gimLv2eAKyk1NB                 Metallica          82   0.798   
4865922  2ye2Wgw4gimLv2eAKyk1NB                 Metallica          82   0.798   
381008   0cc6vw3VN8YlIcvr1v7tBL               Mötley Crüe          75   0.888   
381006   0cc6vw3VN8YlIcvr1v7tBL               Mötley Crüe          75   0.888   
380996   0cc6vw3VN8YlIcvr1v7tBL               Mötley Crüe          75   0.851   

         danceability                            

### Металлика больше похоже всего на Rage Against Machine. МДа...