In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import getpass
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [6]:
client_id = str(getpass.getpass('client_id?'))
client_sectret = str(getpass.getpass('client_secret?'))
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=client_id, client_secret=client_sectret))

In [7]:
def all_albums_from_artist(artist_id):
    albums_ids = []
    albums = sp.artist_albums(artist_id)
    for album in range(len(albums['items'])):
        albums_ids.append(albums['items'][album]['id'])
    return list(set(albums_ids))

In [8]:
def all_songs_from_albums(albums_id_list):
    ids = []
    for album_id in tqdm(albums_id_list):
        songs_ids = sp.album_tracks(album_id)
        for id in range(len(songs_ids['items'])):
            ids.append(songs_ids['items'][id]['id'])         
    ids_chunks = []
    for i in tqdm(range(0, len(ids), 100)):
        ids_chunks.append(ids[i:i + 100])
    audio_feat = []
    for i in tqdm(ids_chunks):
        audio_feat.append(sp.audio_features(tracks=i))
    audio_feat_total = []
    for i in tqdm(range(len(audio_feat))):
        audio_feat_total += audio_feat[i]
    audio_feats_clean = [dct for dct in audio_feat_total if dct]
    df = pd.DataFrame(audio_feats_clean)
    return df.drop(['type', 'track_href', 'uri', 'analysis_url'], axis=1)
    

In [9]:
def release_year(df):
    years = []
    for id in tqdm(df['id']):
        track = sp.track(id)
        years.append(track['album']['release_date'][0:4])
    return years

In [12]:
def fix_tempo(df):
    df['tempo'] = df['tempo'].apply(lambda x: x*2 if x < 60 else x)
    df['tempo'] = df['tempo'].apply(lambda x: x/2 if x > 200 else x)
    df['tempo'] = df['tempo'].apply(lambda x: 120 if x == 0 else x)
    return df

In [123]:
def genre_predict():
    artist_name = input('Enter artist name:')
    print('Listening to', artist_name.upper())
    artist = sp.search(artist_name)
    artist_id = artist['tracks']['items'][0]['artists'][0]['id']
    albums = all_albums_from_artist(artist_id)
    songs = all_songs_from_albums(albums)
    # print('Getting release years (this is gonna take a while...)')
    # songs['year'] = release_year(songs)
    songs = fix_tempo(songs)
    X = songs.drop(['id'], axis=1)
    prediction = forest_model.predict(X)
    pred_genres = np.unique(prediction, return_counts=True)
    genres_list = ['jazz', 'rock', 'hip-hop', 'pop', 'electronic']
    genres_dict = dict(zip(pred_genres[0], pred_genres[1]))
    if len(genres_dict) > 1:
        genres_sorted = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
        main_genre = genres_list[int(genres_sorted[0][0])-1]
        print('Sounds like', artist_name.upper(), 'plays', main_genre, 'music')
    else:
        print('Sounds like', artist_name.upper(), 'plays', genres_list[int(pred_genres[0])-1], 'music')

In [72]:
music = pd.read_csv('data/music.csv')

In [73]:
music.head()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,genre,year
0,0,0.706,0.742,5,-6.431,1,0.1,0.178,0.0191,0.433,0.299,82.483,6t4CS8bsKY5Gu1LwvfgATh,310680,4,1,2007
1,1,0.842,0.42,7,-9.656,1,0.112,0.0743,0.0964,0.122,0.583,92.005,4c0FWOg4R7KsSlq4vdQDrX,311080,4,1,2007
2,2,0.766,0.963,10,-5.0,1,0.286,0.584,0.00565,0.465,0.692,98.023,04zDTgL1znpSw5SkobWMk9,240960,4,1,2007
3,3,0.65,0.827,5,-8.784,1,0.0452,0.00127,0.468,0.117,0.248,96.016,1dioz5qr8Rxm2ADIKkyaHc,303333,4,1,2007
4,4,0.749,0.863,7,-6.865,1,0.131,0.0112,0.157,0.0969,0.627,87.959,1AE7Kt2nz88jjaqyOYEg7r,228360,4,1,2007


In [74]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142783 entries, 0 to 142782
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        142783 non-null  int64  
 1   danceability      142783 non-null  float64
 2   energy            142783 non-null  float64
 3   key               142783 non-null  int64  
 4   loudness          142783 non-null  float64
 5   mode              142783 non-null  int64  
 6   speechiness       142783 non-null  float64
 7   acousticness      142783 non-null  float64
 8   instrumentalness  142783 non-null  float64
 9   liveness          142783 non-null  float64
 10  valence           142783 non-null  float64
 11  tempo             142783 non-null  float64
 12  id                142783 non-null  object 
 13  duration_ms       142783 non-null  int64  
 14  time_signature    142783 non-null  int64  
 15  genre             142783 non-null  int64  
 16  year              14

In [75]:
music['genre'].value_counts()

5    37730
2    34062
4    26390
1    25968
3    18633
Name: genre, dtype: int64

In [76]:
len(music.loc[music['tempo'] == 0])

0

In [77]:
len(music.loc[music['tempo'] > 200])

0

In [78]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142783 entries, 0 to 142782
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        142783 non-null  int64  
 1   danceability      142783 non-null  float64
 2   energy            142783 non-null  float64
 3   key               142783 non-null  int64  
 4   loudness          142783 non-null  float64
 5   mode              142783 non-null  int64  
 6   speechiness       142783 non-null  float64
 7   acousticness      142783 non-null  float64
 8   instrumentalness  142783 non-null  float64
 9   liveness          142783 non-null  float64
 10  valence           142783 non-null  float64
 11  tempo             142783 non-null  float64
 12  id                142783 non-null  object 
 13  duration_ms       142783 non-null  int64  
 14  time_signature    142783 non-null  int64  
 15  genre             142783 non-null  int64  
 16  year              14

## Fixing the tempo from 60 to 200 BPM

In [79]:
#music.to_csv('music.csv, index=False')

In [125]:
y = music['genre']
X = music.drop(['Unnamed: 0', 'genre', 'id', 'year'], axis=1)

In [166]:
r_tree = RandomForestClassifier(n_estimators=130, max_depth=260, bootstrap=False)

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
forest_model = r_tree.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

In [168]:
accuracy_score(y_test, y_pred)

0.8597875569044006

In [129]:
fi = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': forest_model.feature_importances_}).\
                    sort_values('importance', ascending = False)
fi

Unnamed: 0,feature,importance
3,loudness,0.131767
6,acousticness,0.131266
0,danceability,0.116587
1,energy,0.116272
5,speechiness,0.101458
11,duration_ms,0.084715
7,instrumentalness,0.083625
9,valence,0.078666
8,liveness,0.057729
10,tempo,0.053906


In [149]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [161]:
param_grid = [
{'n_estimators': range(30, 200), 'max_depth': range(2, 300)},
{'bootstrap': [False], 'n_estimators': range(30, 200), 'max_depth': range(2, 300)},
]
forest_class = RandomForestClassifier()
grid_search = RandomizedSearchCV(forest_class, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions=[{'max_depth': range(2, 300),
                                         'n_estimators': range(30, 200)},
                                        {'bootstrap': [False],
                                         'max_depth': range(2, 300),
                                         'n_estimators': range(30, 200)}],
                   return_train_score=True, scoring='neg_mean_squared_error')

In [162]:
grid_search.best_params_

{'n_estimators': 130, 'max_depth': 260, 'bootstrap': False}

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [169]:
param_grid = [
{'n_estimators': [130, 183, 200], 'max_depth': [100, 200, 260]},
{'bootstrap': [False], 'n_estimators': [130, 183, 200], 'max_depth': [100, 200, 260]},
]
forest_class = RandomForestClassifier()
grid_search = GridSearchCV(forest_class, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [153]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

nan {'n_estimators': 34, 'max_features': 24, 'bootstrap': False}
nan {'n_estimators': 136, 'max_features': 109}
nan {'n_estimators': 30, 'max_features': 150}
nan {'n_estimators': 7, 'max_features': 161, 'bootstrap': False}
0.960614344651708 {'n_estimators': 44, 'max_features': 11}
nan {'n_estimators': 36, 'max_features': 118, 'bootstrap': False}
nan {'n_estimators': 199, 'max_features': 29}
nan {'n_estimators': 122, 'max_features': 148}
nan {'n_estimators': 39, 'max_features': 94}
nan {'n_estimators': 150, 'max_features': 107}


In [154]:
grid_search.best_estimator_

RandomForestClassifier(max_features=11, n_estimators=44)

error_rate

In [130]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.83      0.82      0.82      7884
           2       0.85      0.84      0.84     10150
           3       0.81      0.86      0.84      5553
           4       0.84      0.81      0.82      7969
           5       0.90      0.92      0.91     11279

    accuracy                           0.85     42835
   macro avg       0.85      0.85      0.85     42835
weighted avg       0.85      0.85      0.85     42835



In [136]:
genre_predict()

Listening to BILLIE EILISH


100%|██████████| 20/20 [00:01<00:00, 11.02it/s]
100%|██████████| 1/1 [00:00<00:00, 213.85it/s]
100%|██████████| 1/1 [00:00<00:00,  6.61it/s]
100%|██████████| 1/1 [00:00<00:00, 831.21it/s]


Sounds like BILLIE EILISH plays jazz music
