In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import getpass
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
client_id = str(getpass.getpass('client_id?'))
client_sectret = str(getpass.getpass('client_secret?'))
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=client_id, client_secret=client_sectret))

In [3]:
def all_albums_from_artist(artist_id):
    albums_ids = []
    albums = sp.artist_albums(artist_id)
    for album in range(len(albums['items'])):
        albums_ids.append(albums['items'][album]['id'])
    return list(set(albums_ids))

In [4]:
def all_songs_from_albums(albums_id_list):
    ids = []
    for album_id in tqdm(albums_id_list):
        songs_ids = sp.album_tracks(album_id)
        for id in range(len(songs_ids['items'])):
            ids.append(songs_ids['items'][id]['id'])         
    ids_chunks = []
    for i in tqdm(range(0, len(ids), 100)):
        ids_chunks.append(ids[i:i + 100])
    audio_feat = []
    for i in tqdm(ids_chunks):
        audio_feat.append(sp.audio_features(tracks=i))
    audio_feat_total = []
    for i in tqdm(range(len(audio_feat))):
        audio_feat_total += audio_feat[i]
    audio_feats_clean = [dct for dct in audio_feat_total if dct]
    df = pd.DataFrame(audio_feats_clean)
    return df.drop(['type', 'track_href', 'uri', 'analysis_url'], axis=1)
    

In [5]:
def release_year(df):
    years = []
    for id in tqdm(df['id']):
        track = sp.track(id)
        years.append(track['album']['release_date'][0:4])
    return years

In [6]:
def fix_tempo(df):
    df['tempo'] = df['tempo'].apply(lambda x: x*2 if x < 60 else x)
    df['tempo'] = df['tempo'].apply(lambda x: x/2 if x > 200 else x)
    df['tempo'] = df['tempo'].apply(lambda x: 120 if x == 0 else x)
    return df

In [144]:
def genre_predict():
    artist_name = input('Enter artist name:')
    print('Listening to', artist_name.upper())
    artist = sp.search(artist_name)
    artist_id = artist['tracks']['items'][0]['artists'][0]['id']
    albums = all_albums_from_artist(artist_id)
    songs = all_songs_from_albums(albums)
    print(len(songs), 'songs')
    songs = fix_tempo(songs)
    X = songs.drop(['id'], axis=1)
    prediction = forest_model.predict(X)
    pred_genres = np.unique(prediction, return_counts=True)
    genres_list = ['jazz', 'rock', 'hip-hop', 'pop', 'electronic']
    genres_dict = dict(zip(pred_genres[0], pred_genres[1]))
    if len(genres_dict) > 1:
        genres_sorted = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
        main_genre = genres_list[int(genres_sorted[0][0])-1]
        print('Sounds like', artist_name.upper(), 'plays', main_genre, 'music')
    else:
        print('Sounds like', artist_name.upper(), 'plays', genres_list[int(pred_genres[0])-1], 'music')

In [145]:
def genre_predict2():
    artist_name = input('Enter artist name:')
    print('Listening to', artist_name.upper())
    artist = sp.search(artist_name)
    artist_id = artist['tracks']['items'][0]['artists'][0]['id']
    albums = all_albums_from_artist(artist_id)
    songs = all_songs_from_albums(albums)
    print(len(songs), 'songs')
    print('Getting release years (can take a while)')
    songs['year'] = release_year(songs)
    songs = fix_tempo(songs)
    X = songs.drop(['id'], axis=1)
    prediction = forest_model2.predict(X)
    pred_genres = np.unique(prediction, return_counts=True)
    genres_list = ['jazz', 'rock', 'hip-hop', 'pop', 'electronic']
    genres_dict = dict(zip(pred_genres[0], pred_genres[1]))
    if len(genres_dict) > 1:
        genres_sorted = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
        main_genre = genres_list[int(genres_sorted[0][0])-1]
        print('Sounds like', artist_name.upper(), 'plays', main_genre, 'music')
    else:
        print('Sounds like', artist_name.upper(), 'plays', genres_list[int(pred_genres[0])-1], 'music')

In [8]:
music = pd.read_csv('data/music.csv')

In [9]:
music.head()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,genre,year
0,0,0.706,0.742,5,-6.431,1,0.1,0.178,0.0191,0.433,0.299,82.483,6t4CS8bsKY5Gu1LwvfgATh,310680,4,1,2007
1,1,0.842,0.42,7,-9.656,1,0.112,0.0743,0.0964,0.122,0.583,92.005,4c0FWOg4R7KsSlq4vdQDrX,311080,4,1,2007
2,2,0.766,0.963,10,-5.0,1,0.286,0.584,0.00565,0.465,0.692,98.023,04zDTgL1znpSw5SkobWMk9,240960,4,1,2007
3,3,0.65,0.827,5,-8.784,1,0.0452,0.00127,0.468,0.117,0.248,96.016,1dioz5qr8Rxm2ADIKkyaHc,303333,4,1,2007
4,4,0.749,0.863,7,-6.865,1,0.131,0.0112,0.157,0.0969,0.627,87.959,1AE7Kt2nz88jjaqyOYEg7r,228360,4,1,2007


In [10]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142783 entries, 0 to 142782
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        142783 non-null  int64  
 1   danceability      142783 non-null  float64
 2   energy            142783 non-null  float64
 3   key               142783 non-null  int64  
 4   loudness          142783 non-null  float64
 5   mode              142783 non-null  int64  
 6   speechiness       142783 non-null  float64
 7   acousticness      142783 non-null  float64
 8   instrumentalness  142783 non-null  float64
 9   liveness          142783 non-null  float64
 10  valence           142783 non-null  float64
 11  tempo             142783 non-null  float64
 12  id                142783 non-null  object 
 13  duration_ms       142783 non-null  int64  
 14  time_signature    142783 non-null  int64  
 15  genre             142783 non-null  int64  
 16  year              14

In [11]:
music['genre'].value_counts()

5    37730
2    34062
4    26390
1    25968
3    18633
Name: genre, dtype: int64

In [12]:
len(music.loc[music['tempo'] == 0])

0

In [13]:
len(music.loc[music['tempo'] > 200])

0

In [14]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142783 entries, 0 to 142782
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        142783 non-null  int64  
 1   danceability      142783 non-null  float64
 2   energy            142783 non-null  float64
 3   key               142783 non-null  int64  
 4   loudness          142783 non-null  float64
 5   mode              142783 non-null  int64  
 6   speechiness       142783 non-null  float64
 7   acousticness      142783 non-null  float64
 8   instrumentalness  142783 non-null  float64
 9   liveness          142783 non-null  float64
 10  valence           142783 non-null  float64
 11  tempo             142783 non-null  float64
 12  id                142783 non-null  object 
 13  duration_ms       142783 non-null  int64  
 14  time_signature    142783 non-null  int64  
 15  genre             142783 non-null  int64  
 16  year              14

## Fixing the tempo from 60 to 200 BPM

In [15]:
#music.to_csv('music.csv, index=False')

# It 1

In [None]:
y = music['genre']
X = music.drop(['Unnamed: 0', 'genre', 'id', 'years'], axis=1)

In [118]:
r_tree = RandomForestClassifier(n_estimators=186, max_depth=196, max_features='log2', bootstrap=False)

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
forest_model = r_tree.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

In [115]:
accuracy_score(y_test, y_pred)

0.8701450316555162

In [104]:
param_grid = [
{'n_estimators': range(100, 200), 'max_depth': range(30, 300), 'max_features': ['auto', 'sqrt', 'log2']},
{'bootstrap': [False], 'n_estimators': range(100, 200), 'max_depth': range(30, 300), 'max_features': ['auto', 'sqrt', 'log2']},
]
forest_class = RandomForestClassifier()
grid_random = RandomizedSearchCV(forest_class, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_random.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions=[{'max_depth': range(30, 300),
                                         'max_features': ['auto', 'sqrt',
                                                          'log2'],
                                         'n_estimators': range(100, 200)},
                                        {'bootstrap': [False],
                                         'max_depth': range(30, 300),
                                         'max_features': ['auto', 'sqrt',
                                                          'log2'],
                                         'n_estimators': range(100, 200)}],
                   return_train_score=True, scoring='neg_mean_squared_error')

In [None]:
grid_random.best_params_

{'n_estimators': 186,
 'max_features': 'sqrt',
 'max_depth': 196,
 'bootstrap': False}

In [None]:
cvres = grid_random.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.8575637286956567 {'n_estimators': 189, 'max_features': 'auto', 'max_depth': 231}
0.8424916569965112 {'n_estimators': 130, 'max_features': 'log2', 'max_depth': 276, 'bootstrap': False}
0.8401013715296373 {'n_estimators': 185, 'max_features': 'auto', 'max_depth': 45, 'bootstrap': False}
0.8633004931290037 {'n_estimators': 173, 'max_features': 'log2', 'max_depth': 126}
0.8379167198672101 {'n_estimators': 186, 'max_features': 'sqrt', 'max_depth': 196, 'bootstrap': False}
0.8622074499874272 {'n_estimators': 102, 'max_features': 'log2', 'max_depth': 90}
0.8385365772180744 {'n_estimators': 138, 'max_features': 'log2', 'max_depth': 104, 'bootstrap': False}
0.8394654614932588 {'n_estimators': 169, 'max_features': 'auto', 'max_depth': 197, 'bootstrap': False}
0.8590659061422943 {'n_estimators': 138, 'max_features': 'log2', 'max_depth': 143}
0.8612304502712474 {'n_estimators': 125, 'max_features': 'sqrt', 'max_depth': 192}


In [110]:
param_grid = [
{'bootstrap': [False], 'max_features': ['log2', 'sqrt', 'auto'], 'n_estimators': [186, 138, 169], 'max_depth': [196, 104, 197]},]
forest_class = RandomForestClassifier()
grid_search = GridSearchCV(forest_class, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'bootstrap': [False], 'max_depth': [196, 104, 197],
                          'max_features': ['log2', 'sqrt', 'auto'],
                          'n_estimators': [186, 138, 169]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [117]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.8417947886952329 {'bootstrap': False, 'max_depth': 196, 'max_features': 'log2', 'n_estimators': 186}
0.8451542631010868 {'bootstrap': False, 'max_depth': 196, 'max_features': 'log2', 'n_estimators': 138}
0.8431139481468152 {'bootstrap': False, 'max_depth': 196, 'max_features': 'log2', 'n_estimators': 169}
0.8406051841971591 {'bootstrap': False, 'max_depth': 196, 'max_features': 'sqrt', 'n_estimators': 186}
0.8414345915875927 {'bootstrap': False, 'max_depth': 196, 'max_features': 'sqrt', 'n_estimators': 138}
0.8431995782511502 {'bootstrap': False, 'max_depth': 196, 'max_features': 'sqrt', 'n_estimators': 169}
0.8413716999451758 {'bootstrap': False, 'max_depth': 196, 'max_features': 'auto', 'n_estimators': 186}
0.8440207131566922 {'bootstrap': False, 'max_depth': 196, 'max_features': 'auto', 'n_estimators': 138}
0.8426858225686871 {'bootstrap': False, 'max_depth': 196, 'max_features': 'auto', 'n_estimators': 169}
0.842006156035994 {'bootstrap': False, 'max_depth': 104, 'max_features': 

In [112]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=197, max_features='log2',
                       n_estimators=186)

error_rate

In [116]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.83      0.85      0.84      7807
           2       0.86      0.86      0.86     10226
           3       0.84      0.86      0.85      7226
           4       0.86      0.82      0.84      8045
           5       0.92      0.94      0.93     11238

    accuracy                           0.87     44542
   macro avg       0.87      0.87      0.87     44542
weighted avg       0.87      0.87      0.87     44542



# It 2

In [138]:
y2 = music['genre']
X2 = music.drop(['Unnamed: 0', 'genre', 'id'], axis=1)

In [139]:
r_tree2 = RandomForestClassifier(n_estimators=175, max_depth=185, max_features='log2', bootstrap=False)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=40)
forest_model2 = r_tree2.fit(X_train2, y_train2)
y_pred2 = forest_model2.predict(X_test2)
accuracy_score(y_test2, y_pred2)

In [20]:
fi = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': forest_model.feature_importances_}).\
                    sort_values('importance', ascending = False)
fi

Unnamed: 0,feature,importance
6,acousticness,0.144174
3,loudness,0.134417
0,danceability,0.116744
5,speechiness,0.099363
1,energy,0.098274
7,instrumentalness,0.088261
11,duration_ms,0.084672
9,valence,0.077698
8,liveness,0.057693
10,tempo,0.055101


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [72]:
param_grid = [
{'n_estimators': range(100, 200), 'max_depth': range(30, 300), 'max_features': ['auto', 'sqrt', 'log2']},
{'bootstrap': [False], 'n_estimators': range(100, 200), 'max_depth': range(30, 300), 'max_features': ['auto', 'sqrt', 'log2']},
]
forest_class = RandomForestClassifier()
grid_random = RandomizedSearchCV(forest_class, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_random.fit(X_train2, y_train2)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions=[{'max_depth': range(30, 300),
                                         'max_features': ['auto', 'sqrt',
                                                          'log2'],
                                         'n_estimators': range(100, 200)},
                                        {'bootstrap': [False],
                                         'max_depth': range(30, 300),
                                         'max_features': ['auto', 'sqrt',
                                                          'log2'],
                                         'n_estimators': range(100, 200)}],
                   return_train_score=True, scoring='neg_mean_squared_error')

In [None]:
grid_random.best_params_

{'n_estimators': 175,
 'max_features': 'log2',
 'max_depth': 124,
 'bootstrap': False}

In [None]:
cvres = grid_random.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.8715588897194868 {'n_estimators': 115, 'max_features': 'log2', 'max_depth': 31}
0.8516202980453987 {'n_estimators': 192, 'max_features': 'log2', 'max_depth': 185, 'bootstrap': False}
0.8748302305955142 {'n_estimators': 139, 'max_features': 'sqrt', 'max_depth': 194}
0.8739090203232966 {'n_estimators': 124, 'max_features': 'sqrt', 'max_depth': 256}
0.8548098826579991 {'n_estimators': 100, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': False}
0.8723277891925433 {'n_estimators': 133, 'max_features': 'auto', 'max_depth': 99}
0.8465239047372541 {'n_estimators': 175, 'max_features': 'log2', 'max_depth': 124, 'bootstrap': False}
0.8728494236745805 {'n_estimators': 177, 'max_features': 'auto', 'max_depth': 109}
0.8509150445950879 {'n_estimators': 154, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
0.8677796823327274 {'n_estimators': 148, 'max_features': 'auto', 'max_depth': 46}


In [80]:
param_grid = [
{'bootstrap': [False], 'max_features': ['log2'], 'n_estimators': [154, 175, 192], 'max_depth': [35, 124, 185]},
]
forest_class = RandomForestClassifier()
grid_search = GridSearchCV(forest_class, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train2, y_train2)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'bootstrap': [False], 'max_depth': [35, 124, 185],
                          'max_features': ['log2'],
                          'n_estimators': [154, 175, 192]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [81]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.8514440251191804 {'bootstrap': False, 'max_depth': 35, 'max_features': 'log2', 'n_estimators': 154}
0.851332366839121 {'bootstrap': False, 'max_depth': 35, 'max_features': 'log2', 'n_estimators': 175}
0.8519374837719975 {'bootstrap': False, 'max_depth': 35, 'max_features': 'log2', 'n_estimators': 192}
0.8541658958930421 {'bootstrap': False, 'max_depth': 124, 'max_features': 'log2', 'n_estimators': 154}
0.8523600740228342 {'bootstrap': False, 'max_depth': 124, 'max_features': 'log2', 'n_estimators': 175}
0.8514969359872422 {'bootstrap': False, 'max_depth': 124, 'max_features': 'log2', 'n_estimators': 192}
0.8510033330426251 {'bootstrap': False, 'max_depth': 185, 'max_features': 'log2', 'n_estimators': 154}
0.849638395527666 {'bootstrap': False, 'max_depth': 185, 'max_features': 'log2', 'n_estimators': 175}
0.8510032280562068 {'bootstrap': False, 'max_depth': 185, 'max_features': 'log2', 'n_estimators': 192}


In [27]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=100, n_estimators=130)

error_rate

In [133]:
print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           1       0.85      0.85      0.85      7884
           2       0.87      0.86      0.87     10150
           3       0.83      0.87      0.85      5553
           4       0.87      0.83      0.85      7969
           5       0.93      0.94      0.93     11279

    accuracy                           0.88     42835
   macro avg       0.87      0.87      0.87     42835
weighted avg       0.88      0.88      0.88     42835



In [146]:
genre_predict2()

Listening to QUEEN


100%|██████████| 20/20 [00:01<00:00, 17.01it/s]
100%|██████████| 3/3 [00:00<00:00, 15968.16it/s]
100%|██████████| 3/3 [00:00<00:00,  4.31it/s]
100%|██████████| 3/3 [00:00<00:00, 18423.00it/s]
  1%|▏         | 3/233 [00:00<00:10, 21.24it/s]

233 songs
Getting release years (can take a while)


100%|██████████| 233/233 [00:20<00:00, 11.40it/s]

Sounds like QUEEN plays rock music



