In [116]:
import pandas as pd
from tqdm.notebook import tqdm
import spotipy # install if needed
from spotipy.oauth2 import SpotifyClientCredentials
import getpass
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


In [3]:
client_id = str(getpass.getpass('client_id?'))
client_sectret = str(getpass.getpass('client_secret?'))

In [4]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=client_id, client_secret=client_sectret))

In [140]:
jazz = pd.read_csv('data/jazz_songs.csv')

In [141]:
rock = pd.read_csv('data/rock_clean.csv')

In [142]:
hiphop = pd.read_csv('data/hiphop_songs.csv')

In [143]:
pop = pd.read_csv('data/pop_songs.csv')

In [144]:
rock.drop(['Unnamed: 0.1'], inplace=True, axis=1)

In [145]:
jazz['genre'] = 1

In [146]:
jazz.head()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,genre
0,0,0.523,0.279,4,-21.715,0,0.0466,0.936,0.921,0.974,0.594,97.101,04yW9pKk983TGpcHmn8KQP,330667,4,1
1,1,0.335,0.154,1,-25.574,1,0.0499,0.984,0.914,0.856,0.113,184.044,5vXJmlnBBQwl9RE6Vg75fU,299533,3,1
2,2,0.424,0.184,5,-23.582,0,0.0395,0.963,0.891,0.683,0.246,79.817,3WuNyS0A9Qlb9UXxM51VkU,377560,3,1
3,3,0.416,0.424,2,-17.458,1,0.0459,0.98,0.939,0.633,0.385,108.886,66j2M4V2NQsyWmR0OgnA5v,284667,4,1
4,4,0.298,0.111,7,-24.838,1,0.0676,0.995,0.944,0.663,0.0398,69.56,3xUifJSdMNedjkIpOoNsMC,819560,4,1


In [147]:
rock['genre'] = 2

In [148]:
hiphop['genre'] = 3

In [149]:
pop['genre'] = 4

In [150]:
test_df = pd.concat([jazz, rock, hiphop, pop], axis=0, ignore_index=True)

In [151]:
test_df.tail()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,genre
83803,28828,0.505,0.68,7,-4.99,1,0.0713,0.0796,0.0,0.0465,0.539,168.021,3MaQvpdNb4Vv7J7K9jYw1t,235653,3,4
83804,28829,0.371,0.329,0,-9.667,0,0.0574,0.131,0.00625,0.279,0.0539,173.661,6MkumkB800IlwJXGOln4Th,340493,4,4
83805,28830,0.501,0.403,9,-9.607,0,0.0752,0.114,1.1e-05,0.124,0.27,132.608,41oeGHsIkl8UfgwEbyRuP8,260640,3,4
83806,28831,0.55,0.404,0,-12.755,0,0.0398,0.441,0.00022,0.11,0.329,135.016,3dkeFXyOlKg0THR22hxvsv,274960,4,4
83807,28832,0.626,0.583,10,-7.574,0,0.0435,0.105,0.0,0.115,0.335,127.956,3Df2uB1b5XDbJAiu2xOHvH,377347,4,4


In [152]:
# Summarise the combined frame: row count, column dtypes and non-null counts.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83808 entries, 0 to 83807
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        83808 non-null  int64  
 1   danceability      83808 non-null  float64
 2   energy            83808 non-null  float64
 3   key               83808 non-null  int64  
 4   loudness          83808 non-null  float64
 5   mode              83808 non-null  int64  
 6   speechiness       83808 non-null  float64
 7   acousticness      83808 non-null  float64
 8   instrumentalness  83808 non-null  float64
 9   liveness          83808 non-null  float64
 10  valence           83808 non-null  float64
 11  tempo             83808 non-null  float64
 12  id                83808 non-null  object 
 13  duration_ms       83808 non-null  int64  
 14  time_signature    83808 non-null  int64  
 15  genre             83808 non-null  int64  
dtypes: float64(9), int64(6), object(1)
memor

In [153]:
y = test_df['genre']
X = test_df.drop(['genre', 'id', 'Unnamed: 0'], axis=1)

In [154]:
X.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.523,0.279,4,-21.715,0,0.0466,0.936,0.921,0.974,0.594,97.101,330667,4
1,0.335,0.154,1,-25.574,1,0.0499,0.984,0.914,0.856,0.113,184.044,299533,3
2,0.424,0.184,5,-23.582,0,0.0395,0.963,0.891,0.683,0.246,79.817,377560,3
3,0.416,0.424,2,-17.458,1,0.0459,0.98,0.939,0.633,0.385,108.886,284667,4
4,0.298,0.111,7,-24.838,1,0.0676,0.995,0.944,0.663,0.0398,69.56,819560,4


In [155]:
r_tree = RandomForestClassifier()

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=40)


In [157]:
forest_model = r_tree.fit(X_train,y_train)

In [158]:
y_pred = forest_model.predict(X_test)

In [159]:
accuracy_score(y_test, y_pred)

0.8417849898580122

In [160]:
from sklearn.metrics import classification_report

In [161]:
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.87      0.92      0.89      6521
           2       0.85      0.81      0.83      7312
           3       0.76      0.69      0.73      2639
           4       0.84      0.85      0.85      8671

    accuracy                           0.84     25143
   macro avg       0.83      0.82      0.82     25143
weighted avg       0.84      0.84      0.84     25143

