In [5]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [12]:
# Read the track-level feature table from the notebook's working directory.
sp_df = pd.read_csv(filepath_or_buffer='SpotifyFeatures.csv')

In [13]:
# Preview the first five rows of the loaded frame.
sp_df.head(n=5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [14]:
# Show the dtype pandas inferred for each column (text columns come back as object).
sp_df.dtypes

genre                object
artist_name          object
track_name           object
track_id             object
popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
dtype: object

In [41]:
# Print per-column non-null counts and dtypes to check for missing values.
sp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232725 non-null  object 
 1   artist_name       232725 non-null  object 
 2   track_name        232725 non-null  object 
 3   track_id          232725 non-null  object 
 4   popularity        232725 non-null  int64  
 5   acousticness      232725 non-null  float64
 6   danceability      232725 non-null  float64
 7   duration_ms       232725 non-null  int64  
 8   energy            232725 non-null  float64
 9   instrumentalness  232725 non-null  float64
 10  key               232725 non-null  object 
 11  liveness          232725 non-null  float64
 12  loudness          232725 non-null  float64
 13  mode              232725 non-null  object 
 14  speechiness       232725 non-null  float64
 15  tempo             232725 non-null  float64
 16  time_signature    23

In [16]:
# (row count, column count) of the loaded frame.
sp_df.shape

(232725, 18)

In [18]:
# Peek at the genre value stored in the first row.
sp_df['genre'].iloc[0]

'Movie'

In [19]:
# Peek at the artist name stored in the first row.
sp_df['artist_name'].iloc[0]

'Henri Salvador'

In [59]:
# Select every row for one artist; the same track_id can appear once per genre.
sp_df.loc[sp_df['artist_name'].eq('Harry Styles')]

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
13728,Dance,Harry Styles,Sign of the Times,5Ohxk2dO5COHF1krpoPigN,79,0.0275,0.516,340707,0.595,0.0,F,0.109,-4.63,Major,0.0313,119.972,4/4,0.222
14004,Dance,Harry Styles,Kiwi,33SNO8AaciGbNaQFkxvPrW,72,0.00064,0.375,176387,0.93,0.0,D,0.318,-2.631,Major,0.0561,147.124,4/4,0.491
14214,Dance,Harry Styles,Two Ghosts,4B1rpPmQXwj78wk6aIGwwU,72,0.234,0.386,229813,0.407,2e-06,F#,0.0999,-7.095,Minor,0.0268,69.387,4/4,0.456
14563,Dance,Harry Styles,Woman,33bURv895AN4FkBvgFo2dx,66,0.0169,0.624,278800,0.647,0.0038,D#,0.221,-4.68,Minor,0.0335,136.022,4/4,0.379
14622,Dance,Harry Styles,Ever Since New York,5LABCxgmP7DATATIJXOh6n,66,0.2,0.389,253387,0.537,3e-06,A#,0.125,-6.761,Major,0.0309,127.964,5/4,0.383
14687,Dance,Harry Styles,Carolina,3Dwq1oiOFys6Sxr5SslmXw,66,0.000569,0.624,189613,0.673,2e-06,E,0.104,-4.017,Minor,0.0268,121.986,4/4,0.458
14883,Dance,Harry Styles,Only Angel,5Lbsc65org0b85kNsPkluY,65,0.287,0.554,291080,0.842,0.0,F,0.106,-4.113,Major,0.0397,114.036,4/4,0.348
17726,Dance,Harry Styles,"Two Ghosts - Recorded at Metropolis Studios, L...",1GDF5AusV7p0wwi4OVNz3i,57,0.475,0.535,224256,0.231,0.0,F#,0.102,-10.827,Minor,0.0311,133.625,4/4,0.291
108591,Pop,Harry Styles,Sign of the Times,5Ohxk2dO5COHF1krpoPigN,79,0.0275,0.516,340707,0.595,0.0,F,0.109,-4.63,Major,0.0313,119.972,4/4,0.222
108948,Pop,Harry Styles,"Girl Crush - Recorded at Metropolis Studios, L...",4YutJsNwBBInE8vemufpQ8,74,0.658,0.468,243805,0.197,0.0,E,0.109,-11.727,Major,0.033,144.212,3/4,0.249


In [36]:
# Count distinct values per column across all rows sharing this track_id,
# to see which columns differ between the duplicated rows.
sp_df.loc[sp_df['track_id'].eq('5Q0Nhxo0l2bP3pNjpGJwV1')].nunique()

genre               2
artist_name         1
track_name          1
track_id            1
popularity          1
acousticness        1
danceability        1
duration_ms         1
energy              1
instrumentalness    1
key                 1
liveness            1
loudness            1
mode                1
speechiness         1
tempo               1
time_signature      1
valence             1
dtype: int64

In [54]:
# Collapse the per-genre rows into one row per track_id whose 'genre' cell
# holds the list of all genres that track was listed under.
artists_genres_consolidated = (
    sp_df
    .groupby('track_id')['genre']
    .agg(list)
    .reset_index()
)

In [61]:
# Expose the list-valued genres under a second name, 'genre_lists', so the
# column does not collide with sp_df's scalar 'genre' column when joined.
artists_genres_consolidated = artists_genres_consolidated.assign(
    genre_lists=artists_genres_consolidated['genre']
)

In [62]:
# Display the one-row-per-track table (pandas truncates the middle rows when rendering).
artists_genres_consolidated

Unnamed: 0,track_id,genre,genre_lists
0,00021Wy6AyMbLP2tqij86e,[Anime],[Anime]
1,000CzNKC8PEt1yC3L8dqwV,[Movie],[Movie]
2,000DfZJww8KiixTKuk9usJ,[Reggae],[Reggae]
3,000EWWBkYaREzsBplYjUag,[Jazz],[Jazz]
4,000xQL6tZNLJzIrtIgxqSl,"[Dance, Pop]","[Dance, Pop]"
...,...,...,...
176769,7zz7MbCb9G7KJc1NVl9bL0,[Jazz],[Jazz]
176770,7zzFNNxVD0h0ctAT08H0pa,[Jazz],[Jazz]
176771,7zzTeItz93lYI52hlcipm5,[Reggaeton],[Reggaeton]
176772,7zzZmpw8L66ZPjH1M6qmOs,"[Children’s Music, Indie]","[Children’s Music, Indie]"


In [64]:
# Attach each track's full genre list to every row of sp_df.
# Any existing 'genre_lists' column is dropped first so that re-running this cell
# does not produce 'genre_lists_x' / 'genre_lists_y' duplicates.
# validate='many_to_one' makes pandas raise if the right-hand table ever has a
# repeated track_id, which would otherwise silently multiply rows.
sp_df = (
    sp_df
    .drop(columns=['genre_lists'], errors='ignore')
    .merge(
        artists_genres_consolidated[['track_id', 'genre_lists']],
        on='track_id',
        how='left',
        validate='many_to_one',
    )
)

In [77]:
# Remove the scalar 'genre' column now that 'genre_lists' carries the same information.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell safe to
# re-run: the original raised KeyError on a second execution.
# NOTE(review): tracks listed under several genres still occupy one row per genre
# here — confirm whether a later step de-duplicates on 'track_id'.
sp_df = sp_df.drop(columns=['genre'], errors='ignore')