In [30]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import os
from pprint import pprint
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np

load_dotenv()

CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET=os.getenv("CLIENT_SECRET")

spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id = CLIENT_ID, client_secret = CLIENT_SECRET))

First making a simple request to the api to see the data available for a given track. As agreed, we are looking at single albums to begin with so that they are prelabeled with genre.

In [52]:
results = spotify.audio_analysis("spotify:track:11dFghVXANMlKmJXsNCbNl")

for key,item in results.items():
    print(key)
    

meta
track
bars
beats
sections
segments
tatums


In [53]:
track_info = results["track"]

for key, value in track_info.items():
    print(key)

    
   
    

num_samples
duration
sample_md5
offset_seconds
window_seconds
analysis_sample_rate
analysis_channels
end_of_fade_in
start_of_fade_out
loudness
tempo
tempo_confidence
time_signature
time_signature_confidence
key
key_confidence
mode
mode_confidence
codestring
code_version
echoprintstring
echoprint_version
synchstring
synch_version
rhythmstring
rhythm_version


In [54]:

df = pd.json_normalize(results["segments"])
print(df)


         start  duration  confidence  loudness_start  loudness_max_time  \
0      0.00000   0.24317       1.000         -59.656            0.21478   
1      0.24317   0.26240       0.975         -34.039            0.07659   
2      0.50558   0.19596       0.362         -24.576            0.07177   
3      0.70154   0.19891       0.435         -23.053            0.07305   
4      0.90045   0.12082       0.253         -18.084            0.00395   
..         ...       ...         ...             ...                ...   
761  200.91737   0.26009       0.614          -6.545            0.12574   
762  201.17746   0.18009       0.167          -3.485            0.00562   
763  201.35754   0.54050       0.464          -4.748            0.09921   
764  201.89806   1.07964       0.300          -6.688            0.05800   
765  202.97769   4.98218       0.054         -23.839            0.05905   

     loudness_max  loudness_end  \
0         -36.317           0.0   
1         -20.161           0

In this dataframe, each row is a "section" that contains data relating to a tiny section of the song.

In [58]:
row = df.loc[0].copy()
print(row["pitches"])
print(row["timbre"])

for i, row in df.iterrows():
    assert (len(row["pitches"])) == len(row["timbre"])    

[0.769, 1.0, 0.262, 0.182, 0.28, 0.316, 0.129, 0.116, 0.188, 0.213, 0.241, 0.423]
[7.406, 64.456, 37.595, -176.296, 43.329, -43.952, 71.439, -117.9, -41.103, 37.712, 20.429, -8.869]


Each pitch relates to a specific timbre. For the purposes of this experiement, let's assume that each of these relate to each "sound" that has been extracted by spotify. As there always 12 sounds in each segment. Which in itself is a fraction of a second.

Going to look at how to vectorize this data for one sample (one song)

In [33]:
data = list(zip(row["pitches"],row["timbre"]))
arr = np.array(data)
arr

array([[ 1.24000e-01,  4.20160e+01],
       [ 8.13000e-01, -1.44517e+02],
       [ 6.06000e-01, -2.31300e+00],
       [ 3.00000e-01,  2.17400e+01],
       [ 1.61000e-01,  1.72680e+01],
       [ 2.27000e-01, -6.48680e+01],
       [ 6.22000e-01, -4.45630e+01],
       [ 1.00000e+00,  7.69000e+00],
       [ 5.53000e-01, -6.84100e+00],
       [ 1.33000e-01, -1.64700e+01],
       [ 1.10000e-01, -1.23870e+01],
       [ 1.52000e-01,  5.88000e-01]])

Thinking about possibly a 3/4d array. As we do not have data for when specifically each "sound" occured in a segment, we can input the segment duration with the above array so that the model will correctly be able to give importance/weighting to each of the segments.

Something interesting to look at also is whether we can predict the audio features from this "raw" sound data. Or maybe we could even look at a .wav file and see if we can predict the spotify features such as "danceability, acousticness" etc. This would mean we would become less reliant on the spotify data as the universal source of truth.

Going to look at the possiblity of a many to one RNN model.

In [51]:
import numpy as np
data = np.array(list(zip(row["pitches"],row["timbre"])))

meta = np.array([row["duration"],row["start"]])

print(data)
print(meta)

[[ 1.24000e-01  4.20160e+01]
 [ 8.13000e-01 -1.44517e+02]
 [ 6.06000e-01 -2.31300e+00]
 [ 3.00000e-01  2.17400e+01]
 [ 1.61000e-01  1.72680e+01]
 [ 2.27000e-01 -6.48680e+01]
 [ 6.22000e-01 -4.45630e+01]
 [ 1.00000e+00  7.69000e+00]
 [ 5.53000e-01 -6.84100e+00]
 [ 1.33000e-01 -1.64700e+01]
 [ 1.10000e-01 -1.23870e+01]
 [ 1.52000e-01  5.88000e-01]]
[  0.53683 247.16985]
