## Importing the libraries

In [124]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

In [125]:
# Spotify API credentials.
# Secrets must never be stored in the notebook itself: they are read from the
# environment instead. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel; a missing variable raises a KeyError naming it.
# NOTE(review): any key pair that was ever committed with this notebook should be
# rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

# Client-credentials flow: app-level access only, no user login involved.
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.client.Spotify(auth_manager=auth_manager)

## Importing the dataset

- Reading the 1 million playlists and keeping only the unique track URIs for the content-based recommendation system.
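- The outer `for` loop reads the JSON slice files one at a time (up to `num_slices` files).
- The inner `for` loop collects the playlists in batches of 1,000 and expands each batch into one row per track; duplicate tracks are removed later, when the unique URIs are extracted.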

In [126]:
def loop_slices(path, num_slices=20, batch_size=1000):
  """Read playlist JSON slices from a folder and return one row per playlist track.

  Files in ``path`` whose name ends with ``.json`` are processed in order of
  file-name length (shortest first). Each file must contain a top-level
  ``'playlists'`` list whose items carry a ``'tracks'`` list.

  Args:
    path: Directory that holds the JSON slice files.
    num_slices: Stop after this many ``.json`` files have been read.
    batch_size: Number of playlists expanded into rows at a time. Playlists
      left over after the last full batch are expanded as well.

  Returns:
    A DataFrame with one row per playlist track and one column per track
    field. It is empty when no playlists were read. The index is the
    playlist position inside its batch, so it is not unique.
  """
  def _tracks_frame(playlists):
    # One row per (playlist, track); each track dict is spread into columns.
    # NOTE(review): a playlist with an empty track list explodes to NaN, which
    # pd.Series presumably turns into an extra column named 0 - confirm.
    exploded = pd.DataFrame(playlists).explode('tracks')
    return pd.DataFrame(exploded['tracks'].apply(pd.Series))

  slices_read = 0
  pending = []   # playlists waiting to be expanded
  frames = []    # expanded batches, concatenated once at the end
  filenames = os.listdir(path)
  for fname in tqdm(sorted(filenames, key=len)):
    if not fname.endswith(".json"):
      continue
    slices_read += 1
    # Context manager closes the file even if parsing fails.
    with open(os.path.join(path, fname)) as f:
      current_slice = json.load(f)
    for playlist in current_slice['playlists']:
      pending.append(playlist)
      if len(pending) == batch_size:
        frames.append(_tracks_frame(pending))
        pending = []
    if slices_read == num_slices:
      break

  # Flush the last partial batch; without this, trailing playlists are lost
  # whenever the total is not a multiple of batch_size.
  if pending:
    frames.append(_tracks_frame(pending))

  if not frames:
    return pd.DataFrame()
  # A single concat avoids re-copying the accumulated frame on every batch.
  return pd.concat(frames, axis=0)

# Path where the json files are located (dataset_sample folder)
path = '../dataset_sample/'

# Read up to 1000 slices into one track-level DataFrame and list its columns.
df = loop_slices(path=path, num_slices=1000)
print(df.columns)


100%|██████████| 6/6 [00:24<00:00,  4.10s/it]

Index([            0,         'pos', 'artist_name',   'track_uri',
        'artist_uri',  'track_name',   'album_uri', 'duration_ms',
        'album_name'],
      dtype='object')





In [127]:

# Persist the track table returned by loop_slices to data/1m.csv.
output_directory = 'data'
output_file = '1m.csv'

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_directory, exist_ok=True)

# Save the DataFrame to CSV in the 'data' directory
df.to_csv(os.path.join(output_directory, output_file), index=False)


In [128]:
# Reduce each Spotify URI (e.g. "spotify:track:<id>") to its trailing ID.
#
# For every URI column:
#   1. missing values become '' so the column can be handled as text,
#   2. values are cast to str,
#   3. the final run of word characters is extracted; empty strings and values
#      with no such run become NaN instead of raising.
#
# The result is assigned back to the column rather than using
# `df[col].fillna(..., inplace=True)`, which acts on a temporary object and is
# not guaranteed to modify `df`.
for uri_col in ['track_uri', 'artist_uri', 'album_uri']:
    df[uri_col] = (
        df[uri_col]
        .fillna('')
        .astype(str)
        .str.extract(r'(\w+)$', expand=False)
    )


In [129]:
# List of columns to keep
columns_to_keep = ['track_uri', 'artist_uri', 'album_uri']

# Drop columns except the specified ones
df = df[columns_to_keep]
#df.to_csv(os.path.join(output_directory, output_file), index=False)


In [130]:
# Sanity check: only the three URI columns should remain after the selection above.
df.columns

Index(['track_uri', 'artist_uri', 'album_uri'], dtype='object')

In [131]:
# Distinct track / artist IDs to look up through the Spotify API.
# Rows without a URI hold NaN after cleaning; they are dropped here because a
# NaN inside a request batch would make the whole batch fail.
t_uri = df["track_uri"].dropna().unique()
a_uri = df["artist_uri"].dropna().unique()

# Feature extraction

Using the Spotify API for Feature Extraction and Saving Results to a CSV File and Errors to a Log File

I first used `sp.track`, which fetches one track per request, but that was very slow and kept running into API rate limits. I switched to the batch endpoints instead: `sp.tracks` and `sp.artists` accept up to 50 IDs per request, and `sp.audio_features` is called with batches of 100, so each batch is handled in a single request and the extraction takes far less time.

In [132]:
# Download audio features for every unique track in batches of 100 and append
# them to data/audio_features.csv. Failed batches are logged to
# audio_features_log.txt and skipped after a short pause.
e = 0  # number of failed batches

# The `with` block closes the CSV even if the run is interrupted
# (e.g. KeyboardInterrupt), so buffered rows are flushed to disk.
with open('data/audio_features.csv', 'a') as f:
    for i in tqdm(range(0, len(t_uri), 100)):
        try:
            track_feature = sp.audio_features(t_uri[i:i + 100])
            # The response can hold None for tracks without audio features;
            # drop those so one missing track does not lose the whole batch.
            track_feature = [feat for feat in (track_feature or []) if feat]
            if not track_feature:
                continue
            track_df = pd.DataFrame(track_feature)
            # Add header only if the file is empty
            csv_data = track_df.to_csv(header=f.tell() == 0, index=False)
            f.write(csv_data)
        except Exception as error:
            e += 1
            with open("audio_features_log.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + ": " + str(error) + '\n')
            # Brief back-off before the next batch (helps with rate limiting).
            time.sleep(3)
            continue

# Summary line with the total number of failed batches.
with open("audio_features_log.txt", "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + " _________________________ " +
            "Total Number Of Errors : " + str(e) + " _________________________ " + '\n')


 67%|██████▋   | 442/663 [01:47<00:53,  4.13it/s]


KeyboardInterrupt: 

In [108]:
# Download release date and popularity for every unique track in batches of 50
# and append "<track id>,<release_date>,<popularity>" rows (no header) to
# data/track_features.csv. Failed batches are logged to track_features.txt.
e = 0  # number of failed batches

# The `with` block closes the CSV even if the run is interrupted.
with open('data/track_features.csv', 'a') as f:
    for i in tqdm(range(0, len(t_uri), 50)):
        batch = t_uri[i:i+50]
        try:
            track_features = sp.tracks(batch)
            # zip() pairs each requested ID with its result, so the final
            # short batch no longer overruns t_uri; None results (unknown
            # tracks) are skipped instead of aborting the rest of the batch.
            rows = [
                (uri, track['album']['release_date'], track["popularity"])
                for uri, track in zip(batch, track_features['tracks'])
                if track is not None
            ]
            if rows:
                # One write per batch instead of one DataFrame per row.
                csv_data = pd.DataFrame(rows).to_csv(header=False, index=False)
                f.write(csv_data)
        except Exception as error:
            e += 1
            with open("track_features.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
            # Brief back-off before the next batch (helps with rate limiting).
            time.sleep(3)
            continue

# Summary line with the total number of failed batches.
with open("track_features.txt", "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

100%|██████████| 1325/1325 [40:57<00:00,  1.85s/it]    


In [109]:
# Download popularity and genres for every unique artist in batches of 50 and
# append "<artist id>,<popularity>,<genres>" rows (no header) to
# data/artist_features.csv. Genres are joined with spaces, with spaces inside
# a genre name replaced by underscores; artists without genres get "unknown".
# Failed batches are logged to artist_features.txt.
e = 0  # number of failed batches

# The `with` block closes the CSV even if the run is interrupted.
with open('data/artist_features.csv', 'a') as f:
    for i in tqdm(range(0, len(a_uri), 50)):
        batch = a_uri[i:i+50]
        try:
            artist_features = sp.artists(batch)
            rows = []
            # zip() pairs each requested ID with its result, so the final
            # short batch no longer overruns a_uri.
            for uri, artist in zip(batch, artist_features['artists']):
                if artist is None:
                    # Unknown artist: skip it rather than lose the whole batch.
                    continue
                artist_genres = artist["genres"]
                if artist_genres:
                    genres = " ".join(genre.replace(' ', '_') for genre in artist_genres)
                else:
                    genres = "unknown"
                rows.append((uri, artist["popularity"], genres))
            if rows:
                # One write per batch instead of one DataFrame per row.
                csv_data = pd.DataFrame(rows).to_csv(header=False, index=False)
                f.write(csv_data)
        except Exception as error:
            e += 1
            with open("artist_features.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
            # Brief back-off before the next batch (helps with rate limiting).
            time.sleep(3)
            continue

# Summary line with the total number of failed batches.
with open("artist_features.txt", "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

100%|██████████| 282/282 [01:55<00:00,  2.45it/s]


In [114]:
# Load the audio features collected above and display them.
# NOTE(review): in the saved output the column labels look like data values
# (e.g. 0.634, 0.862, ...), which suggests the CSV on disk has no header row and
# its first record was consumed as the header - confirm, and if so regenerate
# the file or pass explicit column names.
df_aud = pd.read_csv('data/audio_features.csv')
df_aud

Unnamed: 0,0.634,0.862,2,-5.337,1,0.0376,0.0287,0.0,0.0217,0.767,95.989,audio_features,1zWZvrk13cL8Sl3VLeG57F,spotify:track:1zWZvrk13cL8Sl3VLeG57F,https://api.spotify.com/v1/tracks/1zWZvrk13cL8Sl3VLeG57F,https://api.spotify.com/v1/audio-analysis/1zWZvrk13cL8Sl3VLeG57F,227693,4
0,0.595,0.389,2,-9.245,1,0.0349,0.418000,0.000001,0.1160,0.400,166.052,audio_features,5kNe7PE09d6Kvw5pAsx23n,spotify:track:5kNe7PE09d6Kvw5pAsx23n,https://api.spotify.com/v1/tracks/5kNe7PE09d6K...,https://api.spotify.com/v1/audio-analysis/5kNe...,227427,4
1,0.543,0.837,11,-6.448,1,0.0608,0.117000,0.000000,0.0872,0.506,168.056,audio_features,6GsAD8PgHxmEuIPTG8GP3M,spotify:track:6GsAD8PgHxmEuIPTG8GP3M,https://api.spotify.com/v1/tracks/6GsAD8PgHxmE...,https://api.spotify.com/v1/audio-analysis/6GsA...,186080,4
2,0.688,0.702,7,-4.792,0,0.0499,0.021500,0.000000,0.1280,0.740,94.006,audio_features,6Knv6wdA0luoMUuuoYi2i1,spotify:track:6Knv6wdA0luoMUuuoYi2i1,https://api.spotify.com/v1/tracks/6Knv6wdA0luo...,https://api.spotify.com/v1/audio-analysis/6Knv...,192191,4
3,0.457,0.948,10,-3.364,1,0.0354,0.019100,0.000000,0.0536,0.878,148.000,audio_features,0i5el041vd6nxrGEU8QRxy,spotify:track:0i5el041vd6nxrGEU8QRxy,https://api.spotify.com/v1/tracks/0i5el041vd6n...,https://api.spotify.com/v1/audio-analysis/0i5e...,208960,4
4,0.564,0.714,7,-4.987,1,0.1290,0.054900,0.000000,0.1550,0.330,175.924,audio_features,37f4ITSlgPX81ad2EvmVQr,spotify:track:37f4ITSlgPX81ad2EvmVQr,https://api.spotify.com/v1/tracks/37f4ITSlgPX8...,https://api.spotify.com/v1/audio-analysis/37f4...,204013,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131382,0.916,0.742,7,-12.440,1,0.1880,0.000288,0.004980,0.0986,0.545,127.809,audio_features,2PBTwMH2mzfLigdMyPzOcp,spotify:track:2PBTwMH2mzfLigdMyPzOcp,https://api.spotify.com/v1/tracks/2PBTwMH2mzfL...,https://api.spotify.com/v1/audio-analysis/2PBT...,257640,4
131383,0.667,0.916,6,-3.264,1,0.0743,0.165000,0.000011,0.1620,0.558,100.019,audio_features,5JugcqxQihVYdvCSPzmP1H,spotify:track:5JugcqxQihVYdvCSPzmP1H,https://api.spotify.com/v1/tracks/5JugcqxQihVY...,https://api.spotify.com/v1/audio-analysis/5Jug...,183559,4
131384,0.775,0.602,8,-10.732,0,0.1390,0.000280,0.878000,0.1740,0.552,136.965,audio_features,1cKRBp7hrBVD4eP3W9x2AI,spotify:track:1cKRBp7hrBVD4eP3W9x2AI,https://api.spotify.com/v1/tracks/1cKRBp7hrBVD...,https://api.spotify.com/v1/audio-analysis/1cKR...,412427,4
131385,0.741,0.841,2,-6.398,0,0.2160,0.070200,0.000000,0.1460,0.730,94.846,audio_features,38griAVM808crjbFp9gcPD,spotify:track:38griAVM808crjbFp9gcPD,https://api.spotify.com/v1/tracks/38griAVM808c...,https://api.spotify.com/v1/audio-analysis/38gr...,339573,4
