In [None]:
# import libraries
import numpy as np
import pandas as pd
import os

# %pip install spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


In [None]:
# manage spotify credentials
# Secrets must not be pasted into the notebook (they end up in version control and
# shared copies). Read them from the environment instead; these are the variable
# names spotipy itself documents for the client-credentials flow.
cid = os.environ.get("SPOTIPY_CLIENT_ID", "")
secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "")
if not cid or not secret:
  # fail here with an actionable message rather than deep inside the first API call
  raise RuntimeError(
    "Spotify credentials not found! "
    "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables"
  )

# module-level client used by the cells below to query the Spotify Web API
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
def import_csv(file_path: str):
  """
  Import a spotify csv file with customised settings.

  The first row of the file is used as the column header and the first
  column is used as the DataFrame index.

  Parameters:
    file_path: path of the csv file to read.

  Returns:
    The pandas DataFrame read from `file_path`.
  """
  read_settings = {"header": 0, "index_col": 0}
  frame = pd.read_csv(file_path, **read_settings)
  return frame

# check that the CompileData folder (input datasets) exists
import_folder_path = os.path.join("..", "datasets", "CompileData")
# isdir rather than exists: a regular file with this name is not a usable dataset folder
if not os.path.isdir(import_folder_path):
  raise Exception("Dataset CompileData not found! Run scraper.ipynb to get the datasets")

# export folder path
spotify_eda_folder_path = os.path.join("..", "datasets", "SpotifyAudioFeatures")
# exist_ok=True creates the folder only when it is missing, with no separate
# check-then-create step, so re-running the cell is safe
os.makedirs(spotify_eda_folder_path, exist_ok=True)


In [None]:
def get_spotify_data(uid_list: list[str], client=None, chunk_size: int = 50):
  """
  Fetch the Spotify audio features for a list of track ids.

  The ids are sent in batches of `chunk_size` and the per-batch results are
  concatenated in request order. Tracks for which the API returns no audio
  features (a None entry in the response) are left out of the result, so the
  returned list can be shorter than `uid_list`.

  Parameters:
    uid_list: Spotify track ids to look up.
    client: object exposing `audio_features(list_of_ids)`; defaults to the
      module-level `sp` client when omitted.
    chunk_size: number of ids per request (1-100; the audio-features endpoint
      accepts at most 100 ids per call).

  Returns:
    A list of audio-feature dicts, one per track that has features.

  Raises:
    ValueError: if `chunk_size` is outside 1-100.
  """
  if not 1 <= chunk_size <= 100:
    raise ValueError("chunk_size must be between 1 and 100")
  if client is None:
    # fall back to the notebook-wide client created in the credentials cell
    client = sp

  combined_info_list: list[dict] = []

  for i in range(0, len(uid_list), chunk_size):
    split_list = uid_list[i:i+chunk_size]
    # a whole batch can come back as None; treat that as "no features"
    info_list = client.audio_features(split_list) or []
    # drop the None placeholders returned for tracks without audio features,
    # otherwise building a DataFrame from the result fails downstream
    combined_info_list.extend(info for info in info_list if info is not None)

  return combined_info_list

In [None]:
# get 2017-2021 csv files and get the danceability scores
# For every year: read the compiled csv, fetch the Spotify audio features for its
# tracks, join them onto the original rows and write one csv per year.
start_year = 2017
end_year = 2021

for year in range(start_year, end_year + 1):
  print("Staring with", year)
  path = os.path.join(
    import_folder_path,
    str(year) + "_spotify_data_and_lyrics.csv"
  )
  # same read settings as every other csv in this notebook
  df: pd.DataFrame = import_csv(path)
  # drop lyrics to save memory, we probably don't need it
  df = df.drop(columns=["lyrics"])

  # request each track only once and skip rows without a uid
  uid_list: list[str] = df["uid"].dropna().drop_duplicates().tolist()
  # the API answers None for tracks without audio features; keep only real entries
  spotify_data_list = [info for info in get_spotify_data(uid_list) if info is not None]
  df_spotify_data = pd.DataFrame(spotify_data_list)

  if "id" not in df_spotify_data.columns:
    # nothing came back (empty input or no features at all): the merge below
    # would fail on the missing "id" column, so skip this year explicitly
    print("No audio features returned for", year, "- skipping export")
    continue

  # one feature row per track id, so the merge cannot multiply rows
  df_spotify_data = df_spotify_data.drop_duplicates(subset="id")

  df_combined = pd.merge(
    df,
    df_spotify_data,
    left_on=["uid"],
    right_on=["id"],
    validate="many_to_one"  # raises if the feature table still has duplicate ids
  )
  df_combined = df_combined.drop(columns=["id", "uri", "track_href", "analysis_url"])

  # export to csv (utf-8-sig writes a BOM - presumably so Excel detects the encoding)
  df_combined.to_csv(
    os.path.join(spotify_eda_folder_path, str(year) + "_spotify_audio_features.csv"),
    encoding="utf-8-sig"
  )

print("End of Program")