In [None]:
# import libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import  FuncFormatter

# %pip install spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


In [None]:
# manage spotify credentials
# The client id/secret are read from the environment so that real secrets never
# have to be typed into (and saved with) the notebook. Export SPOTIPY_CLIENT_ID
# and SPOTIPY_CLIENT_SECRET before starting the kernel; both fall back to an
# empty string when unset, which is what this cell used to hard-code.
cid = os.environ.get("SPOTIPY_CLIENT_ID", "")
secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "")

# `sp` is the shared Spotify client used by the cells below.
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
"""
import spotify csv files with customised settings
"""
def import_csv(file_path: str):
  return pd.read_csv(file_path, header=0, index_col=0)

# check if CompileData folder exist
# This folder holds the notebook's input files, so stop early with a clear
# message when it is missing (or is not a directory).
import_folder_path = os.path.join("..", "datasets", "CompileData")
if not os.path.isdir(import_folder_path):
  raise FileNotFoundError("Dataset CompileData not found! Run scraper.ipynb to get the datasets")

# export folder path
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
spotify_eda_folder_path = os.path.join("..", "datasets", "SpotifyAudioFeatures")
os.makedirs(spotify_eda_folder_path, exist_ok=True)

# export image path
image_export_folder_path = os.path.join("..", "images", "EDA")
os.makedirs(image_export_folder_path, exist_ok=True)

## Get the audio features of the songs from Spotify

In [None]:
def get_spotify_data(uid_list: list[str], chunk_size: int = 50):
  """
  Fetch the Spotify audio features for every track id in ``uid_list``.

  The ids are sent to ``sp.audio_features`` in slices of ``chunk_size`` to
  reduce the number of calls to spotify.

  Parameters:
    uid_list: Spotify track ids to look up.
    chunk_size: number of ids sent per request. Spotify accepts at most 100
      ids per call; the default of 50 stays well below that limit.

  Returns:
    A list with one audio-feature dict per track that Spotify could resolve,
    in request order. Tracks Spotify has no features for are left out, so the
    result can be shorter than ``uid_list`` - match rows by their "id" key
    rather than by position.

  Raises:
    ValueError: if ``chunk_size`` is not between 1 and 100.
  """
  if not 1 <= chunk_size <= 100:
    raise ValueError("chunk_size must be between 1 and 100")

  combined_info_list: list[dict] = []

  for i in range(0, len(uid_list), chunk_size):
    split_list = uid_list[i:i+chunk_size]
    info_list = sp.audio_features(split_list)
    # Spotify answers with None in place of any track it cannot resolve; those
    # placeholders would break the DataFrame built from this list later on.
    combined_info_list.extend(
      info for info in (info_list or []) if info is not None
    )

  return combined_info_list

In [None]:
# For every year from 2017 to 2021: read the compiled csv file, fetch the
# spotify audio features of its songs and export the combined table as
# "<year>_spotify_audio_features.csv".
start_year = 2017
end_year = 2021

# A for loop leaves start_year untouched, so the cell behaves the same on re-run.
for year in range(start_year, end_year + 1):
  print("Staring with", year)
  path = os.path.join(
    import_folder_path,
    str(year) + "_spotify_data_and_lyrics.csv"
  )
  df: pd.DataFrame = import_csv(path)
  # drop lyrics to save memory, we probably don't need it
  df = df.drop(columns=["lyrics"])

  # Ask Spotify about each track only once: skip missing ids and repeats.
  uid_list: list[str] = df["uid"].dropna().drop_duplicates().tolist()
  # Tracks Spotify could not resolve come back as None; leave them out so the
  # DataFrame constructor only ever sees dicts.
  spotify_data_list = [
    info for info in get_spotify_data(uid_list) if info is not None
  ]
  # One feature row per id, otherwise the merge below would multiply the rows
  # of any song that appears more than once.
  df_spotify_data = pd.DataFrame(spotify_data_list).drop_duplicates(subset="id")

  # Inner join: songs without audio features are not exported.
  df_combined = pd.merge(df, df_spotify_data, left_on=["uid"], right_on=["id"])
  df_combined = df_combined.drop(columns=["id", "uri", "track_href", "analysis_url"])

  # export to csv (utf-8-sig so that Excel detects the encoding)
  df_combined.to_csv(
    os.path.join(spotify_eda_folder_path, str(year) + "_spotify_audio_features.csv"),
    encoding="utf-8-sig"
  )


In [None]:
# Read back the per-year csv files created above and tag every row of each
# frame with the year it belongs to.
songs2017, songs2018, songs2019, songs2020, songs2021 = (
  import_csv(
    os.path.join(spotify_eda_folder_path, f"{chart_year}_spotify_audio_features.csv")
  ).assign(year=chart_year)
  for chart_year in range(2017, 2022)
)


## Compare Audio Features between years

In [None]:
# EDA comparison between years:
# stack the five yearly frames into a single df, then preview its first rows
df_songs_overall = pd.concat(
  (songs2017, songs2018, songs2019, songs2020, songs2021),
  axis=0,
)
df_songs_overall.head()


In [None]:
# Let's plot the total streams per year as a bar chart.
# The yearly totals are collected as plain records and turned into a DataFrame
# once: DataFrame.append no longer exists in pandas 2.x, and growing a frame
# row by row copies it on every step.
yearly_rows = []
for year in range(2017, 2021 + 1):
  yearly_rows.append({
    "year": year,
    # a year without any rows sums to 0 and still gets its own bar
    "total_streams": df_songs_overall.loc[df_songs_overall["year"] == year, "streams"].sum(),
  })

df_songs_yearly_streams = (
  pd.DataFrame(yearly_rows)
  .astype({"year": "int", "total_streams": "int"})
  .set_index("year")
)

ax = df_songs_yearly_streams.plot(title="Total Number of Streams per Year", figsize=(16, 9), kind="bar", legend=False)
ax.set_ylabel("Total Streams", fontsize=12)
ax.set_xlabel("Year", fontsize=12)
# write the exact total on top of each bar
ax.bar_label(ax.containers[0], fmt='%d')

# remove scientific notation: show full numbers with thousands separators
ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))

# keep the year labels horizontal
plt.xticks(rotation=0)
plt.draw()
# a white facecolor keeps the saved image readable on dark backgrounds
plt.savefig(os.path.join(image_export_folder_path, "yearly_streams"), facecolor="white")

In [None]:
# Print a closing marker; this is the last statement of the notebook as shown here.
print("End of Program")