In [None]:
# macOS
%pip3 install lyricsgenius

# others
# pip install lyricsgenius


In [None]:
# import files
import pandas as pd
import os
import lyricsgenius as lg

In [None]:
def import_csv(file_path: str):
  """
  Load one Spotify csv file into a DataFrame.

  The first row of the file (header=0) supplies the column names.
  """
  frame = pd.read_csv(file_path, header=0)
  return frame

In [None]:
def combine_all_songs(df: pd.DataFrame):
  """
  Merge rows that refer to the same track and total their streams.

  Reads the 'uri', 'track_name', 'artist_names' and 'streams' columns of df.
  Rows are grouped by the id at the end of the uri
  ('spotify:track:XXXXXX' -> 'XXXXXX') rather than by title, because
  different songs can share a name. The track name and artist are taken from
  the first row in which a track appears.

  Returns a list of dicts with the keys 'track_name', 'artist', 'streams' and
  'uid', sorted by total streams, highest first. Tracks with equal totals keep
  their order of first appearance (the sort is stable).
  """
  top_songs = {}

  for song in df.to_dict('records'):
    # spotify:track:XXXXXX -> XXXXXX
    uid = song['uri'].split(':')[-1]
    song_detail = top_songs.get(uid)

    if song_detail is None:
      # first occurrence of this track
      top_songs[uid] = {
        "track_name": song['track_name'],
        "artist": song['artist_names'],
        "streams": song['streams'],
        "uid": uid,
      }
    else:
      # already recorded: the stored dict is updated in place
      song_detail['streams'] += song['streams']

  # highest stream count first
  return sorted(top_songs.values(), key=lambda d: d['streams'], reverse=True)

In [None]:
def find_song_lyrics(song_list: list):
  """
  Attach lyrics to every song dict in song_list.

  For each entry, get_lyrics is called with the entry's 'track_name' and
  'artist' ('' when a key is missing) and the result is stored under the
  'lyrics' key. The entries are modified in place; the returned value is a
  new list holding those same dicts in their original order.
  """
  enriched = []

  for entry in song_list:
    title = entry.get('track_name', '')
    performer = entry.get('artist', '')
    entry['lyrics'] = get_lyrics(title, performer)
    enriched.append(entry)

  return enriched


# Genius API client used by get_lyrics.
# The access token is read from the GENIUS_ACCESS_TOKEN environment variable so
# it never has to be pasted into (and saved with) the notebook. When the
# variable is not set the token is the empty string.
token = os.environ.get("GENIUS_ACCESS_TOKEN", "")
genius = lg.Genius(
  token,
  skip_non_songs=True,
  excluded_terms=["(Remix)", "(Live)", "(Original Motion Picture Soundtrack)"],
  remove_section_headers=True,
  timeout=10
)

def get_lyrics(song_title: str, song_artist: str):
  """
  Search Genius for a song and return its lyrics as a string.

  The search uses title and artist when both are non-empty, and the title
  alone when the artist is empty. Returns "" when the title is empty or when
  the search returns no song; a message is printed in both cases.
  """
  if len(song_title) == 0:
    # A title is required. Return here: no search was made, so there is no
    # result to inspect further down.
    print("Not enough information for", song_title, song_artist)
    return ""

  if len(song_artist) > 0:
    song = genius.search_song(title=song_title, artist=song_artist)
  else:
    song = genius.search_song(title=song_title)

  if song is None:
    print("[No lyrics found]:", song_title, "by", song_artist)
    return ""

  # Drop the last line of the returned text.
  # NOTE(review): the previous comment here said "remove the song title", but
  # pop() removes the LAST line, not the first - confirm which one is intended.
  lyrics_split = song.lyrics.split('\n')
  lyrics_split.pop()
  return "\n".join(lyrics_split)

In [None]:
start_year = 2017
end_year = 2021

# One pass per year, both bounds inclusive. For each year: read every csv in
# "Data/Spotify <year>", merge duplicate tracks, fetch the lyrics and write the
# result to "Data/CompileData/<year>_spotify_data_and_lyrics.csv".
for year in range(start_year, end_year + 1):
  # get path of "Spotify XXXX"
  path = os.path.join("Data", "Spotify " + str(year))

  print("Starting with", year)

  # find csv files in the folder; sorted so every run reads them in the same order
  csv_file_path = [
    os.path.join(path, f)
    for f in sorted(os.listdir(path))
    if f.endswith('.csv')
  ]

  if not csv_file_path:
    # pd.concat raises on an empty input; say what is wrong and move on
    print("No csv files found in", path)
    continue

  try:
    # read every csv of the year and stack them into a single dataframe
    df: pd.DataFrame = pd.concat(map(import_csv, csv_file_path))
    combined_songs_list = combine_all_songs(df)
    combined_songs_lyrics_list = find_song_lyrics(combined_songs_list)
    df_song_lyrics = pd.DataFrame.from_dict(combined_songs_lyrics_list)
    print("Writing to file")
    # export to csv, creating the output folder on first use
    export_path = os.path.join("Data", "CompileData", str(year) + "_spotify_data_and_lyrics.csv")
    os.makedirs(os.path.dirname(export_path), exist_ok=True)
    df_song_lyrics.to_csv(export_path)

  except Exception as err:
    # Continue with the next year, but report which year failed and why.
    # Catching Exception instead of using a bare except lets a kernel
    # interrupt (KeyboardInterrupt) stop the whole run.
    print("An error occurred while processing", year, ":", repr(err))
  


# print(combined_songs_lyrics_list)
# print(df_song_lyrics)
# print(df.to_dict('records'))