In [None]:
# macOS
%pip3 install lyricsgenius

# others
# pip install lyricsgenius


In [None]:
# import files
import pandas as pd
import os
import lyricsgenius as lg
import re

In [None]:
"""
import spotify csv files with customised settings
"""
def import_csv(file_path: str):
  """
  Read one CSV file into a pandas DataFrame.

  The first row of the file is taken as the column header.

  Args:
    file_path: location of the CSV file to read.

  Returns:
    The parsed pandas DataFrame.
  """
  frame = pd.read_csv(file_path, header=0)
  return frame

In [None]:
"""
combine all songs
- calculate sum of streams
"""
def combine_all_songs(df: pd.DataFrame):
  """
  Merge rows that refer to the same track and total their streams.

  Rows are keyed by the last ':'-separated segment of the 'uri' column
  (spotify:track:XXXXXX -> XXXXXX). The track name and artist are taken from
  the first row seen for a track; 'streams' is accumulated over all its rows.

  Args:
    df: frame with 'uri', 'track_name', 'artist_names' and 'streams' columns.

  Returns:
    A list of dicts with keys 'track_name', 'artist', 'streams' and 'uid',
    ordered from most to fewest total streams.
  """
  totals_by_uid = {}

  for row in df.to_dict('records'):
    # the trailing uri segment is the unique track id; names alone could collide
    track_id = row['uri'].split(':')[-1]
    entry = totals_by_uid.get(track_id)

    if entry is None:
      # first time this track shows up: start its record
      totals_by_uid[track_id] = {
        "track_name": row['track_name'],
        "artist": row['artist_names'],
        "streams": row['streams'],
        "uid": track_id,
      }
      continue

    # track already recorded: add this row's streams to the running total
    entry['streams'] = entry['streams'] + row['streams']

  ranked = list(totals_by_uid.values())
  ranked.sort(key=lambda item: item['streams'], reverse=True)
  return ranked

In [None]:
"""
iterate and get songs from dict
"""
def find_song_lyrics(song_list: list, limit: int = 30):
  """
  Look up lyrics for the leading songs of `song_list`.

  Every processed song dict gets a 'lyrics' key (the dict is updated in place)
  holding the text returned by get_lyrics, or "" when the lookup raised.
  Failed lookups are printed and appended to the notebook-level
  `list_of_songs_with_error` list, tagged with the notebook-level `start_year`.

  Args:
    song_list: song dicts; 'track_name' and 'artist' are read with '' as fallback.
    limit: how many songs from the front of `song_list` to process.
      Defaults to 30 (the previously hard-coded cap); pass None to process all.

  Returns:
    A list with the processed song dicts, in their original order.
  """
  new_song_list = []

  for song in song_list[:limit]:
    track_name = song.get('track_name', '')
    artist = song.get('artist', '')
    lyrics = ""

    try:
      lyrics = get_lyrics(track_name, artist)

    except Exception as error:
      # Log and keep going: one missing song must not abort the whole batch.
      # f-string formatting keeps the handler itself from raising when
      # track_name is not a string (e.g. NaN read from an empty CSV cell).
      print(f'[{track_name}]:', error)
      list_of_songs_with_error.append({
        'track_name': track_name,
        'artist': artist,
        'year': start_year
      })

    song['lyrics'] = lyrics
    new_song_list.append(song)

  return new_song_list

In [None]:
# Shared Genius API client used by get_lyrics.
# The access token is read from the GENIUS_ACCESS_TOKEN environment variable so the
# secret never has to be saved inside the notebook; when the variable is not set the
# value falls back to "" (the previous hard-coded placeholder).
token = os.environ.get("GENIUS_ACCESS_TOKEN", "")
genius = lg.Genius(
  token,
  skip_non_songs=True,
  excluded_terms=["(Remix)", "(Live)", "(Original Motion Picture Soundtrack)"],
  remove_section_headers=True,
  timeout=10,
  sleep_time=0.5,
  retries=3
)

# Matches one innermost "(...)" group in a track title, e.g. "(feat. X)".
remove_paranthesis_pattern = r'\([^()]*\)'
# Matches the "<count>Embed" (or bare "Embed") footer at the very end of the text.
# Anchored and literal: the previous character-class form `[Embed]+` also deleted
# ordinary text such as "4me" and never matched a footer without a number.
remove_embed_number_pattern = r'\d*Embed\s*$'


def get_lyrics(song_title: str, song_artist: str):
  """
  Fetch the lyrics of a song from Genius and strip the page boilerplate.

  The title is searched without its parenthesised parts and only the first
  comma-separated artist is used. The first line of the returned text must
  contain "<title> Lyrics", otherwise the hit is rejected; that header line
  and the trailing "<count>Embed" footer are removed from the result.

  Args:
    song_title: track title, possibly with "(...)" suffixes.
    song_artist: artist name(s), comma separated; may be empty.

  Returns:
    The cleaned lyrics as a single newline-separated string.

  Raises:
    Exception: when no usable title is left after trimming, when Genius
      returns no song, or when the result does not look like lyrics for the
      requested title.
  """
  trimmed_song_title = re.sub(remove_paranthesis_pattern, "", song_title).strip()
  first_artist = song_artist.split(',')[0].strip()

  # Check the title that is actually searched for. An empty one would make the
  # "<title> Lyrics" header check below pass for any song Genius returns.
  if not trimmed_song_title:
    raise Exception(
      f"Not enough information for title={song_title!r}, artist={song_artist!r}"
    )

  if first_artist:
    song = genius.search_song(title=trimmed_song_title, artist=first_artist)
  else:
    song = genius.search_song(title=trimmed_song_title)

  if song is None:
    raise Exception("No lyrics found with Genius!")

  lyrics_split = song.lyrics.split('\n')

  # the first line is expected to be the "XXXX Lyrics" header of the right song
  if (trimmed_song_title + " Lyrics").lower() not in lyrics_split[0].lower():
    raise Exception("No lyrics found with Genius!")

  # drop the header line, then clean the "[Number]Embed" footer off the last line
  body_lines = lyrics_split[1:]
  if not body_lines:
    return ""

  body_lines[-1] = re.sub(remove_embed_number_pattern, "", body_lines[-1]).strip()
  return '\n'.join(body_lines)


In [None]:
# Range of chart years to process (inclusive). Each year is read from "Data/Spotify <year>".
start_year = 2017
end_year = 2017
# filled by find_song_lyrics with the songs whose lyrics lookup failed
list_of_songs_with_error = []

# Create the export folder up front so a missing directory cannot make the
# run fail only after all the (slow) lyrics requests are done.
export_dir = os.path.join("Data", "CompileData")
os.makedirs(export_dir, exist_ok=True)

# start_year is advanced in place: find_song_lyrics reads it to tag failed lookups
while start_year <= end_year:
  print("Starting with", start_year)

  # get path of "Spotify XXXX"
  path = os.path.join("Data", "Spotify " + str(start_year))

  # find csv files in the folder; sorted because os.listdir returns entries in
  # arbitrary order, which would make the merge order differ between runs
  csv_file_path = [
    os.path.join(path, f)
    for f in sorted(os.listdir(path))
    if f.endswith('.csv')
  ]

  # load every csv found above and stack them into a single dataframe
  df: pd.DataFrame = pd.concat(map(import_csv, csv_file_path))
  combined_songs_list = combine_all_songs(df)
  combined_songs_lyrics_list = find_song_lyrics(combined_songs_list)
  df_song_lyrics = pd.DataFrame.from_dict(combined_songs_lyrics_list)

  # export to csv
  print("Done fetching songs of ", start_year)
  export_path = os.path.join(export_dir, str(start_year) + "_spotify_data_and_lyrics.csv")
  df_song_lyrics.to_csv(export_path, encoding="utf-8-sig")

  print(start_year, "written to file")
  start_year = start_year + 1

# export logged errors to songs_errors.csv so the lyrics can be sourced manually
if len(list_of_songs_with_error) > 0:
  print(len(list_of_songs_with_error), "song erros found!, exporting to csv")
  df_error = pd.DataFrame.from_dict(list_of_songs_with_error)
  error_path = os.path.join(export_dir, "songs_errors.csv")
  df_error.to_csv(error_path, encoding="utf-8-sig")

print("End of Program")