In [4]:
import os
import re
import json
import pandas as pd
import numpy as np
import lyricsgenius
from dotenv import load_dotenv
import openai

In [5]:
load_dotenv('/Users/Thijs/.env')
genius_key = os.getenv('GENIUS_API')
openai.api_key = os.getenv('OPENAI_API_KEY')

In [6]:
from requests.exceptions import HTTPError, Timeout
from lyricsgenius import Genius

In [7]:
genius = lyricsgenius.Genius(genius_key)
#Turn off warning messages from pandas
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import plotly.express as px

In [8]:
genius = lyricsgenius.Genius(genius_key)
genius.verbose = False # Turn off status messages
genius.remove_section_headers = False # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.skip_non_songs = True # Include hits thought to be non-songs (e.g. track lists)
genius.excluded_terms = ["(Live)"] # Exclude songs with these words in their title

In [9]:
def clean_lyrics(lyrics):
    # Remove all characters before the first "["
    lyrics = re.sub(r'^.*?\[', '[', lyrics)
    
    # Remove text starting from a number followed by "Embed" to the end
    lyrics = re.sub(r'\d+Embed$', '', lyrics)
    
    # Remove lines starting with "See " and ending with "You might also like"
    lyrics = re.sub(r'See .*? You might also like', '', lyrics)
    
    return lyrics

In [10]:
#Create embeddings for each song
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

In [13]:
artist = "Bon Iver"
albums = ["For Emma, Forever Ago", "Blood Bank EP", "Bon Iver, Bon Iver",
          "22, A Million", "i,i", 
          ]

In [14]:
album_list = []
for album in albums:
    album = genius.search_album(album, artist)
    album_list.append(album)
    print("Completed Saving Album:", album)

Completed Saving Album: Album(id, artist, ...)
Completed Saving Album: Album(id, artist, ...)
Completed Saving Album: Album(id, artist, ...)
Completed Saving Album: Album(id, artist, ...)
Completed Saving Album: Album(id, artist, ...)


In [15]:
boniver_albums = []

for album in album_list:
    boniver_albums.append(album)

In [16]:
len(boniver_albums)

5

In [17]:
#For loop through Bon Iver albums and the list album 
for album in boniver_albums:
    album.save_lyrics()

Wrote Lyrics_ForEmmaForeverAgo.json.
Wrote Lyrics_BloodBankEP.json.
Wrote Lyrics_BonIverBonIver.json.
Wrote Lyrics_22AMillion.json.
Wrote Lyrics_ii.json.


In [51]:
import os
import pandas as pd
import json

# Create an empty list to store dataframes
dfs = []

# Specify the folder path where JSON files are located
folder_path = 'Boniver_Export'

# Iterate through all JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        # Load the JSON file
        with open(os.path.join(folder_path, filename)) as f:
            data = json.load(f)
        
        # Create a dataframe from the 'tracks' dictionary
        df = pd.DataFrame(data['tracks'])
        
        # Break down the 'song' column into separate columns
        df = pd.concat([df.drop(['song'], axis=1), df['song'].apply(pd.Series)], axis=1)
        
        # Add the 'album' column with the album name
        df['album'] = data['name']

        #Add Year column
        df['year'] = data['release_date_components']['year']

        #Add Cover Art column
        #df['cover_art_url'] = data['tracks'][0]['song']['song_art_image_thumbnail_url']
        
        # Append the dataframe to the list
        dfs.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

boniver_df = combined_df[['album', 'year', 'title', 'lyrics', 'song_art_image_thumbnail_url']]
boniver_df['lyrics'] = boniver_df['lyrics'].apply(clean_lyrics)
boniver_df['lyrics'] = boniver_df['lyrics'].apply(lambda x: re.sub(r'\[.*?\]', '', x))
boniver_df['lyrics'] = boniver_df['lyrics'].str.lstrip('\n')
boniver_df['lyrics'] = boniver_df['lyrics'].str.lstrip('\n')
#Run Embeddings
boniver_df['embedding'] = boniver_df['lyrics'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
#Save the dataframe as a parquet file
boniver_df.to_parquet('boniver_album_embeddings.parquet')
#Filter for rows where the length of the lyrics is greater than 0
boniver_df = boniver_df[boniver_df['lyrics'].str.len() > 0]
boniver_df.sample(5)

APIRemovedInV1: 

You tried to access openai.Embedding, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [45]:
#Load parquet file as a dataframe
embeddings = pd.read_parquet('boniver_album_embeddings.parquet')
#Join the embedding column to the boniver_df dataframe
boniver_df = boniver_df.merge(embeddings, on=['title', 'album', 'year'], how='left').drop(columns=['lyrics_y']).rename(columns={'lyrics_x': 'lyrics'})

In [46]:
boniver_df.album.unique()

array(['i,i', 'Bon Iver, Bon Iver', '22, A Million',
       'For Emma, Forever Ago', 'Blood Bank EP'], dtype=object)

In [50]:
grouped = boniver_df.groupby('album')

avg_embeddings = {}

for name, group in grouped:
    # Stack embeddings to form a 2D NumPy array
    embeddings_matrix = np.vstack(group['embedding'].values)
    
    # Calculate the mean along axis 0 (i.e., column-wise mean)
    avg_embedding = np.mean(embeddings_matrix, axis=0)
    
    # Store the result in the dictionary
    avg_embeddings[name] = avg_embedding

KeyError: 'embedding'