In [36]:
import pandas as pd
import numpy as np
from utils import load_data, search_duck_selenium

# Load and preprocess data
data = load_data("data/songs.csv")



def fill_missing_links(songs_links, data):
    # Find rows where 'link' is missing or an empty string
    missing_links = songs_links['link'].isna() | (songs_links['link'] == '')

    if not any(missing_links):
        print("No missing links to fill.")
        return songs_links

    # Iterate over rows with missing links
    for index in songs_links[missing_links].index:
        title = songs_links.loc[index, 'Title']
        year_column = pd.DataFrame(data[['(1980s)', '(1990s)', '(2000s)', '(2010s)', '(2020s)']]).eq(1, axis=0).idxmax(axis=1)[index]
        query = f"song called: {title} {year_column} inurl:youtube -lyrics"

        try:
            link = search_duck_selenium(query)
        except Exception as e:
            print(f"Error searching YouTube for '{title}': {str(e)}")
            continue

        # Update the link in the 'songs_links' DataFrame
        songs_links.loc[index, 'link'] = link
        print(f"Link for '{title}' updated.")

    return songs_links

In [22]:

songs_links = pd.read_csv("data/songs_links.csv")
songs_links = fill_missing_links(songs_links, data)

Link for 'A Sky Full of Stars' updated.
Link for 'Back in Black' updated.
Link for 'Dream On' updated.
Link for 'Teardrop' updated.
Link for 'Sorry' updated.
Link for 'Paradise City' updated.
Link for 'Paranoid' updated.
Link for 'Someone Like You' updated.
Link for 'Us and Them' updated.
Error: list index out of range
Link for 'Iris' updated.
Link for 'Black Hole Sun' updated.
Link for 'HUMBLE.' updated.
Link for 'Eye of the Tiger' updated.
Link for 'The Pretender' updated.
Link for 'Faint' updated.
Link for 'Breaking the Habit' updated.
Link for 'Sugar (feat. Francesco Yates)' updated.
Error: list index out of range
Link for 'Treat You Better' updated.
Link for 'Let It Happen' updated.
Link for 'Thunder' updated.
Link for 'Believer' updated.
Link for 'Trouble' updated.
Link for 'Coming Back to Life' updated.
Link for 'All of Me' updated.
Link for 'Gimme Shelter' updated.
Link for 'Here Without You' updated.
Link for 'Beautiful Day' updated.
Link for 'Africa' updated.
Link for 'Work' 

In [23]:
# Second round of missing links retrieval
# This could be done recursively till 'missing_links' is empty or directly as retries in an exception handler 
# but the manual approach leaves more room for an appropriate cooldown in the case of a soft ban
songs_links = fill_missing_links(songs_links, data)

Link for 'Iris' updated.
Link for 'Treat You Better' updated.
Link for 'What You Know' updated.
Link for 'King Kunta' updated.
Link for 'Cheerleader - Felix Jaehn Remix Radio Edit' updated.
Link for 'Shallow' updated.
Link for 'Hands to Myself' updated.
Link for 'Lights' updated.
Link for 'Fight Song' updated.
Link for 'Nothing Arrived - Live from Spotify London' updated.
Link for 'Dust in the Wind' updated.
Link for 'You Can't Always Get What You Want' updated.
Link for 'Marvin Gaye (feat. Meghan Trainor)' updated.
Link for 'Around the World' updated.
Error: list index out of range
Link for 'When I Was Your Man' updated.
Link for 'Pink Moon' updated.


In [24]:
songs_links = fill_missing_links(songs_links, data)

Link for 'When I Was Your Man' updated.


In [25]:
songs_links.to_csv("data/songs_links.csv", index=False)

In [26]:
songs_links = pd.read_csv('data/songs_links.csv')

In [43]:
# We delete links that aren't from youtube.com (8 links) and search manually
non_yt_links_idx = songs_links.loc[~songs_links['link'].str.contains('youtube.com', na = False)].index
songs_links.loc[non_yt_links_idx, 'link'] = np.nan

In [44]:
songs_links.to_csv("data/songs_links.csv", index=False)

The resulting csv file can now be edited manually
