In [25]:
import glob
import os
from urllib.parse import quote_plus

import mutagen
import pandas as pd
import requests
from bs4 import BeautifulSoup
from mutagen.id3 import ID3
from tqdm import tqdm

In [21]:
def extract_metadata_alternative(mp3_filepath):
    """Read the title and lead artist from an MP3 file's ID3 tag.

    Args:
        mp3_filepath: Path of the MP3 file; passed straight to ``ID3``.

    Returns:
        A ``(title, artist)`` tuple built from the first text value of the
        ``TIT2`` and ``TPE1`` frames. An element is ``None`` when its frame is
        absent or holds no text; both are ``None`` when the file has no ID3
        header at all.
    """
    try:
        audio = ID3(mp3_filepath)
    except mutagen.id3.ID3NoHeaderError:
        # An untagged file must not abort a whole directory scan; report it
        # the same way as a tag that lacks the title/artist frames.
        return None, None

    def first_text(frame_id):
        # Return the first text value of a frame, or None if unavailable.
        if frame_id not in audio:
            return None
        values = audio[frame_id].text
        # Guard against a frame that exists but carries an empty text list.
        return values[0] if values else None

    return first_text("TIT2"), first_text("TPE1")


In [22]:
def extract_info_from_multiple_directories(base_directory, sub_directories):
    """Collect ID3 title/artist for every MP3 in the given genre folders.

    Args:
        base_directory: Folder that contains one sub-folder per genre.
        sub_directories: Iterable of sub-folder names; each name is also used
            as the value of the ``Genre`` column.

    Returns:
        A DataFrame with columns ``Genre``, ``File``, ``Title`` and ``Artist``
        holding one row per ``*.mp3`` file found. The frame is empty (but has
        these columns) when no file matches.
    """
    columns = ["Genre", "File", "Title", "Artist"]
    records = []

    for genre in sub_directories:
        genre_directory = f"{base_directory}/{genre}"

        # glob returns files in arbitrary order; sort so the row order is
        # reproducible between runs and machines.
        for mp3_file in sorted(glob.glob(f"{genre_directory}/*.mp3")):
            title, artist = extract_metadata_alternative(mp3_file)
            records.append(
                {
                    "Genre": genre,
                    # basename handles both "/" and the platform separator.
                    "File": os.path.basename(mp3_file),
                    "Title": title,
                    "Artist": artist,
                }
            )

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copying the frame on every row is quadratic.
    return pd.DataFrame(records, columns=columns)


In [None]:
# Build the per-track metadata table by scanning the four genre folders.
# NOTE(review): this is an absolute, machine-specific path (not the current
# directory); point it at the local copy of the audio files before running.
base_directory = "/Users/armwong/Desktop/tedAI/emotifymusic"  # root folder holding one sub-folder per genre
sub_directories = ["classical", "electronic", "pop", "rock"]
df_info_multi = extract_info_from_multiple_directories(base_directory, sub_directories)
# Bare last expression so the notebook renders the resulting frame.
df_info_multi


In [24]:
def scrape_song_data(url, timeout=10):
    """Fetch a page and pull the first key and BPM table cells out of it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``(key_text, bpm_text)`` tuple with the stripped text of the first
        ``<td class="table_key">`` and ``<td class="table_bpm">`` elements.
        An element is ``None`` when its cell is not on the page; both are
        ``None`` when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures (timeout, DNS, connection reset) like a
        # non-200 answer so one bad request does not abort a long scrape.
        return None, None

    if response.status_code != 200:
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')

    def cell_text(css_class):
        # Stripped text of the first <td> with this class, or None.
        cell = soup.find('td', class_=css_class)
        return cell.get_text(strip=True) if cell is not None else None

    return cell_text('table_key'), cell_text('table_bpm')


In [26]:
# Start every row with "not found" placeholders.
df_info_multi['Key'] = None
df_info_multi['BPM'] = None

# Base URL for the search query
base_url = 'https://songdata.io/search?query='

# Look up each track and store whatever key/BPM the search page reports.
# total= lets tqdm show a real progress bar instead of a bare counter.
for index, row in tqdm(df_info_multi.iterrows(), total=len(df_info_multi)):
    # Use only the fields that are present; a missing title or artist would
    # otherwise be sent to the site as the literal text "None".
    terms = [str(value) for value in (row['Title'], row['Artist']) if pd.notna(value)]
    if not terms:
        # Nothing to search for; leave the placeholders in place.
        continue

    # quote_plus turns spaces into '+' and percent-encodes characters such as
    # '&', '#', '?' and non-ASCII text that would corrupt the query string.
    search_url = base_url + quote_plus(' '.join(terms))

    # Scrape the data
    key, bpm = scrape_song_data(search_url)

    # Store the data back in the DataFrame
    df_info_multi.at[index, 'Key'] = key
    df_info_multi.at[index, 'BPM'] = bpm



400it [04:51,  1.37it/s]


In [92]:
# Load data.csv (resolved against the working directory) into `df`.
# Later cells read its 'track id' and ' genre' columns.
df = pd.read_csv('data.csv')

In [93]:
# The genre column is stored as ' genre' (leading space, presumably straight
# from the CSV header); rename it to 'Genre'. Reassign rather than mutate
# with inplace=True so the cell produces a fresh frame.
df = df.rename(columns={' genre': 'Genre'})

In [100]:
# Wrap track ids into 1..100: 1-100 stay unchanged, 101 -> 1, 200 -> 100, ...
# NOTE(review): presumably the raw ids run consecutively across genres with
# 100 tracks per genre -- confirm against the dataset description.
df['track id'] = df['track id'].sub(1).mod(100).add(1)

In [102]:
# Join key of the form "<track id>-<genre>", e.g. "1-classical".
df['merge_key'] = df['track id'].astype(str).str.cat(df['Genre'], sep='-')

In [104]:
# Attach the per-track metadata to every row of `df`; the inner join drops
# rows whose merge_key has no counterpart in the other frame.
# NOTE(review): the scanning function above only produces Genre/File/Title/
# Artist, so df_info_multi must have gained a 'merge_key' column in a cell
# that is not part of this notebook as shown -- confirm before re-running.
merged_df = df.merge(df_info_multi, on='merge_key', how='inner')

In [105]:
# Bare expression: render the merged frame as the cell output.
merged_df

Unnamed: 0,track id,Genre,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,...,disliked,age,gender,mother tongue,merge_key,Title,Artist,Key,BPM,file_id
0,1,classical,0,1,0,0,0,0,1,1,...,0,21,1,English,1-classical,Trio Sonata in C Major BWV 529-2. Largo (JS Bach),Voices of Music,D♭ Minor,73,1
1,1,classical,0,0,0,1,0,0,0,0,...,1,41,1,Dutch,1-classical,Trio Sonata in C Major BWV 529-2. Largo (JS Bach),Voices of Music,D♭ Minor,73,1
2,1,classical,0,0,0,1,0,0,0,0,...,0,24,1,English,1-classical,Trio Sonata in C Major BWV 529-2. Largo (JS Bach),Voices of Music,D♭ Minor,73,1
3,1,classical,0,0,0,0,1,0,0,0,...,0,32,0,Spanish,1-classical,Trio Sonata in C Major BWV 529-2. Largo (JS Bach),Voices of Music,D♭ Minor,73,1
4,1,classical,0,0,0,1,1,0,0,0,...,1,21,0,English,1-classical,Trio Sonata in C Major BWV 529-2. Largo (JS Bach),Voices of Music,D♭ Minor,73,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8402,100,pop,1,1,0,0,0,0,1,0,...,0,26,1,Russian,100-pop,The One That Got Away,Norine Braun,A Minor,128,100
8403,100,pop,0,0,0,1,0,0,1,0,...,1,29,0,Russian,100-pop,The One That Got Away,Norine Braun,A Minor,128,100
8404,100,pop,0,0,0,0,0,0,0,1,...,1,34,1,Polish,100-pop,The One That Got Away,Norine Braun,A Minor,128,100
8405,100,pop,1,0,0,0,0,0,1,1,...,0,39,1,French,100-pop,The One That Got Away,Norine Braun,A Minor,128,100


In [108]:
# Persist the merged table next to the notebook. The row index is written as
# the first (unnamed) column, which is the to_csv default.
merged_df.to_csv(path_or_buf='merged_data.csv', index=True)

In [109]:
# Rows with at least one missing value in any column, kept for inspection.
na_rows = merged_df.loc[merged_df.isna().any(axis='columns')]

In [111]:
# Keep only complete rows: drop a row as soon as any column is missing.
cleaned_df = merged_df.dropna(axis='index', how='any')