In [16]:
import pandas as pd
import json
import re
from urllib.parse import unquote
import discogs_client as DiscogsClient
from thefuzz import process
import os

# API credentials are read from environment variables; os.getenv returns None
# for any variable that is not set.
SPOTIFY_CLIENT_ID = os.getenv('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = os.getenv('SPOTIFY_CLIENT_SECRET')
DISCOGS_USER_TOKEN = os.getenv('DISCOGS_USER_TOKEN')
YTMUSIC_REFRESH_TOKEN = os.getenv('YTMUSIC_REFRESH_TOKEN')
YTMUSIC_ACCESS_TOKEN = os.getenv('YTMUSIC_ACCESS_TOKEN')

# Shared Discogs API client used by the find_band_* helpers in this file.
# NOTE(review): 'ExampleApplication/0.1' looks like a placeholder user-agent
# string — confirm whether a project-specific one should be used.
discogs_client = DiscogsClient.Client('ExampleApplication/0.1', user_token=DISCOGS_USER_TOKEN)

In [79]:
def df_to_nested_dict(df):
    """Convert a band/album/song table into a nested dictionary.

    Args:
        df: DataFrame with at least the columns 'band', 'album' and 'song'.

    Returns:
        A dict of the form ``{band: {album: [song, ...]}}``. Bands and albums
        appear in the sorted order produced by ``groupby``; songs keep their
        row order. An empty frame yields ``{}``.
    """
    nested = {}
    # Iterate the groups explicitly instead of using DataFrameGroupBy.apply
    # with a dict-returning function: the result is always a plain dict,
    # including for an empty frame.
    for band, band_rows in df.groupby('band'):
        nested[band] = {
            # tolist() yields native Python values, so the result can be
            # passed straight to json.dumps.
            album: songs.tolist()
            for album, songs in band_rows.groupby('album')['song']
        }
    return nested

def find_band_exact(band_name):
    """Return the Discogs artist that is the single exact match for band_name.

    The comparison ignores case. A result is only returned when exactly one
    search result carries the name itself and at most one result carries the
    name followed by a numeric suffix such as ``"name (2)"``.

    Args:
        band_name: Artist name to search for.

    Returns:
        The matching artist object from the search results, or None when the
        name is missing or ambiguous.
    """
    band_results = discogs_client.search(band_name, type='artist')
    band_names = [result.name.lower() for result in band_results]
    wanted = band_name.lower()

    # Escape the name so characters such as '.', '(' or '+' are matched
    # literally, accept multi-digit suffixes, and require the whole result
    # name to match rather than any substring of it.
    numbered = re.compile(rf'{re.escape(wanted)} \(\d+\)')
    numbered_count = sum(1 for name in band_names if numbered.fullmatch(name))

    # NOTE(review): a single numbered variant is still tolerated here, as in
    # the original threshold — confirm that this is intended.
    if band_names.count(wanted) != 1 or numbered_count > 1:
        return None
    return band_results[band_names.index(wanted)]

def find_band_by_albums(band_name, albums):
    """Return the first Discogs artist that has a release titled like one of albums.

    Titles are compared case-insensitively and must match exactly.

    Args:
        band_name: Artist name to search for.
        albums: Iterable of album titles known to belong to the artist.

    Returns:
        The first artist in the search results with at least one matching
        release title, or None when no artist matches or albums is empty.
    """
    # Build the lookup set once; membership tests below are then O(1).
    wanted_titles = {album.lower() for album in albums}
    if not wanted_titles:
        # Nothing can match, so skip the search entirely.
        return None

    band_results = discogs_client.search(band_name, type='artist')
    for band in band_results:
        # Stop at the first matching release instead of collecting every
        # release title before comparing.
        if any(release.title.lower() in wanted_titles for release in band.releases):
            return band
    return None

def find_band_by_tracklist(band, albums_dict, min_score=50):
    """Return the first Discogs artist whose releases share a track with albums_dict.

    Artist names and release titles are matched fuzzily; track titles must be
    equal, ignoring case.

    Args:
        band: Artist name to search for.
        albums_dict: Mapping of album title to a list of track titles.
        min_score: Fuzzy-match score a candidate must exceed to be
            considered (applies to both artist names and release titles).

    Returns:
        The first candidate artist with a fuzzily matching release that
        contains at least one of the expected tracks, or None.
    """
    search_results = discogs_client.search(band, type='artist')
    artists_by_name = {artist.name: artist for artist in search_results}
    # Pass the key view rather than the dict itself, as the original did, so
    # each match stays a (name, score) pair.
    artist_matches = process.extract(band, artists_by_name.keys(), limit=100)
    candidate_artists = [
        artists_by_name[match[0]] for match in artist_matches if match[1] > min_score
    ]

    # Normalise the expected tracks once; non-string entries (e.g. missing
    # values) cannot match a title and are dropped.
    wanted_tracks = {
        album: {track.lower() for track in tracks if isinstance(track, str)}
        for album, tracks in albums_dict.items()
    }

    for artist in candidate_artists:
        releases_by_title = {release.title: release for release in artist.releases}
        for album, tracks in wanted_tracks.items():
            if not tracks:
                # No usable track titles for this album, so nothing can match.
                continue
            release_matches = process.extract(album, releases_by_title.keys(), limit=100)
            for match in release_matches:
                if match[1] <= min_score:
                    continue
                release = releases_by_title[match[0]]
                if any(track.title.lower() in tracks for track in release.tracklist):
                    return artist
    return None


In [80]:
# Look up a Discogs artist ID, via tracklist matching, for every band whose
# discogs_id is still the string 'None' in the loaded frame.
df_raw = pd.read_pickle('with_discogs_ids_2.pkl')
df_multiple = df_raw.loc[df_raw['discogs_id'] == 'None']
data_dict = df_to_nested_dict(df_multiple)

for i, (band, albums) in enumerate(data_dict.items()):
    try:
        band_obj = find_band_by_tracklist(band, albums)
    except Exception as exc:
        # Best effort: keep going, but report the failure so that bands
        # skipped because of API errors are visible in the output.
        print(i, band, 'lookup failed:', repr(exc))
        continue
    band_id = band_obj.id if band_obj else None
    # The ID is stored as a string ('None' when unresolved), matching the
    # filter used above to select unresolved rows.
    df_raw.loc[df_raw['band'] == band, 'discogs_id'] = str(band_id)

    print(i, band, band_obj, band_id)




0 23 Skidoo None None
1 3rd Mind None None
2 A&E Department None None
3 Absent Minded None None
4 Afro's, The None None
5 Alien Faktor w/ Pain Station None None
6 Almighty El Cee, The None None
7 Alpha Team, The None None
8 Anthrax None None
9 Anti Groups, The None None
10 Architect None None
11 Arrakis None None
12 Astral Project None None


Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '???']


13 Avery, Brendan None None
14 Bass Inc. None None
15 Beefcake None None
16 Bombardier None None
17 Boyd Rice and Fiends None None
18 Brume None None
19 Capone-N-Noreaga None None
20 Cazazza, Monte None None
21 Charlatans, The None None
22 Code None None
23 Collide None None
24 Coma Lilies, The None None
25 Complex None None
26 Crackhead None None
27 Crisis N.T.I. None None
28 Cujo None None
29 Curse, The None None
30 Cyber-Tec None None
31 D-Nice None None
32 D.H.S. None None
33 DJ Weirdo & Guitar Rob None None
34 Deist Requeim None None
35 Diabolis Rising None None
36 Disposable Heroes of Hiphoprisy None None
37 Divine Misfire None None
38 Doof None None
39 Doormouse None None
40 Doppelganger None None
41 Downtime None None
42 Emphasis None None
43 Endorphin None None
44 Eric B and Rakim None None
45 Excessive Force None None
46 FM Einheit None None
47 Filta None None
48 Fires Of Ork, The None None
49 First Contact None None
50 Gossamer None None
51 Growling Mad Scientists None None


KeyboardInterrupt: 

In [81]:
# Write df_raw, including any discogs_id values assigned above, to a new pickle file.
df_raw.to_pickle('with_discogs_ids_3.pkl')

In [31]:
# Serialise the nested band -> album -> songs mapping to a JSON string.
# NOTE(review): df_bands is not defined in the visible cells — presumably it
# comes from an earlier notebook session; confirm before re-running.
json_result = json.dumps(df_to_nested_dict(df_bands))

# Writing the file is disabled; the previously saved copy is loaded below.
# with open('band_album_song_data.json', 'w') as file:
#     file.write(json_result)

# Load the saved mapping from disk into json_data.
with open('band_album_song_data.json', 'r') as file:
    json_data = json.load(file)

In [35]:
# Maps each artist to its Discogs artist ID (disabled; kept for reference)

# df_bands['discogs_id'] = ''

# i = 0
# for band in df_bands['band'].drop_duplicates():
#     try:
#         band_obj = find_band(band)
#     except Exception:
#         continue
#     band_id = band_obj.id if band_obj else None
#     df_bands.loc[df_bands['band'] == band, 'discogs_id'] = str(band_id)
#     print(i, band, band_obj, band_id)
#     i+=1

# display(df_bands)
