1001 Tracklists Scraper
=======================
A set of functions to scrape music tracklists from [1001 Tracklists](https://www.1001tracklists.com)

Import the libraries used by the scraper

In [None]:
from bs4 import BeautifulSoup as bs
import requests 
import pandas as pd
import urllib3
import os
import spotipy
import spotipy.util as util
from pprint import pprint
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# Site root; get_all_series prefixes it to the series hrefs it scrapes.
root_url = 'https://www.1001tracklists.com'

Look up a track on Spotify and return its audio features, including the Camelot key

In [None]:
# Spotify API credentials for the client-credentials flow.
# The SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables take
# precedence when they are set and non-empty.
# NOTE(security): the literal fallbacks below are secrets committed to source;
# they should be rotated and removed once the environment variables are in use.
client_id = os.environ.get('SPOTIPY_CLIENT_ID') or '6389b29d73fc4806ba5e812e678854c1'
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET') or '0b4bcd832e694aedad408b5b4a93dd5c'
ccm = util.oauth2.SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# Shared client used by get_attrs for searches and audio-feature lookups.
sp = spotipy.Spotify(client_credentials_manager=ccm)

def get_attrs(artist, track):
    """Return Spotify audio features for the first search hit, or ``False``.

    Searches Spotify for ``track`` by ``artist``, takes the first result,
    fetches its audio features and adds a ``'camelot'`` entry computed by
    ``get_camelot`` from the track's key and mode.

    Args:
        artist: Artist text placed in the ``artist:`` part of the query.
        track: Track title placed in the ``track:`` part of the query.

    Returns:
        A ``pd.Series`` built from the audio-features dict, or ``False`` when
        anything goes wrong (no hit, missing features, API error, ...).
    """
    try:
        query = "artist:" + artist + " track:" + track
        hits = sp.search(q=query, type="track")['tracks']['items']
        # Only the best-ranked hit is used.
        features = sp.audio_features(hits[0]['uri'])[0]
        features['camelot'] = get_camelot(features['key'], features['mode'])
        return pd.Series(features)
    except Exception:
        # Best effort: callers test the result with `is False`.
        return False
    
#get_attrs(artist='Armin Van Buuren', track="shivers")

In [None]:
def get_camelot(key, mode):
    """Translate a Spotify key/mode pair into Camelot wheel notation.

    Args:
        key: Spotify pitch class, 0 (C) through 11 (B). Spotify reports -1
            when it could not detect a key.
        mode: 1 for major, 0 for minor.

    Returns:
        The Camelot code (e.g. ``'8A'``), or ``None`` when ``key`` / ``mode``
        do not name a real key.
    """
    # index of letter is spotify pitch class - e.g. 0->c, 1->c# etc.
    tones = ['c', 'c#', 'd', 'd#', 'e', 'f', 'f#', 'g', 'g#', 'a', 'a#', 'b']
    # '<tone><mode>' -> Camelot code ('A' codes are minor, 'B' codes are major).
    keys = {
        'a0': '8A',
        'a1': '11B',
        'a#0': '3A',
        'a#1': '6B',
        'b0': '10A',
        'b1': '1B',
        'c0': '5A',
        'c1': '8B',
        'c#0': '12A',
        'c#1': '3B',
        'd0': '7A',
        'd1': '10B',
        'd#0': '2A',
        'd#1': '5B',
        'e0': '9A',
        'e1': '12B',
        'f0': '4A',
        'f1': '7B',
        'f#0': '11A',
        'f#1': '2B',
        'g0': '6A',
        'g1': '9B',
        'g#0': '1A',
        'g#1': '4B',
    }
    # Without this guard a key of -1 ("not detected") would wrap around to
    # tones[-1] == 'b' and report a key the track does not have.
    if key not in range(len(tones)) or mode not in (0, 1):
        return None
    return keys[tones[key] + str(mode)]

Scrape a single tracklist from 1001 Tracklists and write it to a CSV file, adding Spotify audio features for every track that can be found on Spotify

In [None]:
def tsplit(s, sep):
    """Split ``s`` on every separator in ``sep``, one separator after another.

    Args:
        s: The string to split.
        sep: Iterable of separator strings.

    Returns:
        The list of fragments left after all separators have been applied.
        Fragments are not stripped and may be empty.
    """
    fragments = [s]
    for delimiter in sep:
        # Re-split every fragment produced so far on the next separator.
        fragments = [
            part
            for fragment in fragments
            for part in fragment.split(delimiter)
        ]
    return fragments

def get_tracklist(url, folder='.'):
    """Scrape one 1001 Tracklists page and save it as ``<folder>/<set name>.csv``.

    Each ``.trackValue`` element on the page is parsed into artist(s), title
    and release, then looked up on Spotify through ``get_attrs``; the audio
    features that come back are added to the row. Tracks Spotify cannot find
    are still written, with only the three basic columns filled in.

    Args:
        url: Address of the tracklist page.
        folder: Directory for the CSV file; created if it does not exist.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with an error.
    """
    # Local import: the notebook's import cell does not provide subprocess.
    import subprocess

    os.makedirs(folder, exist_ok=True)

    # Read the page from wget's stdout: no temporary file to leak or clean up,
    # and the URL is passed as one argument instead of through a shell.
    page = subprocess.run(
        ['wget', '-q', '-O', '-', url], stdout=subprocess.PIPE, check=True
    ).stdout
    soup = bs(page, "lxml")

    columns = ['Artist(s)', 'Title', 'Release']
    set_name = soup.find(id="pageTitle").get_text().strip()
    print(set_name)

    rows = []
    for div in soup.select('.trackValue'):
        try:
            text = div.get_text()
            artist_part = text.split('-')[0]
            artists = ",".join(
                name.strip()
                for name in tsplit(artist_part, ('&', 'vs.', 'ft.', 'pres.'))
            )

            # title after '-' but before both label ([) and release (() and if a mashup, remove all but first song to make it easier to look up
            title = text.split('-')[1].split('[')[0].split('(')[0].split('vs.')[0].strip()

            # Release is the first parenthesised part, if there is one.
            try:
                release = text.split('(')[1].split(')')[0].strip()
            except IndexError:
                release = "unknown"

            basic_details = pd.Series([artists, title, release], index=columns)

            # don't search spotify for unknown version, just search base name
            version = '' if release == 'unknown' else release
            spaced_artists = artists.replace(',', ' ')
            spotify_details = get_attrs(artist=spaced_artists + ' ' + version, track=title)
            # Try each artist on their own, without the version, if not found
            if spotify_details is False:
                for a in artists.split(','):
                    spotify_details = get_attrs(artist=a, track=title)
                    if spotify_details is not False:
                        break

            if spotify_details is False:
                # Keep the track in the CSV even without Spotify data.
                print(f'Not Found: "{title}" by "{artists}"')
                row = basic_details
            else:
                row = pd.concat([basic_details, spotify_details])
            # One single-row frame per track; DataFrame.append no longer
            # exists in pandas 2.x, so the rows are concatenated at the end.
            rows.append(row.to_frame().T)
        except Exception as e:
            # Best effort: report the track that could not be parsed and move on.
            print(e)

    if rows:
        tracklist = pd.concat(rows, ignore_index=True)
    else:
        tracklist = pd.DataFrame(columns=columns)
    # A '/' in the set name would otherwise be read as a directory separator.
    safe_name = set_name.replace('/', '_')
    tracklist.to_csv(os.path.join(folder, safe_name + '.csv'))
    
#get_tracklist(url='https://www.1001tracklists.com/tracklist/1kjuxf4t/giuseppe-ottaviani-go-on-air-fsoe-stage-tomorrowland-belgium-2018-08-21.html', folder='otaviani_test')

Get a whole series of tracklists, and put them in a folder

In [None]:
def get_series_tracklists(series_url, folder='.', recursecall=False):
    """Scrape every tracklist linked from a series (group) page into ``folder``.

    Links under ``#mainContentDiv`` whose href starts with ``/tracklist/`` are
    passed to ``get_tracklist``. Unless ``recursecall`` is true, the numbered
    pagination links that point past the current page are then scraped too,
    each with ``recursecall=True`` so those pages do not follow the pagination
    again.

    Args:
        series_url: Address of the series page, e.g. ``.../index9.html``.
        folder: Directory handed on to ``get_tracklist`` for the CSV files.
        recursecall: True when called for a follow-up page of a series.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with an error.
    """
    # Local imports: the notebook's import cell does not provide these.
    import re
    import subprocess

    # The page is always downloaded; the previous "already exists" check
    # could never be false. wget's stdout is read directly, so no temporary
    # file is created and the URL never passes through a shell.
    page = subprocess.run(
        ['wget', '-q', '-O', '-', series_url], stdout=subprocess.PIPE, check=True
    ).stdout
    soup = bs(page, "lxml")

    # scheme://host of the series page, used to absolutise tracklist links.
    webpage = '/'.join(series_url.split('/')[0:3])
    main = soup.find(id='mainContentDiv')
    for mix_link in main.find_all('a', href=True):
        mix_href = mix_link['href']
        if mix_href.startswith('/tracklist/'):
            get_tracklist(webpage + mix_href, folder=folder)

    if recursecall:
        return

    page_div = soup.find('ul', class_='pagination')
    if page_div is None:
        # No pagination list on the page: nothing further to follow.
        return

    # Page number = all digits right before the file extension
    # ("index12.html" -> 12); pages without a number count as page 1.
    number = re.search(r'(\d+)\.[^./]*$', series_url)
    cur_page_num = int(number.group(1)) if number else 1

    group = series_url[0:series_url.rfind('/')]
    for page_link in page_div.find_all('a', href=True):
        try:
            page_num = int(page_link.get_text())
        except ValueError:
            # Non-numeric links such as "Prev" / "Next".
            continue
        if page_num > cur_page_num:
            print(group)
            get_series_tracklists(group + '/' + page_link['href'], folder=folder, recursecall=True)
                
#get_series_tracklists(series_url='https://www.1001tracklists.com/groups/nlzzgw/evans-picks/index9.html', folder='Evans_Picks')

In [None]:
def get_all_series(url, recursecall=False):
    """Scrape every series listed on a groups index page.

    For each ``td.tl`` cell the linked series is scraped with
    ``get_series_tracklists`` into a folder named after the link text; a
    series whose folder already exists is skipped. Unless ``recursecall`` is
    true, the numbered pagination links that point past the current page are
    then processed as well, each with ``recursecall=True``.

    Args:
        url: Address of the index page, e.g. ``.../groups/index.html``.
        recursecall: True when called for a follow-up page of the index.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with an error while
            fetching ``url`` itself.
    """
    # Local imports: the notebook's import cell does not provide these.
    import re
    import subprocess

    # The page is always downloaded; the previous "already exists" check
    # could never be false. wget's stdout is read directly, so no temporary
    # file is created and the URL never passes through a shell.
    page = subprocess.run(
        ['wget', '-q', '-O', '-', url], stdout=subprocess.PIPE, check=True
    ).stdout
    soup = bs(page, "lxml")

    for series in soup.find_all('td', class_='tl'):
        link = series.find('a', href=True)
        if link is None:
            # Cell without a link: nothing to scrape.
            continue
        name = link.get_text()
        print(name)
        if os.path.exists(name):
            print("Series exists, skipping")
            continue
        get_series_tracklists(series_url=root_url + link['href'], folder=name)

    if recursecall:
        return

    page_div = soup.find('ul', class_='pagination')
    if page_div is None:
        # No pagination list on the page: nothing further to follow.
        return

    # Page number = all digits right before the file extension
    # ("index12.html" -> 12); pages without a number count as page 1.
    number = re.search(r'(\d+)\.[^./]*$', url)
    cur_page_num = int(number.group(1)) if number else 1

    group = url[0:url.rfind('/')]
    for page_link in page_div.find_all('a', href=True):
        try:
            page_num = int(page_link.get_text())
        except ValueError:
            # Non-numeric links such as "Prev" / "Next".
            continue
        if page_num > cur_page_num:
            print(group)
            try:
                get_all_series(group + '/' + page_link['href'], recursecall=True)
            except Exception as e:
                # Best effort: one broken page must not stop the remaining
                # ones. KeyboardInterrupt is deliberately not caught, so the
                # run can still be stopped by hand.
                print(e)
                continue
        
# Run the scraper: process every series listed on the groups index page.
get_all_series('https://www.1001tracklists.com/groups/index.html')   

Amsterdam Dance Event 2018
Series exists, skipping
Deep Progressive Techhouse Sets
Series exists, skipping
HEXAGON HQ
Series exists, skipping
Unity Brothers Podcast by Unity Brothers
Series exists, skipping
Evan's Picks
Series exists, skipping
SPINNIN' Promo Mixes
Series exists, skipping
Tiesto Tracklists Live
Series exists, skipping
VillaHangar Music In The Air Podcast Show
Series exists, skipping
Exclusive 1001TL Mixes
Series exists, skipping
Tomorrowland 2018
Series exists, skipping
Tomorrowland Belgium 2018
Series exists, skipping
Sensation Megamixes
Series exists, skipping
Source Recordings Seasonal Mixes
Series exists, skipping
Axtone Presents
Series exists, skipping
Deep House DJ SETS - LOCAL & ABROAD
Series exists, skipping
Aftermovies
Series exists, skipping
Hard Dance
Series exists, skipping
Yearmixes 2017
Series exists, skipping
Finished DnB Tracklists
Series exists, skipping
Time Records Promo Mixes
Series exists, skipping
SIZE & Family
Series exists, skipping
Q-Dance Endsh