In [1]:
import requests
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
import re
import json
import pandas as pd
import time
import numpy as np
requests.adapters.DEFAULT_RETRIES = 20

In [2]:
def simple_get(url):
    """
    Source: https://realpython.com/python-web-scraping-practical-introduction/
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                with closing(get(url, stream=True)) as resp:
                    if is_good_response(resp):
                        return resp.content
                    return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Source: https://realpython.com/python-web-scraping-practical-introduction/
    Returns true if the response seems to be HTML, false otherwise
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    Source: https://realpython.com/python-web-scraping-practical-introduction/
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

def get_tabs_data(url):
    """
    Creates the bs4 object and extracts a list
    of tab info. Hits info is stored as a separate 
    list in the html file so it is returned separately.
    """
    
    data = simple_get(url)
    data = data.decode("utf-8")
    
    print(data)
    
    tab_links = []

    m = re.findall('(https://tabs.ultimate-guitar.com/tab/\S+?)&', data)
    if m:
        tab_links.append(m)

    return(tab_links, [])

def get_chords(url):
    """
    Scrapes and returns the sequences of 
    chords as a list as well as the fret number
    to place a capo. 
    """

    data = simple_get(url)
    data = data.decode("utf-8")

    # Matching groups (open tag)(chord pitch)(base note {0 or 1})(chord type)(base note {0 or 1})(closing tag)
    pattern = "(\[ch\])([A-G]+)(\/[A-G]*[b#])*([(?m)|(?m\d)|(?b\d)|(?#\d)|(?maj\d)|(?add\d)|(?sus\d)|(?aug)|(?aug\d)|(?dim)|(?dim\d)]*)(\/[A-G]*[b#])*(\[\/ch\])"
    prog = re.compile(pattern)
    result = prog.findall(data)
    
    cleaned_res = result
    for i in range(len(result)):
        # Grabbing groups (chord pitch)(base note)(chord type)(base note)
        cleaned_res[i] = result[i][1] + result[i][2] + result[i][3] + result[i][4]
       
    # Grabbing Capo info
    capo = 0
    pattern = "&quot;capo&quot;:(\d)"
    result = re.search(pattern, data)
    if result:
        capo = result.group(1)
    
    artist = ''
    pattern = "artist_name&quot;:&quot;(.*?)&"
    result = re.search(pattern, data)
    if result:
        artist = result.group(1)
    
    song = ''
    pattern = "song_name&quot;:&quot;(.*?)&"
    result = re.search(pattern, data)
    if result:
        song = result.group(1)
        
    return(artist, song, cleaned_res, capo)
    
def get_genre(url):
    """
    Grabs the artist's categorized genre
    """
    data = get_data(url)
    genre = data['data']['artist']['genre']
    
    return(genre)


def get_multiple_pages(url):
    cur_tabs, cur_hits = get_tabs_data(url)
        
    tabs_list = cur_tabs[0]
    hits_list = cur_hits
        
    return(tabs_list, hits_list)
   

In [3]:
tabs, hits = get_multiple_pages("https://www.ultimate-guitar.com/search.php?search_type=title&value=israel%20over%20the%20rainbow")


<!doctype html>
<html lang="en">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#">
<script>
    !function(n,e){var t,o,i,c=[],f={passive:!0,capture:!0},r=new Date,a="pointerup",u="pointercancel";function p(n,c){t||(t=c,o=n,i=new Date,w(e),s())}function s(){o>=0&&o<i-r&&(c.forEach(function(n){n(o,t)}),c=[])}function l(t){if(t.cancelable){var o=(t.timeStamp>1e12?new Date:performance.now())-t.timeStamp;"pointerdown"==t.type?function(t,o){function i(){p(t,o),r()}function c(){r()}function r(){e(a,i,f),e(u,c,f)}n(a,i,f),n(u,c,f)}(o,t):p(o,t)}}function w(n){["click","mousedown","keydown","touchstart","pointerdown"].forEach(function(e){n(e,l,f)})}w(n),self.perfMetrics=self.perfMetrics||{},self.perfMetrics.onFirstInputDelay=function(n){c.push(n),s()}}(addEventListener,removeEventListener);
</script>

<link rel="preload" href="https://www.ultimate-guitar.com/static/public/build/ug_react/vendor.1e5a52902a3beb11c6c1610702588dd3.js" as="script" />
<l

In [4]:
tabs

['https://tabs.ultimate-guitar.com/tab/868611',
 'https://tabs.ultimate-guitar.com/tab/1460351',
 'https://tabs.ultimate-guitar.com/tab/2135261',
 'https://tabs.ultimate-guitar.com/tab/2308517',
 'https://tabs.ultimate-guitar.com/tab/2383241',
 'https://tabs.ultimate-guitar.com/tab/2662257',
 'https://tabs.ultimate-guitar.com/tab/2988401',
 'https://tabs.ultimate-guitar.com/tab/3173657',
 'https://tabs.ultimate-guitar.com/tab/2558922',
 'https://tabs.ultimate-guitar.com/tab/1092318',
 'https://tabs.ultimate-guitar.com/tab/3267302',
 'https://tabs.ultimate-guitar.com/tab/2070813',
 'https://tabs.ultimate-guitar.com/tab/1099939',
 'https://tabs.ultimate-guitar.com/tab/1978963',
 'https://tabs.ultimate-guitar.com/tab/1486181',
 'https://tabs.ultimate-guitar.com/tab/2110513',
 'https://tabs.ultimate-guitar.com/tab/1274703',
 'https://tabs.ultimate-guitar.com/tab/156298',
 'https://tabs.ultimate-guitar.com/tab/157459',
 'https://tabs.ultimate-guitar.com/tab/685900',
 'https://tabs.ultimate-

In [5]:
df = pd.DataFrame(columns=['Artist', 'Song', 'Chords', 'Capo'])
artist, song, chords, capo = get_chords(tabs[0])
df.loc[0] = [artist, song, chords, capo]
    
df 

Unnamed: 0,Artist,Song,Chords,Capo
0,Israel Kamakawiwoʻole,Over The Rainbow,"[G, D, Em, C, Cadd9, B7, G, D, Em, C, G, D, Em...",5


In [6]:
np.unique(df['Chords'].iloc[0])

array(['B7', 'C', 'Cadd9', 'D', 'Em', 'G'], dtype='<U5')