In [40]:
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
import regex as re

Scraping code from https://github.com/x4nth055/pythoncode-tutorials/tree/master/web-scraping/link-extractor adapted for our use.

In [18]:
# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

In [19]:
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

In [20]:
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

In [26]:
def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
    
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)

        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue

        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    
    return urls

In [28]:
# number of urls visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=1000):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

In [27]:
# we want the links from here, since it's only piano music
url = 'https://www.vgmusic.com/music/other/miscellaneous/piano/'

In [29]:
crawl(url)

[*] Internal link: http://www.vgmusic.com/information/donate.php
[*] Internal link: https://www.vgmusic.com/
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/dotHack_Loop2.mid
[*] Internal link: https://www.vgmusic.com/file/c5303ca449b42c033b965d782f79d036.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/WorldRunner.mid
[*] Internal link: https://www.vgmusic.com/file/3e40cbc8bd18cdbe1cce637191b2ad4c.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/7Church.mid
[*] Internal link: https://www.vgmusic.com/file/2da0227c706a81b91762026c6c6d704e.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/act_raiser-north_wall.mid
[*] Internal link: https://www.vgmusic.com/file/6e11c438b213173c01613169efacdd36.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/EAGLE.mid
[*] Internal link: https://www.vgmusic.com/file/25c23423d7964cec931a97fe12a2b41b.html
[*] 

[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Rugged_Ridge.mid
[*] Internal link: https://www.vgmusic.com/file/5e9dad592a9523bbde634b81845baab6.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/DDRSo_Deep_Piano.mid
[*] Internal link: https://www.vgmusic.com/file/339b489917367a82259e7b7146059419.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/mkingdom.mid
[*] Internal link: https://www.vgmusic.com/file/18a3a98f13c7202a3d6c01f9cc8c5f7b.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Level1piano.mid
[*] Internal link: https://www.vgmusic.com/file/0b957e598dc6bc021f80b62aca84c77f.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/DKONG.mid
[*] Internal link: https://www.vgmusic.com/file/b108c2f12e30db281705a2670eed9963.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/dk_title.mid
[*] Internal link: https://www

[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/edc.mid
[*] Internal link: https://www.vgmusic.com/file/ca12f18b14ec82732f1099dda88e5000.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/fortresscondor.mid
[*] Internal link: https://www.vgmusic.com/file/74bcc55e9d0551e5b4e8984f69bd1ef8.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/goldsaucer.mid
[*] Internal link: https://www.vgmusic.com/file/e4042cd97c14f628b9206f8ade809b44.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/great_war.mid
[*] Internal link: https://www.vgmusic.com/file/6df4b6f8d8c9447d60ec59ec04c2cf06.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/thoughts.mid
[*] Internal link: https://www.vgmusic.com/file/814a812e714dc5682bf174c07dfeaaa8.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/sobf.mid
[*] Internal link: https://www.vgmusic.com

[*] Internal link: https://www.vgmusic.com/file/8c84d78a4aebdd4b52972e33709b1cfd.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Overture_WIP.mid
[*] Internal link: https://www.vgmusic.com/file/ebae3ad33b11e7cbc01bfe0bb4e5ed5e.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Kid_Icarus.mid
[*] Internal link: https://www.vgmusic.com/file/065cb67cb597703b406782f809ee2ea5.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/xb_kh_endworldbattle_p.mid
[*] Internal link: https://www.vgmusic.com/file/25d105c7601d49ec822fc2abb33300fb.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Kingdom_Hearts_Dearly_Beloved.mid
[*] Internal link: https://www.vgmusic.com/file/fb1fd98259a304bbdaf038c63ad21e2a.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/hikaripiano.mid
[*] Internal link: https://www.vgmusic.com/file/fe81c2e5bc4bac0d17f9d8f9f1971874.html


[*] Internal link: https://www.vgmusic.com/file/c5d3e990ba04ef52f3b802980fe4f6a7.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/lom_gatosolo.mid
[*] Internal link: https://www.vgmusic.com/file/618925de6d25775870e2b44412162b2c.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/lom_daedelsolo.mid
[*] Internal link: https://www.vgmusic.com/file/3516042483348005b92bd2bed108793b.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/lom_drseedsolo.mid
[*] Internal link: https://www.vgmusic.com/file/4f8767e75d67917fe3632cf90d03f167.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/lom_dominasolo.mid
[*] Internal link: https://www.vgmusic.com/file/dd30eef884ae0b432ebad5a5255a7c5b.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/lom_jousolo.mid
[*] Internal link: https://www.vgmusic.com/file/25bd0a395f500ae99bed8c75ff585165.html
[*] Internal link: 

[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Ghosts_of_the_Past.mid
[*] Internal link: https://www.vgmusic.com/file/260d105d2862879b514c94871aa93020.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/POGetUpAS.mid
[*] Internal link: https://www.vgmusic.com/file/b50e11101e921ed0637d3d9dba5f55ce.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/punch.mid
[*] Internal link: https://www.vgmusic.com/file/ee43eb2789518518527252928f351c8f.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Mirrors_edge_piano_still_alive.mid
[*] Internal link: https://www.vgmusic.com/file/52287aff54184284a71e403e87183ffb.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Embassy_Function_GM.mid
[*] Internal link: https://www.vgmusic.com/file/eb57edd71c5362d472828373daf4a781.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Mother_Bein_Fr

[*] Internal link: https://www.vgmusic.com/file/4d75cd3a58ebb42d495ff941e97b2f00.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/strange_tale.mid
[*] Internal link: https://www.vgmusic.com/file/12098b332a6250f54588240cf22281d2.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/holy_intruder.mid
[*] Internal link: https://www.vgmusic.com/file/3b82dfd6269bdc21fcf8e50dbd1b7445.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Seiken_Densetsu_3_Decision_Bell.mid
[*] Internal link: https://www.vgmusic.com/file/78d1a103ea63299cea4e48524f69ebdc.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/sacrifice3piano.mid
[*] Internal link: https://www.vgmusic.com/file/a9410ed824b77fc0b61ded96d5e9f186.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/SotC_The_Opened_Way.mid
[*] Internal link: https://www.vgmusic.com/file/d31391c6dfc9b057b1251bde80c69ea3

[*] Internal link: https://www.vgmusic.com/file/b74e72a983f785031befe92cca5bc606.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/SMB3_Toad.mid
[*] Internal link: https://www.vgmusic.com/file/0dfaa262655a46d94cbd7c9413321d91.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/battlerock.mid
[*] Internal link: https://www.vgmusic.com/file/876f864252dcf098549e3a885312f00d.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/buoybase.mid
[*] Internal link: https://www.vgmusic.com/file/38fa155db47de2aa4f4fa78bb7054c35.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/good_egg_galaxy_piano.mid
[*] Internal link: https://www.vgmusic.com/file/8212e2ce01ccceb808e57d474211a973.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Obseravtory.mid
[*] Internal link: https://www.vgmusic.com/file/189cfe071373e35f14f9cf512b1cdb3e.html
[*] Internal link: https:

[*] Internal link: https://www.vgmusic.com/file/3b86effc3704d4479830e673efb5c94a.html
[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Zombie_Panic.mid
[*] Internal link: https://www.vgmusic.com/file/3a48d18ca2633c65431924675c8ca316.html
[*] Internal link: http://www.vgmusic.com/


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


[!] External link: http://disqus.com/
[!] External link: https://disqus.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


[*] Internal link: https://www.vgmusic.com/music/other/miscellaneous/piano/Europe(piano).mid


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

In [41]:
links = [i for i in internal_urls if i[-3:] == 'mid']

In [49]:
def getFileName(link):
    link.split('/')[::-1][0]
    
    return filename

def download_midi(url):
    filename = url.split('/')[::-1][0]
    mid_file = requests.get(url, stream=True)
    with open('..\\..\\data\\videogame_midi\\' + filename, 'wb') as saveMidFile:
        saveMidFile.write(mid_file.content)
        print('Downloaded {} successfully.'.format(filename))

In [53]:
for link in links:
    download_midi(link)

Downloaded Rock_the_Mic_-_piano.mid successfully.
Downloaded MNG-tosasolo.mid successfully.
Downloaded BKendsolo.mid successfully.
Downloaded BT_Character_Parade_Piano.mid successfully.
Downloaded ValleyWhereTheWindIsBorn.mid successfully.
Downloaded darkearthsolo.mid successfully.
Downloaded lurk_in_dark.mid successfully.
Downloaded oh_light.mid successfully.
Downloaded tomahawk.mid successfully.
Downloaded BlueStone_Zelda_Church.mid successfully.
Downloaded Rachel_Piano_tempofix.mid successfully.
Downloaded Thosechosenby_the_planet.mid successfully.
Downloaded DDRSo_Deep_Piano.mid successfully.
Downloaded Hit_the_Targets.mid successfully.
Downloaded CurseoftheDeadShips-HowardDrossin.mid successfully.
Downloaded FF3_Battle_%28Piano%29.mid successfully.
Downloaded Kalimari_Desert_MK64.mid successfully.
Downloaded Overture_WIP.mid successfully.
Downloaded 07bubbleman.mid successfully.
Downloaded surf-1.mid successfully.
Downloaded festivalsolo.mid successfully.
Downloaded ff9-battle-pia

Downloaded ff2japantown.mid successfully.
Downloaded dino_riki_1.mid successfully.
Downloaded B-K_RBBAquatic_Piano.mid successfully.
Downloaded simc-toensolo.mid successfully.
Downloaded lom_pastoralsolo.mid successfully.
Downloaded Piano_Only_Hyrule_Temple.mid successfully.
Downloaded intro.mid successfully.
Downloaded magmail.mid successfully.
Downloaded hyrule.mid successfully.
Downloaded DKCDKQ-snakesolo.mid successfully.
Downloaded The_Story_Is_Over.mid successfully.
Downloaded BaldursGate-Safe_in_Beregost.mid successfully.
Downloaded batman_megadrive_flugelheim_museum_piano.mid successfully.
Downloaded KBB_-_Game_Over_Piano_Waltz.mid successfully.
Downloaded ff4-town.mid successfully.
Downloaded WWIntroPiano.mid successfully.
Downloaded gravityman.mid successfully.
Downloaded Oakvale.mid successfully.
Downloaded Ed_Foley--BTFM_Simple.mid successfully.
Downloaded BanjoKazooie.mid successfully.
Downloaded Sarias_Song_piano.mid successfully.
Downloaded Peril_piano.mid successfully.
