In [2]:
import sys
import os
import numpy as np
import pandas as pd
import string
import json
import pickle
from zipfile import ZipFile
import requests
from io import BytesIO
import time
import timeit
import re

In [355]:
def download_song(download_url, song_name, out_filename):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
    done, successful = False, False
    page_string = "https://beatsaver.com{}".format(download_url)
    request_data = requests.get(page_string, headers=headers)
    requirements = []
    # Successfully got song data
    if request_data.status_code == 200:
        # Process data which is a zip file
        data_zip = BytesIO(request_data.content)
        
        # Get the key from the download url to do fast lookup in pandas df
        key = download_url.rsplit('/', 1)[-1]
        done = True
        # Using a lazy try catch so I can let it run all night without it ending randomly due to some small error
        try:
            # Open a second zip and move the good files from the original to the new one
            with ZipFile(data_zip) as folder, ZipFile('Zip_Songs_Data/{}.zip'.format(out_filename), 'w') as out_zip:
                filenames = folder.namelist()
                # Difficulties which have met our criteria
                difficulties = (maps_df.loc[maps_df['key'] == key])['difficulty'].values
                difficulties = [(diff[0].upper() + diff[1:]) for diff in difficulties]
                if "ExpertPlus" in difficulties:
                    difficulties = np.append(difficulties, "Expert+") # Old naming convention

                # Find info.dat file        
                info_files = list(filter(lambda x: re.match(r'(^(info)*\.dat$)', x, flags=re.I), filenames)) 
                if len(info_files) == 0:
                    raise Exception("No info.dat file found. Filenames: {}".format(filenames))
                
                # Add info.dat file and any difficulty .dat files which are in acceptable difficulties
                num_diff_dats = 0
                for info_file_zip in info_files: # Should only be one info file
                    out_zip.writestr(info_file_zip, folder.read(info_file_zip))
                    with folder.open(info_file_zip) as info_file:
                        info_json = json.load(info_file)
                        diff_sets = info_json['_difficultyBeatmapSets']
                        # Difficulty sets based on game type.. Only care about standard
                        for diff_set in diff_sets: 
                            if diff_set['_beatmapCharacteristicName'] != "Standard":
                                continue
                            beatmap_diffs = diff_set['_difficultyBeatmaps']
                            # Beatmaps based on difficulty in standard mode
                            for beatmap_diff in beatmap_diffs:
                                # If this difficulty map meets the criteria used in data filtering
                                if beatmap_diff['_difficulty'] in difficulties:
                                    diff_dat_file = beatmap_diff['_beatmapFilename']
                                    if diff_dat_file not in filenames: # Should never happen
                                        raise Exception("Diff dat file not found in filenames: {}. Filenames: {}".format(diff_dat_file, filenames))
                                    # Standardize the names of .dat for ease of use. Prevents names like hell.dat for expert+ ruining the data
                                    dat_file_name = "Expert.dat"
                                    if beatmap_diff['_difficulty'] == "ExpertPlus" or beatmap_diff['_difficulty'] == "Expert+":
                                        dat_file_name = "ExpertPlus.dat"
                                    out_zip.writestr(dat_file_name, folder.read(diff_dat_file))
                                    num_diff_dats += 1
                                    # Add the requirements to the dataframe
                                    try:
                                        if len(beatmap_diff['_customData']['_requirements']) >= 1:
                                            requirements.append(beatmap_diff['_customData']['_requirements'])
                                            # print("Song {} has the following requirements: {}".format(song_name, requirements))
                                    except KeyError as ke: # Some songs don't have a requirements or customdata
                                        continue


                if num_diff_dats <= 0:
                    raise Exception("No suitable difficulty dat files found.\nFilenames: {}.\nDifficulties: {}".format(filenames, difficulties))
                
                # Add cover image to output folder
                cover_imgs = list(filter(lambda x: re.match(r'(cover\.(jpg|png|jpeg)$)', x, flags=re.I), filenames)) 
                if len(cover_imgs) == 0: # Just add any png/jpg in the folder and call it cover
                    cover_imgs = list(filter(lambda x: re.match(r'(.*\.(jpg|png|jpeg))', x, flags=re.I), filenames)) 
                    # If there are no images in the song at all then use the default
                    if len(cover_imgs) == 0:
                        print("No cover image found in song {}. Filenames: ".format(song_name), filenames)
                        out_zip.write("Zip_Songs_Data/cover.jpg", "cover.jpg")
                    else:
                        out_zip.writestr(cover_imgs[0], folder.read(cover_imgs[0]))
                # Want to just add the first image, dont care if there is more than 1
                else:
                    out_zip.writestr(cover_imgs[0], folder.read(cover_imgs[0]))

                # Add song itself to output zip file
                song_files = list(filter(lambda x: re.match(r'(^.+\.(egg|ogg|mp4|mp3))', x, flags=re.I), filenames))
                if len(song_files) == 0: # Very bad, no song found in folder
                    out_zip.close()
                    raise Exception("No song file found in folder for song: {}. Filenames: {}".format(song_name, filenames))
                for song_file in song_files: # If multiple song files we'll deal with it in processing
                    out_zip.writestr(song_file, folder.read(song_file))
                
                # Make sure we have atleast one info.dat and one difficulty.dat
                out_filenames = out_zip.namelist()
                if sum('.dat' in f for f in out_filenames) <= 1:
                    out_zip.close()
                    raise Exception("Don't have atleast two dat files for song: {}. Filenames: {}".format(song_name, filenames))
                
                successful = True
                return done, successful, requirements, 0

        except Exception as e:
            print("Error {} occured when downloading song: {}. Key: {}. Total download count: {}".format(e, song_name, key, total_download_count))
            # Store the missed download in text file so we can download it later
            with open('failed_download_nums.txt', 'a') as f:
                f.write('(Num: {}, Key: {}), '.format(total_download_count, key))
            # Delete the failed download zip's file
            os.remove('Zip_Songs_Data/{}.zip'.format(out_filename))
            return done, successful, requirements, 5000
    
    # Timeout. Hit rate limit
    elif request_data.status_code == 429: 
        print("Timeout. Status code:", request_data.status_code, "Timeout len:", request_data.json()['resetAfter'])
        return done, successful, requirements, request_data.json()['resetAfter']
    # Some other error
    else:
        done = True
        print("Couldn't get song: {}. Status code: {}. Response: {}".format(song_name, request_data.status_code, request_data.content))
        return done, successful, requirements, 5000

In [356]:
def download_all_songs(maps_df, start_song=0, max_songs=50000):
    print("Starting to download {} songs starting at song {}".format(min(len(maps_df.download_URL.unique()), max_songs), start_song))
    global total_download_count
    download_count = 0
    start_count = 0

    start_time = time.time()
    for download_url in maps_df.download_URL.unique():
        if download_count > max_songs:
            break
        if start_count < start_song:
            start_count += 1
            continue
        if total_download_count % 50 == 0 and total_download_count != 0:
            print("Downloaded up to song: ", total_download_count)
        if download_count % 1000 == 0 and download_count != 0:
            curr_time = time.time()
            print("Sleeping for 60 seconds to reset timeout timer. Time elasped: {:.2f}".format(curr_time - start_time))
            time.sleep(60)

        # Get the song name using the key found in the download url
        key = download_url.rsplit('/', 1)[-1]
        song_name = (maps_df.loc[maps_df['key'] == key])['song_name'].values[0] # Key is unique so only one value
        # Determine acceptable file name given the song name
        valid_filename_chars = "-_.() %s%s" % (string.digits, string.ascii_letters)
        valid_filename = ''.join(char for char in song_name if char in valid_filename_chars)
        valid_filename = valid_filename.replace(' ','_')
        out_filename = "({})_{}".format(key, valid_filename)

        done = False
        successful = False
        # Keep attempting to download if it keeps timing out
        while not done:
            done, successful, requirements, timeout = download_song(download_url, song_name, out_filename)
            if timeout:
                # Sleep to reset timeout
                print("Sleeping for {} seconds to reset timeout timer".format((timeout / 1000) + 1))
                time.sleep((timeout / 1000) + 2)
        
        # Add path to file if successful
        if successful:
            maps_df.loc[maps_df['key'] == key, 'file_path'] = 'Zip_Songs_Data/{}.zip'.format(out_filename)
            if len(requirements) >= 1:
                maps_df.loc[maps_df['key'] == key, 'requirements'] = ' '.join([str(req) for req in requirements])
        download_count += 1
        total_download_count += 1
    end_time = time.time()
    print("Time taken to download: {:.2f} seconds".format(end_time - start_time))
    print("Number of songs:", download_count)

In [363]:
# Get maps dataframe back from the pickle file
maps_df = pd.read_pickle("maps_df.pkl")

# Global variable which keeps track of number of sounds downloaded so far
total_download_count = 0

In [364]:
# Download all the songs which get saved into .zip files in the Zip_Song_Data directory
download_all_songs(maps_df, start_song=total_download_count)

Starting to download 12557 songs starting at song 0
Downloaded up to song:  50
Downloaded up to song:  100
Downloaded up to song:  150
Downloaded up to song:  200
Downloaded up to song:  250
Downloaded up to song:  300
Downloaded up to song:  350
Downloaded up to song:  400
Downloaded up to song:  450
Downloaded up to song:  500
Downloaded up to song:  550
Downloaded up to song:  600
Downloaded up to song:  650
Downloaded up to song:  700
Downloaded up to song:  750
Downloaded up to song:  800
Downloaded up to song:  850
Downloaded up to song:  900
Downloaded up to song:  950
Error No suitable difficulty dat files found.
Filenames: ['cover.jpg', 'Expert.dat', 'info.dat', 'TheFatRat - Oblivion Feat. Lola Blanc.egg'].
Difficulties: ['ExpertPlus' 'Expert+'] occured when downloading song: TheFatRat - Oblivion Feat. Lola Blanc. Key: 6087. Total download count: 996
Sleeping for 6.0 seconds to reset timeout timer
Downloaded up to song:  1000
Sleeping for 60 seconds to reset timeout timer. Tim

  return self._open_to_write(zinfo, force_zip64=force_zip64)


Couldn't get song: Berry Pop. Status code: 404. Response: b'Not Found'
Sleeping for 6.0 seconds to reset timeout timer
Downloaded up to song:  4250
Downloaded up to song:  4300
Downloaded up to song:  4350
Downloaded up to song:  4400
Downloaded up to song:  4450


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  4500
Downloaded up to song:  4550
Downloaded up to song:  4600
Downloaded up to song:  4650
Downloaded up to song:  4700
Downloaded up to song:  4750
Downloaded up to song:  4800
Downloaded up to song:  4850
Downloaded up to song:  4900
Downloaded up to song:  4950
Downloaded up to song:  5000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 10248.61
Downloaded up to song:  5050
Couldn't get song: Icecore & Kou! - Powerful. Status code: 404. Response: b'Not Found'
Sleeping for 6.0 seconds to reset timeout timer
Downloaded up to song:  5100
Downloaded up to song:  5150
Downloaded up to song:  5200
Downloaded up to song:  5250


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  5300
Downloaded up to song:  5350
Downloaded up to song:  5400
Downloaded up to song:  5450
Timeout. Status code: 429 Timeout len: 5205477
Sleeping for 5206.477 seconds to reset timeout timer
Downloaded up to song:  5500
Downloaded up to song:  5550
Downloaded up to song:  5600
Downloaded up to song:  5650
Downloaded up to song:  5700
Downloaded up to song:  5750
Downloaded up to song:  5800
Downloaded up to song:  5850
Downloaded up to song:  5900
Downloaded up to song:  5950
Downloaded up to song:  6000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 16468.35
Downloaded up to song:  6050
Downloaded up to song:  6100
Couldn't get song: Camellia & DJ Genki - Feelin Sky (Camellia's "200step" Self-remix). Status code: 404. Response: b'Not Found'
Sleeping for 6.0 seconds to reset timeout timer
Downloaded up to song:  6150
Downloaded up to song:  6200
Downloaded up to song:  6250
Downloaded up to song:  6300
Couldn't get song: Tom & Jame - Get Get Down

  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  7500
Downloaded up to song:  7550
Downloaded up to song:  7600
Downloaded up to song:  7650
Downloaded up to song:  7700
Downloaded up to song:  7750
Downloaded up to song:  7800
Downloaded up to song:  7850
Downloaded up to song:  7900
Downloaded up to song:  7950
Downloaded up to song:  8000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 23630.31
Downloaded up to song:  8050
Error No suitable difficulty dat files found.
Filenames: ['Expert.dat', 'FGC.egg', 'FGC.jpg', 'info.dat'].
Difficulties: ['ExpertPlus' 'Expert+'] occured when downloading song: Flex Glue Clear. Key: 5fb9. Total download count: 8070
Sleeping for 6.0 seconds to reset timeout timer
Downloaded up to song:  8100
Downloaded up to song:  8150
Downloaded up to song:  8200
Downloaded up to song:  8250
Downloaded up to song:  8300
Downloaded up to song:  8350
Downloaded up to song:  8400
Downloaded up to song:  8450
Downloaded up to song:  8500
Downloaded up to song:  8550
Downloaded 

  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  8850
Downloaded up to song:  8900
Downloaded up to song:  8950
Downloaded up to song:  9000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 24598.75
Downloaded up to song:  9050
Downloaded up to song:  9100
Downloaded up to song:  9150
Downloaded up to song:  9200
Downloaded up to song:  9250
Downloaded up to song:  9300
Downloaded up to song:  9350
Downloaded up to song:  9400
Downloaded up to song:  9450
Timeout. Status code: 429 Timeout len: 5279720
Sleeping for 5280.72 seconds to reset timeout timer
Downloaded up to song:  9500
Downloaded up to song:  9550
Downloaded up to song:  9600
Downloaded up to song:  9650
Downloaded up to song:  9700
Downloaded up to song:  9750
Downloaded up to song:  9800
Downloaded up to song:  9850
Downloaded up to song:  9900
Downloaded up to song:  9950
Downloaded up to song:  10000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 30859.98
Error No suitable difficulty dat files found.
Filenames: ['cov

  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  10450
Downloaded up to song:  10500
Downloaded up to song:  10550
Downloaded up to song:  10600
Downloaded up to song:  10650
Downloaded up to song:  10700
Downloaded up to song:  10750
Downloaded up to song:  10800
Downloaded up to song:  10850
Downloaded up to song:  10900


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  10950
Downloaded up to song:  11000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 31858.58
Downloaded up to song:  11050
Downloaded up to song:  11100
Downloaded up to song:  11150
Downloaded up to song:  11200
Downloaded up to song:  11250
Downloaded up to song:  11300
Downloaded up to song:  11350
Downloaded up to song:  11400
Downloaded up to song:  11450
Timeout. Status code: 429 Timeout len: 5194628
Sleeping for 5195.628 seconds to reset timeout timer
Downloaded up to song:  11500
Downloaded up to song:  11550
Downloaded up to song:  11600
Downloaded up to song:  11650
Downloaded up to song:  11700
Downloaded up to song:  11750
Downloaded up to song:  11800


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  11850


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  11900
Downloaded up to song:  11950
Downloaded up to song:  12000
Sleeping for 60 seconds to reset timeout timer. Time elasped: 38060.85
Downloaded up to song:  12050


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  12100
Downloaded up to song:  12150
Downloaded up to song:  12200


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  12250
Downloaded up to song:  12300
Downloaded up to song:  12350
Downloaded up to song:  12400
Downloaded up to song:  12450


  return self._open_to_write(zinfo, force_zip64=force_zip64)


Downloaded up to song:  12500
Downloaded up to song:  12550
Time taken to download: 38647.25 seconds
Number of songs: 12557


In [365]:
# RUN AFTER YOU'VE DOWNLOADED ALL SONGS YOU WANT
maps_df = maps_df[maps_df['file_path'] != "NOT_FOUND"]

# Save the updated dataframe in its pickle file
maps_df.to_pickle("downloaded_maps_df.pkl")

In [91]:
test_dir = "D:\Oculus\Games\Software\hyperbolic-magnetism-beat-saber\Beat Saber_Data\CustomLevels"
names = {''}
for subdir, dirs, files in os.walk(test_dir):
    for file in files:
        if not file.endswith(".egg") and not file.endswith(".jpg"):
            names.add(file.rsplit('.', 1)[-1])

print(names)

{'', 'json', 'png', 'jpeg', 'dat', 'mp3', 'xmp', 'PNG', 'mp4', 'ogg', 'exe', 'sfk'}
