In [341]:
import sys
import os
import numpy as np
import pandas as pd
import string
import json
import pickle
import zipfile
import requests
from io import BytesIO
import time
import timeit
import re

In [342]:
def download_song(download_url, song_name, out_filename):
    """Download one song zip from beatsaver.com and repack the files we need.

    The downloaded archive is filtered into ``Zip_Songs_Data/<out_filename>.zip``,
    which receives: the info.dat file, every Standard-mode difficulty .dat whose
    difficulty is listed for this song in ``maps_df`` (stored as ``ExpertPlus.dat``
    for ExpertPlus/Expert+ and as ``Expert.dat`` for anything else), one cover
    image (or the default ``Zip_Songs_Data/cover.jpg``) and every audio file.

    Relies on two notebook globals: ``maps_df`` (its ``key`` and ``difficulty``
    columns are read) and ``total_download_count`` (only used in the failure log).

    Args:
        download_url: Path part of the url, appended to ``https://beatsaver.com``.
            Its last ``/``-separated component is used as the song key.
        song_name: Name of the song; only used in log/error messages.
        out_filename: Base name (without ``.zip``) of the output archive.

    Returns:
        A tuple ``(done, successful, requirements, timeout)``:
        ``done`` is False only when the server answered 429 (rate limited), so the
        caller should retry; ``successful`` is True when the output zip was written;
        ``requirements`` is a list of the non-empty ``_requirements`` lists found on
        the copied difficulties; ``timeout`` is how long the caller should wait, in
        milliseconds (0 on success).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
    requirements = []
    # Get the key from the download url to do fast lookup in pandas df
    key = download_url.rsplit('/', 1)[-1]
    page_string = "https://beatsaver.com{}".format(download_url)
    out_path = 'Zip_Songs_Data/{}.zip'.format(out_filename)

    def record_failure(error):
        # Log the failure, remember the song so it can be retried later, and clean up
        print("Error {} occured when downloading song: {}. Key: {}. Total download count: {}".format(error, song_name, key, total_download_count))
        with open('failed_download_nums.txt', 'a') as f:
            f.write('(Num: {}, Key: {}), '.format(total_download_count, key))
        # The output zip may never have been created (e.g. the source zip was corrupt)
        if os.path.exists(out_path):
            os.remove(out_path)

    try:
        # A timeout keeps one stalled connection from hanging an unattended run forever
        request_data = requests.get(page_string, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        # Network problems are treated like any other failed download: record and move on
        record_failure(e)
        return True, False, requirements, 5000

    # Timeout. Hit rate limit
    if request_data.status_code == 429:
        reset_after = request_data.json()['resetAfter']
        print("Timeout. Status code:", request_data.status_code, "Timeout len:", reset_after)
        return False, False, requirements, reset_after

    # Some other error
    if request_data.status_code != 200:
        print("Couldn't get song: {}. Status code: {}. Response: {}".format(song_name, request_data.status_code, request_data.content))
        return True, False, requirements, 5000

    # Using a lazy try catch so it can run all night without ending randomly due to some small error
    try:
        # The response body is a zip; copy the good files from it into a second zip
        with zipfile.ZipFile(BytesIO(request_data.content)) as folder, zipfile.ZipFile(out_path, 'w') as out_zip:
            filenames = folder.namelist()

            # Difficulties which have met our criteria, with the first letter capitalised
            difficulties = [diff[0].upper() + diff[1:]
                            for diff in maps_df.loc[maps_df['key'] == key, 'difficulty'].values]
            if "ExpertPlus" in difficulties:
                difficulties.append("Expert+") # Old naming convention

            # Find info.dat file (case-insensitive, top level of the archive only)
            info_files = [name for name in filenames if re.match(r'^info\.dat$', name, flags=re.I)]
            if not info_files:
                raise Exception("No info.dat file found. Filenames: {}".format(filenames))

            # Add info.dat file and any difficulty .dat files which are in acceptable difficulties
            num_diff_dats = 0
            for info_file_zip in info_files: # Should only be one info file
                info_bytes = folder.read(info_file_zip)
                out_zip.writestr(info_file_zip, info_bytes)
                info_json = json.loads(info_bytes)
                # Difficulty sets based on game type.. Only care about standard
                for diff_set in info_json['_difficultyBeatmapSets']:
                    if diff_set['_beatmapCharacteristicName'] != "Standard":
                        continue
                    for beatmap_diff in diff_set['_difficultyBeatmaps']:
                        # Skip difficulties which did not meet the criteria used in data filtering
                        if beatmap_diff['_difficulty'] not in difficulties:
                            continue
                        diff_dat_file = beatmap_diff['_beatmapFilename']
                        if diff_dat_file not in filenames: # Should never happen
                            raise Exception("Diff dat file not found in filenames: {}. Filenames: {}".format(diff_dat_file, filenames))
                        # Standardize the names of .dat for ease of use. Prevents names like hell.dat for expert+ ruining the data
                        if beatmap_diff['_difficulty'] in ("ExpertPlus", "Expert+"):
                            dat_file_name = "ExpertPlus.dat"
                        else:
                            dat_file_name = "Expert.dat"
                        out_zip.writestr(dat_file_name, folder.read(diff_dat_file))
                        num_diff_dats += 1
                        # Some songs don't have requirements or custom data at all
                        diff_requirements = (beatmap_diff.get('_customData') or {}).get('_requirements') or []
                        if len(diff_requirements) >= 1:
                            requirements.append(diff_requirements)

            if num_diff_dats <= 0:
                raise Exception("No suitable difficulty dat files found.\nFilenames: {}.\nDifficulties: {}".format(filenames, difficulties))

            # Add cover image to output folder: prefer cover.*, else any image, else the default
            cover_imgs = [name for name in filenames if re.match(r'cover\.(jpg|png|jpeg)$', name, flags=re.I)]
            if not cover_imgs:
                cover_imgs = [name for name in filenames if re.match(r'.*\.(jpg|png|jpeg)$', name, flags=re.I)]
            if cover_imgs:
                # Want to just add the first image, dont care if there is more than 1
                out_zip.writestr(cover_imgs[0], folder.read(cover_imgs[0]))
            else:
                print("No cover image found in song {}. Filenames: ".format(song_name), filenames)
                out_zip.write("Zip_Songs_Data/cover.jpg", "cover.jpg")

            # Add song itself to output zip file. The pattern is anchored so that
            # sidecar files such as "song.ogg.sfk" are not mistaken for audio.
            song_files = [name for name in filenames if re.match(r'^.+\.(egg|ogg|mp4|mp3)$', name, flags=re.I)]
            if not song_files: # Very bad, no song found in folder
                raise Exception("No song file found in folder for song: {}. Filenames: {}".format(song_name, filenames))
            for song_file in song_files: # If multiple song files we'll deal with it in processing
                out_zip.writestr(song_file, folder.read(song_file))

            # Make sure we have atleast one info.dat and one difficulty.dat
            if sum('.dat' in f for f in out_zip.namelist()) <= 1:
                raise Exception("Don't have atleast two dat files for song: {}. Filenames: {}".format(song_name, filenames))

        return True, True, requirements, 0

    except Exception as e:
        # Both zips are closed by the time we get here, so the partial output can be deleted
        record_failure(e)
        return True, False, requirements, 5000

In [343]:
def download_all_songs(maps_df, start_song=0, max_songs=50000):
    """Download a batch of songs and record the resulting zip paths in ``maps_df``.

    Iterates over the unique values of ``maps_df.download_URL``, skipping the
    first ``start_song`` of them and processing at most ``max_songs``. Each song
    is fetched with ``download_song``, retrying for as long as it reports that it
    is not done (rate limited). ``maps_df`` is modified in place: for successful
    downloads ``file_path`` is set to the zip path, and ``requirements`` is set to
    the space-joined requirements when there are any.

    The global ``total_download_count`` is incremented once per processed song
    (successful or not).

    Args:
        maps_df: DataFrame with ``download_URL``, ``key`` and ``song_name`` columns.
        start_song: Number of unique download urls to skip before starting.
        max_songs: Maximum number of songs to process in this call.
    """
    global total_download_count

    # Slicing replaces a manual skip counter and caps the batch at exactly max_songs
    unique_urls = maps_df.download_URL.unique()
    first_song = max(start_song, 0)
    batch_urls = unique_urls[first_song:first_song + max(max_songs, 0)]
    print("Starting to download {} songs starting at song {}".format(len(batch_urls), start_song))

    # Characters allowed in output file names; anything else is dropped from the song name
    valid_filename_chars = "-_.() %s%s" % (string.digits, string.ascii_letters)
    download_count = 0

    start_time = time.time()
    for download_url in batch_urls:
        if total_download_count % 50 == 0 and total_download_count != 0:
            print("Downloaded up to song: ", total_download_count)
        if download_count % 1000 == 0 and download_count != 0:
            curr_time = time.time()
            print("Sleeping for 60 seconds to reset timeout timer. Time elasped: {:.2f}".format(curr_time - start_time))
            time.sleep(60)

        # Get the song name using the key found in the download url
        key = download_url.rsplit('/', 1)[-1]
        song_rows = maps_df['key'] == key
        song_name = maps_df.loc[song_rows, 'song_name'].values[0] # Key is unique so only one value
        # Determine acceptable file name given the song name
        valid_filename = ''.join(char for char in song_name if char in valid_filename_chars)
        valid_filename = valid_filename.replace(' ', '_')
        out_filename = "({})_{}".format(key, valid_filename)

        done = False
        successful = False
        # Keep attempting to download if it keeps timing out
        while not done:
            done, successful, requirements, timeout = download_song(download_url, song_name, out_filename)
            if timeout:
                # timeout is in milliseconds; add a small margin and report the real sleep length
                sleep_seconds = (timeout / 1000) + 2
                print("Sleeping for {} seconds to reset timeout timer".format(sleep_seconds))
                time.sleep(sleep_seconds)

        # Add path to file if successful
        if successful:
            maps_df.loc[song_rows, 'file_path'] = 'Zip_Songs_Data/{}.zip'.format(out_filename)
            if len(requirements) >= 1:
                maps_df.loc[song_rows, 'requirements'] = ' '.join(str(req) for req in requirements)
        download_count += 1
        total_download_count += 1

    end_time = time.time()
    print("Time taken to download: {:.2f} seconds".format(end_time - start_time))
    print("Number of songs:", download_count)

In [344]:
# Get maps dataframe back from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a maps_df.pkl that you created yourself.
maps_df = pd.read_pickle("maps_df.pkl")

# Global variable which keeps track of the number of songs processed so far.
# download_all_songs() increments it and download_song() reads it for its failure log.
total_download_count = 0

In [None]:
# Download the songs, which get saved as .zip files in the Zip_Songs_Data directory.
# start_song=total_download_count means re-running this cell skips the songs already processed.
download_all_songs(maps_df, start_song=total_download_count, max_songs=1000)

In [None]:
# RUN AFTER YOU'VE DOWNLOADED ALL SONGS YOU WANT
# Drop every row whose file_path is still "NOT_FOUND"
# (presumably the placeholder for songs that were never downloaded - TODO confirm).
maps_df = maps_df.loc[maps_df['file_path'].ne("NOT_FOUND")]

# Persist the filtered dataframe to its own pickle file
maps_df.to_pickle("downloaded_maps_df.pkl")

In [268]:
# Inspect every song zip under Zip_Songs_Data and print its Standard-mode difficulty entries
def add_key_to_file_name(maps_df): # This is a fix for a mistake. I shouldve done this from the start
    """Print the Standard-mode difficulty metadata of every song zip in ``Zip_Songs_Data``.

    For each zip found while walking ``Zip_Songs_Data``, the info.dat file is
    parsed and, for every beatmap of the "Standard" characteristic, three things
    are printed: the beatmap entry itself, its ``_customData._requirements``
    (an empty list when absent) and its ``_difficulty``.

    NOTE(review): despite the name, nothing here renames files or modifies
    ``maps_df``; the function currently only prints. Confirm the intended behaviour.

    Args:
        maps_df: Currently unused; kept so existing calls keep working.

    Returns:
        None.
    """
    for subdir, _dirs, files in os.walk('Zip_Songs_Data'):
        for file in files:
            # Build the path from subdir so files in nested folders resolve correctly
            zip_path = os.path.join(subdir, file)
            # The directory also holds non-zip files (e.g. the default cover.jpg); skip them
            if not zipfile.is_zipfile(zip_path):
                continue
            with zipfile.ZipFile(zip_path) as folder:
                info_files = [name for name in folder.namelist()
                              if re.match(r'^info\.dat$', name, flags=re.I)]
                for info_file in info_files:
                    with folder.open(info_file) as f:
                        file_json = json.load(f)
                    for diff_set in file_json['_difficultyBeatmapSets']:
                        if diff_set['_beatmapCharacteristicName'] != "Standard":
                            continue
                        for diff_beatmap in diff_set['_difficultyBeatmaps']:
                            print(diff_beatmap)
                            # Some songs don't have custom data or requirements
                            custom_data = diff_beatmap.get('_customData') or {}
                            print(custom_data.get('_requirements', []))
                            print(diff_beatmap['_difficulty'])

In [91]:
# Collect the distinct file extensions found in the custom levels folder, ignoring .egg and .jpg files.
# Raw string: the Windows path contains backslashes that are not valid escape sequences.
test_dir = r"D:\Oculus\Games\Software\hyperbolic-magnetism-beat-saber\Beat Saber_Data\CustomLevels"
names = {''}
for subdir, dirs, files in os.walk(test_dir):
    for file in files:
        if not file.endswith((".egg", ".jpg")):
            # Text after the last dot; a file without a dot contributes its whole name
            names.add(file.rsplit('.', 1)[-1])

print(names)

{'', 'json', 'png', 'jpeg', 'dat', 'mp3', 'xmp', 'PNG', 'mp4', 'ogg', 'exe', 'sfk'}
