In [169]:
import sys
import os
import numpy as np
import pandas as pd
import string
import json
import pickle
import zipfile
import requests
from io import BytesIO
import time
import timeit
import re

In [184]:
def download_song(download_url, song_name):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
    done = False
    page_string = "https://beatsaver.com{}".format(download_url)
    request_data = requests.get(page_string, headers=headers)

    # Successfully got song data
    if request_data.status_code == 200:
        # Process data which is a zip file
        data_zip = BytesIO(request_data.content)
        valid_filename_chars = "-_.() %s%s" % (string.digits, string.ascii_letters)
        out_filename = ''.join(char for char in song_name if char in valid_filename_chars)
        out_filename = out_filename.replace(' ','_')
        done = True
        # Using a lazy try catch so I can let it run all night without it ending randomly due to some small error
        try:
            # We need to open a second zip files since you cant delete files from a zip file without extracting
            with ZipFile(data_zip) as folder, ZipFile('Zip_Songs_Data/{}.zip'.format(out_filename), 'w') as out_zip:
                filenames = folder.namelist()
                # Get the key from the download url to do fast lookup in pandas df
                key = download_url.rsplit('/', 1)[-1]
                difficulties = (maps_df.loc[maps_df['key'] == key])['difficulty'].values
                expert_files, ex_plus_files = [], []
                # If there is expertplus then we want to save the expertplus .dat file
                if 'expertPlus' in difficulties:
                    # Find any valid expert plus data file. re.I means ignore case
                    ex_plus_files = list(filter(lambda x: re.match(r'((standard)*_?expert_?plus_?(standard)*\.dat$)', x, flags=re.I), filenames)) 
                    if len(ex_plus_files) > 1: # Shouldn't happen
                        print("More than one expert plus match. Fix your regex dumbass. Filenames: ", ex_plus_files)
                    elif len(ex_plus_files) == 1:
                        out_zip.writestr(ex_plus_files[0], folder.read(ex_plus_files[0]))
                
                # If there is expert then we want to save the expert .dat file
                if 'expert' in difficulties:
                    # Find any valid expert data file. re.I means ignore case
                    expert_files = list(filter(lambda x: re.match(r'((standard)*_?expert_?(standard)*\.dat$)', x, flags=re.I), filenames)) 
                    if len(expert_files) > 1: # Shouldn't happen
                        print("More than one expert match. Fix your regex dumbass. Filenames: ", expert_files)
                    elif len(expert_files) == 1:
                        out_zip.writestr(expert_files[0], folder.read(expert_files[0]))
                
                if len(expert_files) == 0 and len(ex_plus_files) == 0:
                    # The author must be naming the .dat stupid names so we'll add as long as there is less than or equal to 2
                    dat_files = list(filter(lambda x: re.match(r'((?!info|metadata).*\.dat$)', x, flags=re.I), filenames)) 
                    if len(dat_files) <= 2:
                        for dat_file in dat_files:
                            out_zip.writestr(dat_file, folder.read(dat_file))
                    else:
                        out_zip.close()
                        raise Exception("Can't find less than 2 correctly named dat files: {}. Filenames: {}".format(song_name, filenames))

                # Add any info.dat or metadata.dat files
                info_files = list(filter(lambda x: re.match(r'(^(info|metadata)*\.dat$)', x, flags=re.I), filenames)) 
                for info_file in info_files:
                    out_zip.writestr(info_file, folder.read(info_file))

                # Add cover image to output folder
                cover_imgs = list(filter(lambda x: re.match(r'(cover\.(jpg|png|jpeg)$)', x, flags=re.I), filenames)) 
                if len(cover_imgs) == 0: # Just add any png/jpg in the folder and call it cover
                    cover_imgs = list(filter(lambda x: re.match(r'(.*\.(jpg|png|jpeg))', x, flags=re.I), filenames)) 
                    # If there are no images in the song at all then use the default
                    if len(cover_imgs) == 0:
                        print("No cover image found in song {}. Filenames: ".format(song_name), filenames)
                        out_zip.write("Zip_Songs_Data/cover.jpg", "cover.jpg")
                    else:
                        out_zip.writestr(cover_imgs[0], folder.read(cover_imgs[0]))
                # Want to just add the first image, dont care if there is more than 1
                else:
                    out_zip.writestr(cover_imgs[0], folder.read(cover_imgs[0]))

                # Add song itself to output zip file
                song_files = list(filter(lambda x: re.match(r'(^.+\.(egg|ogg|mp4|mp3))', x, flags=re.I), filenames))
                if len(song_files) == 0: # Very bad, no song found in folder
                    out_zip.close()
                    raise Exception("No song file found in folder for song: {}. Filenames: {}".format(song_name, filenames))
                for song_file in song_files: # If multiple song files we'll deal with it in processing
                    out_zip.writestr(song_file, folder.read(song_file))
                
                out_filenames = out_zip.namelist()
                if sum('.dat' in f for f in out_filenames) <= 1:
                    out_zip.close()
                    raise Exception("Don't have atleast two dat files for song: {}. Filenames: {}".format(song_name, filenames))

                # removed_files = list(set(filenames) ^ set(out_filenames))
                # special_files = list(filter(lambda x: re.match(r'(?!easy|normal|hard).*', x, flags=re.I), removed_files))
                # if len(special_files) >= 1:
                #     print("Non-difficulty related files removed in song {}. Special files: ".format(song_name), special_files)
                
                return done, 0
        except Exception as e:
            print("Error {} occured when downloading song: {}. Total download count: {}".format(e, song_name, total_download_count))
            # Store the missed download in text file so we can download it later
            with open('failed_download_nums.txt', 'a') as f:
                f.write('{}, '.format(total_download_count))
            # Delete the failed download zip's file
            os.remove("Zip_Songs_Data/{}.zip".format(out_filename))
            return done, 5000
    
    # Timeout. Hit rate limit
    elif request_data.status_code == 429: 
        print("Timeout. Status code:", request_data.status_code, "Timeout len:", request_data.json()['resetAfter'])
        return done, request_data.json()['resetAfter']
    # Some other error
    else:
        done = True
        print("Couldn't get song: {}. Status code: {}. Response: {}".format(song_name, data.status_code, request_data.content))
        return done, 5000

In [182]:
total_download_count = 0

In [183]:
def download_all_songs(maps_df, start_song=0, max_songs=50000):
    print("Starting to download {} songs starting at song {}".format(len(maps_df.download_URL.unique()), start_song))
    global total_download_count
    download_count = 0
    start_time = time.time()
    for download_url in maps_df.download_URL.unique():
        if download_count > max_songs:
            break
        if download_count < start_song:
            continue
        if total_download_count % 50 == 0 and total_download_count != 0:
            print("Downloaded up to song: ", total_download_count)
        if total_download_count % 100 == 0 and total_download_count != 0:
            curr_time = time.time()
            print("Sleeping for 10 seconds to reset timeout timer. Time elasped: {:.2f}".format(curr_time - start_time))
            time.sleep(10)
        # Get the song name using the key found in the download url
        song_name = (maps_df.loc[maps_df['key'] == download_url.rsplit('/', 1)[-1]])['song_name'].values[0]
        done = False
        # Keep attempting to download if it keeps timing out
        while not done:
            done, timeout = download_song(download_url, song_name)
            if timeout:
                # Sleep to reset timeout
                print("Sleeping for {} seconds to reset timeout timer".format((timeout / 1000) + 1))
                time.sleep((timeout / 1000) + 2)

        download_count += 1
        total_download_count += 1
    end_time = time.time()
    print("Time taken to download: {:.2f} seconds".format(end_time - start_time))
    print("Number of songs:", download_count)

In [177]:
# Get maps dataframe back from the pickle file
maps_df = pd.read_pickle("maps_df.pkl")

# Download all the songs which get saved into .zip files in the Zip_Song_Data directory
download_all_songs(maps_df, start_song=total_download_count)

Starting to download 23678 songs starting at song 0
Downloaded up to song:  50
Downloaded up to song:  100
Sleeping for 10 seconds to reset timeout timer. Time elasped: 106.29
Downloaded up to song:  150
Downloaded up to song:  200
Sleeping for 10 seconds to reset timeout timer. Time elasped: 221.22
Downloaded up to song:  250
Downloaded up to song:  300
Sleeping for 10 seconds to reset timeout timer. Time elasped: 339.90
Don't have atleast two dat files for song: [Extra Sensory] Mick Gordon - At DOOM's Gate. Filenames:  ['arti.jpg', 'bird.png', 'cover.jpg', 'cyan.png', 'hell.dat', 'reaxt.png', 'hell.egg', 'Info.dat']
Error 'ZipFile' object has no attribute 'name' occured when downloading song: [Extra Sensory] Mick Gordon - At DOOM's Gate. Total download count: 339
Sleeping for 6.0 seconds to reset timeout timer
Downloaded up to song:  350
Downloaded up to song:  400
Sleeping for 10 seconds to reset timeout timer. Time elasped: 457.25


KeyboardInterrupt: 

In [91]:
test_dir = "D:\Oculus\Games\Software\hyperbolic-magnetism-beat-saber\Beat Saber_Data\CustomLevels"
names = {''}
for subdir, dirs, files in os.walk(test_dir):
    for file in files:
        if not file.endswith(".egg") and not file.endswith(".jpg"):
            names.add(file.rsplit('.', 1)[-1])

print(names)

{'', 'json', 'png', 'jpeg', 'dat', 'mp3', 'xmp', 'PNG', 'mp4', 'ogg', 'exe', 'sfk'}
