In [1]:
import sys
import os
import numpy as np
import pandas as pd
import json
import requests
import time

In [35]:
def get_maps_data(start_page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
    maps_data = []
    page_num = start_page
    is_done = False
    # Download 50 pages at a time to prevent the server timing out
    while page_num < start_page + 50:
        page_string = "https://beatsaver.com/api/maps/rating/{}".format(page_num)
        data = requests.get(page_string, headers=headers)
        # Successfully got data
        if data.status_code == 200:
            # Process data
            data_json = data.json()
            if page_num == 0:
                print("Total number of pages:", data_json['lastPage'])
                print("Total number of songs:", data_json['totalDocs'])
            if page_num % 20 == 0:
                print("Completed up to page", page_num)
            # print(json.dumps(data_json, indent=2))
            # print(data_json['docs'][0]['name'])
            # print(len(data_json['docs']))
            for doc in data_json['docs']:
                maps_data.append(doc)

            page_num = data_json['nextPage']
            if page_num + 1 > data_json['lastPage']:
                is_done = True
                return page_num, maps_data, is_done, 0
        # Timeout. Hit rate limit
        elif data.status_code == 429: 
            print("Timeout at page", page_num, "Num songs this time:", len(maps_data), "Status code:", data.status_code, "Timeout len:", data.json()['resetAfter'])
            return page_num, maps_data, is_done, data.json()['resetAfter']
        # Error getting page
        else:
            print("Couldn't get page", page_num, "Status code:", data.status_code, "\nResponse:", data.content)
            return page_num, maps_data, is_done, 0
        
    print("Completed 50 pages, now at page:", page_num)
    return page_num, maps_data, is_done, 10000

In [36]:
# Call with max_pages at a lower value if you only want a subset of the data (multiple of 50 pages)
def get_all_maps_data(max_pages=40000):
    curr_page = 0
    all_maps_data = []
    is_done = False
    start_time = time.time()
    while not is_done:
        curr_page, maps_data, is_done, timeout = get_maps_data(curr_page)
        all_maps_data.extend(maps_data)
        if curr_page >= max_pages:
            is_done = True
        elif timeout:
            # Sleep to reset timeout
            print("Sleeping for {} seconds to reset timeout timer".format((timeout / 1000) + 1))
            time.sleep((timeout / 1000) + 2) 

    end_time = time.time()
    print("Time taken to download: {:.2f} seconds".format(end_time - start_time))
    print("Number of songs:", len(all_maps_data))
    print("Size of data:", sys.getsizeof(all_maps_data))
    return all_maps_data

In [37]:
all_maps_data = get_all_maps_data()
with open("maps_data.json", 'w') as f:
    json.dump(all_maps_data, f)

Total number of pages: 1635
Total number of songs: 40891
Completed up to page 0
Completed up to page 20
Completed up to page 40
Completed 50 pages, now at page: 50
Sleeping for 11.0 seconds to reset timeout timer
Timeout at page 60 Num songs this time: 250 Status code: 429 Timeout len: 195
Sleeping for 1.195 seconds to reset timeout timer
Completed up to page 60
Completed up to page 80
Completed up to page 100
Completed 50 pages, now at page: 110
Sleeping for 11.0 seconds to reset timeout timer
Timeout at page 120 Num songs this time: 250 Status code: 429 Timeout len: 374
Sleeping for 1.374 seconds to reset timeout timer
Completed up to page 120
Completed up to page 140
Completed up to page 160
Completed 50 pages, now at page: 170
Sleeping for 11.0 seconds to reset timeout timer
Timeout at page 180 Num songs this time: 250 Status code: 429 Timeout len: 175
Sleeping for 1.175 seconds to reset timeout timer
Completed up to page 180
Completed up to page 200
Completed up to page 220
Comple