In [46]:
import sys
import os
import numpy as np
import pandas as pd
import json
import pickle
import requests
import time

In [72]:
def remove_below_expert_songs(maps_data):
    """Drop, in place, every map that has neither an expert nor an expert+ version.

    The model only targets the hardest difficulties, so maps without either
    of them are of no use.

    Args:
        maps_data: list of map dicts. Each entry is read at
            ['metadata']['difficulties'] for its 'expert' and 'expertPlus'
            flags. The list itself is mutated.

    Prints how many maps were removed; returns None.
    """
    removed = 0
    idx = len(maps_data)
    # Walk from the tail so that popping an entry never shifts an index we still have to visit
    while idx > 0:
        idx -= 1
        flags = maps_data[idx]['metadata']['difficulties']
        # Flags are compared against True explicitly (not by truthiness);
        # 'expertPlus' is only looked at when 'expert' is not set
        if flags['expert'] == True or flags['expertPlus'] == True:
            continue
        maps_data.pop(idx)
        removed += 1
    print("Removed {} maps which didn't have expert or expert+ difficulty".format(removed))

In [73]:
def remove_low_rated_songs(maps_data, min_rating=0.6):
    """Drop, in place, every map whose rating is below `min_rating`.

    Poorly rated maps are removed so that the model is only shown maps
    which are considered higher quality.

    Args:
        maps_data: list of map dicts; each entry is read at
            ['stats']['rating']. The list itself is mutated.
        min_rating: lowest rating that is still kept (a map rated exactly
            `min_rating` stays).

    Prints how many maps were removed; returns None.
    """
    removed = 0
    # Reverse order keeps the not-yet-visited indices valid while entries are popped
    for idx in reversed(range(len(maps_data))):
        if maps_data[idx]['stats']['rating'] < min_rating:
            maps_data.pop(idx)
            removed += 1
    print("Removed {} maps which didn't reach a rating of atleast {}".format(removed, min_rating))

In [74]:
def remove_non_standard_songs(maps_data):
    """Keep only the "Standard" game mode of each map and drop maps that have none.

    Maps that exist only for non-standard game modes (such as one saber mode
    or darth maul mode) are removed from `maps_data`. For the remaining maps,
    every non-standard entry is stripped from
    ['metadata']['characteristics'], so the entries that remain are all
    "Standard" ones.

    Args:
        maps_data: list of map dicts; each entry is read (and possibly
            rewritten) at ['metadata']['characteristics'], a list of dicts
            with a 'name' key. The list itself is mutated.

    Prints how many maps were removed; returns None.
    """
    song_del_count = 0
    # Iterate backwards so deleting a map never shifts the indices still to be visited
    for i in range(len(maps_data) - 1, -1, -1):
        song_map = maps_data[i]
        characteristics = song_map['metadata']['characteristics']

        # Build the filtered list in one pass instead of deleting by index, which
        # sidesteps the shifting-index problem without a numpy round trip
        standard_only = [mode for mode in characteristics if mode['name'] == "Standard"]

        # No standard mode version at all: drop the map and leave the discarded record untouched
        if not standard_only:
            del maps_data[i]
            song_del_count += 1
            continue

        # Nothing was stripped, so the stored list is already correct
        if len(standard_only) == len(characteristics):
            continue

        # More than one "Standard" entry is unexpected; report it for manual inspection
        if len(standard_only) > 1:
            print("Length not in 0 or 1. Length:", len(standard_only), "i:", i)
            for mode in standard_only:
                print(mode['name'])

        song_map['metadata']['characteristics'] = standard_only

    print("Removed {} maps which didn't have a standard mode version".format(song_del_count))

In [75]:
def remove_short_or_long_songs(maps_data, duration_lower_bound=20, duration_upper_bound=600):
    """Drop, in place, maps whose length falls outside the allowed range.

    Very short maps are usually "meme" maps with weird patterns (such as the
    fortnite dance map), and very long ones are usually "meme" maps as well
    (such as the entire Shrek movie map), so both are ignored.

    Only the first entry of ['metadata']['characteristics'] is inspected, and
    within it only the first difficulty that actually has data, since the
    song length is the same across difficulties. Maps with no characteristics,
    or with no populated difficulty, are left alone.

    Args:
        maps_data: list of map dicts; mutated in place.
        duration_lower_bound: maps with a 'length' strictly below this are
            removed as too short (default 20).
        duration_upper_bound: maps with a 'length' strictly above this are
            removed as too long (default 600).

    Prints how many maps were removed for each reason; returns None.
    """
    too_short = 0
    too_long = 0
    # Reverse order keeps the not-yet-visited indices valid while entries are deleted
    for idx in reversed(range(len(maps_data))):
        modes = maps_data[idx]['metadata']['characteristics']
        if len(modes) == 0:
            continue

        first_mode = modes[0]
        # Should never trigger once remove_non_standard_songs has been run
        if first_mode['name'] != "Standard":
            print("Non-standard mode found. i:", idx, "mode name:", first_mode['name'])

        # First difficulty this map actually has a version for, if any
        populated = next(
            (info for info in first_mode['difficulties'].values() if info is not None),
            None,
        )
        if populated is None:
            continue

        if populated['length'] < duration_lower_bound:
            del maps_data[idx]
            too_short += 1
        elif populated['length'] > duration_upper_bound:
            del maps_data[idx]
            too_long += 1

    print("Removed {} maps which were too short and {} maps which were too long".format(too_short, too_long))

In [81]:
def convert_to_pandas(maps_data, lower_nps_quantile=0.03, upper_nps_quantile=0.995, upper_bombs_quantile=0.97):
    """Flatten the expert / expert+ versions of every map into a DataFrame and drop outliers.

    One row is produced per map and per available "expert" or "expertPlus"
    difficulty, taken from the first entry of ['metadata']['characteristics'].
    Rows with zero notes or zero length are discarded as invalid, a notes per
    second ('nps') column is added, and rows outside the requested quantile
    cutoffs are removed.

    Args:
        maps_data: list of map dicts. Every map must have at least one entry
            in ['metadata']['characteristics'] (an IndexError is raised
            otherwise).
        lower_nps_quantile: rows with an nps below this quantile are dropped.
            Should remove most 'light show' / 'wall art' maps that weren't
            tagged, as well as extremely easy maps.
        upper_nps_quantile: rows with an nps above this quantile are dropped,
            to exclude extremely hard maps.
        upper_bombs_quantile: rows with more bombs than this quantile are
            dropped, to remove 'dance' maps which use mostly bombs and walls.
            There is no lower cutoff because a map without bombs is fine.

    Returns:
        A DataFrame with the columns song_name, key, download_URL, difficulty,
        rating, notes, bombs, length, njs and nps. The index is whatever is
        left of the original row numbering (it is not reset). When no
        expert / expert+ version is found at all, an empty DataFrame with
        those columns is returned.
    """
    columns = ['song_name', 'key', 'download_URL', 'difficulty', 'rating',
               'notes', 'bombs', 'length', 'njs']

    # Extract the important values for each map into a list of dicts for pandas
    list_of_vals = []
    for song_map in maps_data:
        characteristics = song_map['metadata']['characteristics'][0]
        for difficulty, data in characteristics['difficulties'].items():
            # Skip difficulties the map has no version for, and anything below expert
            if data is None or difficulty not in ("expert", "expertPlus"):
                continue
            list_of_vals.append({'song_name' : song_map['name'],
                                 'key' : song_map['key'],
                                 'download_URL' : song_map['downloadURL'],
                                 'difficulty' : difficulty,
                                 'rating' : song_map['stats']['rating'],
                                 'notes' : data['notes'],
                                 'bombs' : data['bombs'],
                                 'length' : data['length'],
                                 'njs' : data['njs']
                                 })

    # A DataFrame built from an empty list has no columns at all, which would make
    # every column lookup below fail, so hand back an empty frame right away
    if not list_of_vals:
        return pd.DataFrame(columns=columns + ['nps'])

    # Use the list to make the pandas dataframe
    maps_df = pd.DataFrame(list_of_vals, columns=columns)

    # Remove invalid maps (for some reason there are a few maps with no notes)
    zero_val_row = (maps_df[['notes', 'length']] == 0).any(axis=1)
    # Counting with sum() also works when there is no invalid row at all.
    # The copy makes the 'nps' assignment below write to an independent frame
    # instead of a slice of the unfiltered one
    maps_df = maps_df[~zero_val_row].copy()
    print("Removed {} maps which had values set at either zero notes or zero length".format(int(zero_val_row.sum())))
    # Make sure none of our songs are invalid
    assert ((maps_df['notes'].values == 0).sum()) == 0
    assert ((maps_df['length'].values == 0).sum()) == 0

    # Outliers are judged on notes per second rather than on the raw note count,
    # because song duration varies greatly: a short song can have few notes and still be good
    maps_df['nps'] = maps_df['notes'] / maps_df['length']
    lower_nps_cutoff = maps_df['nps'].quantile(lower_nps_quantile)
    upper_nps_cutoff = maps_df['nps'].quantile(upper_nps_quantile)

    # Upper cutoff only: we don't care if a map has no bombs
    upper_bombs_cutoff = maps_df['bombs'].quantile(upper_bombs_quantile)

    size_before_removing_outliers = len(maps_df)
    maps_df = maps_df[(maps_df['nps'] >= lower_nps_cutoff)
                     &(maps_df['nps'] <= upper_nps_cutoff)
                     &(maps_df['bombs'] <= upper_bombs_cutoff)]
    print("Removed {} maps which were outliers in terms of number of notes per second or bombs".format(size_before_removing_outliers - len(maps_df)))

    return maps_df

In [82]:
# Load the raw map records
with open("maps_data.json", 'r') as f:
    maps_data = json.load(f)

# ------------------------- Removal settings ------------------------- #
min_rating = 0.54            # Maps rated below this are dropped
duration_lower_bound = 20    # Seconds; shorter maps are dropped
duration_upper_bound = 600   # Seconds; longer maps are dropped
lower_nps_quantile = 0.03    # Quantile of notes per second below which maps are dropped
upper_nps_quantile = 0.995   # Quantile of notes per second above which maps are dropped
upper_bombs_quantile = 0.97  # Quantile of bomb count above which maps are dropped
# -------------------------------------------------------------------- #

print("Number of songs originally:", len(maps_data))

# Every stage filters maps_data in place; the remaining total is reported after each one
filter_stages = [
    (lambda: remove_below_expert_songs(maps_data),
     "Number of songs after removing ones without expert or expert+:"),
    (lambda: remove_low_rated_songs(maps_data, min_rating=min_rating),
     "Number of songs after removing low rated ones:"),
    (lambda: remove_non_standard_songs(maps_data),
     "Number of songs after removing ones without a standard mode version:"),
    (lambda: remove_short_or_long_songs(maps_data,
                                        duration_lower_bound=duration_lower_bound,
                                        duration_upper_bound=duration_upper_bound),
     "Number of songs after removing ones that are too short or too long:"),
]
for run_stage, summary in filter_stages:
    run_stage()
    print(summary, len(maps_data))

# The last stage switches from one entry per song to one row per (song, difficulty)
maps_df = convert_to_pandas(
    maps_data,
    lower_nps_quantile=lower_nps_quantile,
    upper_nps_quantile=upper_nps_quantile,
    upper_bombs_quantile=upper_bombs_quantile,
)
print("Number of maps (seperated by difficulty) after removing invalid ones and outliers in notes per second or bombs:", len(maps_df))

# Persist the dataframe as a pickle file
maps_df.to_pickle("maps_df.pkl")

Number of songs originally: 40875
Removed 6220 maps which didn't have expert or expert+ difficulty
Number of songs after removing ones without expert or expert+: 34655
Removed 8084 maps which didn't reach a rating of atleast 0.54
Number of songs after removing low rated ones: 26571
Removed 101 maps which didn't have a standard mode version
Number of songs after removing ones without a standard mode version: 26470
Removed 1117 maps which were too short and 105 maps which were too long
Number of songs after removing ones that are too short or too long: 25248
Removed 10 maps which had values set at either zero notes or zero length
Removed 1908 maps which were outliers in terms of number of notes per second or bombs
Number of maps (seperated by difficulty) after removing invalid ones and outliers in notes per second or bombs: 28593


In [None]:
# Ad-hoc look at how a single map record is laid out (uses entries 463 and 33 of maps_data)
probe_map = maps_data[463]
probe_modes = probe_map['metadata']['characteristics']

print(type(probe_modes))
print(len(probe_modes))
print(probe_map.keys())
print(probe_map['metadata'].keys())
print(probe_modes[0]['difficulties'].values())
print(json.dumps(maps_data[33]['metadata']['difficulties'], indent=2))