In [95]:
import sys
import os
import numpy as np
import pandas as pd
import json
import pickle
from pathlib import Path
from zipfile import ZipFile
from collections import OrderedDict
import re
import time
import librosa
import soundfile as sf
import sklearn
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\Map_Processing")
    sys.path.append(module_path+"\\Note_Timing")

# import importlib
# importlib.reload(sys.modules['analyze_notes'])
# importlib.reload(sys.modules['onset_detection'])

from analyze_notes import get_note_placements_by_index 
from onset_detection import get_onset_times

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Get maps dataframe from the pickle file
maps_df = pd.read_pickle("../Data_Gather_Filter_Download/downloaded_maps_df.pkl")

In [4]:
with open('../Note_Orientation/most_common_placements.pkl', 'rb') as f:
    most_common_placements = pickle.load(f)

In [5]:
#==================================== Dataset Settings ====================================#
# Note: We have to use a subset of all our songs as 15k maps would take days to train
total_data_size = 15000 # Number of maps to use in across all datasets
val_split = 0.2         # Percentage of data put into validation set
test_split = 0.05       # Percentage of data put into testing set
#==========================================================================================#

# Split our data into training and test/val which we will split again
train_df, val_test_df = train_test_split(maps_df[:total_data_size], test_size=val_split + test_split)

# Split the validation and testing data apart into their own respective sets
val_df, test_df = train_test_split(val_test_df, test_size=(test_split/(val_split + test_split)))

print("Number of training maps: {}. Numer of validation maps: {}. Number of testing maps: {}".format(len(train_df), len(val_df), len(test_df)))

Number of training maps: 11250. Numer of validation maps: 3000. Number of testing maps: 750


In [99]:
class beatmap_generator(nn.Module):
    def __init__(self, input_size, output_size, seq_size, hidden_size, num_layers=1, dropout=0):
        super(beatmap_generator, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.seq_size = seq_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # LSTM model
        self.net = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        # Converts output back to valid index into most common notes
        self.decoder = nn.Linear(hidden_size * seq_size, output_size)
    
    def init_hidden_layer(self, batch_size):
        self.batch_size = batch_size # Have to set it here
        hidden_init = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)
        cell_init = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)
        self.hidden = (hidden_init, cell_init)

    def forward(self, data):
        output, self.hidden = self.net(data, self.hidden)   # get the next output and hidden state
        output = output.contiguous().view(data.size(0), -1)
        output = self.decoder(output)                       # predict distribution over next tokens
        output = F.log_softmax(output, dim=1)
        return output

In [91]:
#===================================== Model Settings =====================================#
input_size = 25                             # Number of features in input
output_size = len(most_common_placements)   # Number of possible outputs for model
seq_size = 512                              # Length of note sequences passed into model
hidden_size = 128                           # Number of nodes in hidden layer
num_layers = 2                              # Number of LSTM layers for stacked LSTM
dropout = 0                                 # Probability of droping weights in the dropout layer
#==========================================================================================#

bm_gen = beatmap_generator(input_size, output_size, seq_size, hidden_size, num_layers, dropout)
# Set it to use the GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA available, using GPU")
    bm_gen = bm_gen.cuda()

CUDA available, using GPU


In [92]:
# Adapted from tutorial 5 and 6 code and tutorial 3 and 4
def train_network(model, train_df, val_df, num_epochs=5, learning_rate=learning_rate, batch_size=16):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    losses, train_acc, valid_acc = np.zeros(num_epochs), np.zeros(num_epochs), np.zeros(num_epochs)
    num_songs = 0
    epochs = []
    for epoch in range(num_epochs):
        for beatmap in train_df.itertuples():
            start_time = time.time()
            diff = beatmap.difficulty
            diff = diff[:1].upper() + diff[1:] # Capitialize the first character only
            if 'extensions' in beatmap.requirements:
                continue # Catches a few maps that shouldn't have made it here
            # See if the features have already been calculated
            try:
                folder_name = beatmap.file_path.rsplit('/', 1)[-1]
                folder_name = folder_name.split('.')[0]
                folder_p = Path('Saved_Features/v{}_{}/'.format(features_version, folder_name))
                file_p = folder_p / '{}.npy'.format(diff)
                print(file_p)
                features_dict = np.load(file_p, allow_pickle=True)
                print("Successfully opened file for {} on {} difficulty".format(beatmap.song_name, diff))
                features = features_dict.item().get('features')
                padded_placements = features_dict.item().get('placements')
            
            except Exception as e: # If they haven't been then calculate them now
                print("Calculating features and placements for {} on {} difficulty. Exception: {}".format(beatmap.song_name, diff, e))
                with ZipFile('../Data_Gather_Filter_Download/{}'.format(beatmap.file_path)) as folder:
                    filenames = folder.namelist()
                    with folder.open('{}.dat'.format(diff)) as dat_file:
                        dat_json = json.load(dat_file)
                        placements = get_note_placements_by_index(dat_json, most_common_placements)
                    song_path = list(filter(lambda x: re.match(r'(^.+\.(egg|ogg|mp4|mp3))', x, flags=re.I), filenames))[0]
                    folder.extract(song_path)
                    features, padded_placements = get_features(song_path, bpm=beatmap.bpm, placements=placements, 
                                                               save_data=True, difficulty=diff, folder_path=beatmap.file_path)
                    os.remove(song_path)
            # Create sequence arrays for different permutations
            feature_sequences, placements_for_seq = split_into_sequences(features, padded_placements, 512)
            feature_tensor = torch.from_numpy(feature_sequences).float()
            placements_for_seq_tensor = torch.from_numpy(placements_for_seq).long()
            # Create the data loader we will use
            train_loader = torch.utils.data.DataLoader(feature_tensor, batch_size=batch_size, num_workers=num_workers)
            label_loader = torch.utils.data.DataLoader(placements_for_seq_tensor, batch_size=batch_size, num_workers=num_workers)

            
            print("Looping over {} batches".format(len(train_loader)))
            for feature_batch, target_batch in zip(train_loader, label_loader):
                if torch.cuda.is_available():
                    feature_batch = feature_batch.cuda()
                    target_batch = target_batch.cuda()
                model.init_hidden_layer(feature_batch.size(0))
                output = model(feature_batch)
                # print('output shape: {}'.format(output.shape))
                # output = output.view(output.size(0) * output.size(1), output.size(2))
                # print('output shape: {}'.format(output.shape))
                # print('target shape: {}'.format(target_batch.shape))
                # print('Max in target {}, min in target {}'.format(torch.argmax(target_batch), torch.argmin(target_batch)))
                loss = criterion(output, target_batch)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            
            end_time = time.time()
            print("Time taken for song: {:.2f} seconds".format(end_time - start_time))
        losses[epoch] = float(loss)    
        epochs.append(epoch)
        train_acc[epoch] = get_accuracy(model, train_iter)
        valid_acc[epoch] = get_accuracy(model, valid_iter)
        print("Epoch %d; Loss %f; Train Acc %f; Val Acc %f" % (
            epoch+1, loss, train_acc[epoch], valid_acc[epoch]))

    # plotting
    plt.title("Training Curve")
    plt.plot(losses, label="Train")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    plt.title("Training Curve")
    plt.plot(epochs, train_acc, label="Train")
    plt.plot(epochs, valid_acc, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

In [96]:
#==================================== Training Settings ===================================#
learning_rate = 0.004   # Learning rate
num_epochs = 5          # Number of epochs
batch_size = 32         # Number of sequences to batch together
num_workers = 1         # Number of workers to load the data
features_version = 1    # If I make any large changes that require re-calculating features
#==========================================================================================#


train_network(bm_gen, train_df, val_df, num_epochs=1)

ory: 'Saved_Features\\v1_(23d7)_Red_Hot_Chili_Peppers_-_Otherside\\Expert.npy'
Looping over 483 batches
Time taken for song: 53.60 seconds
Saved_Features\v1_(50ec)_Jump_Training_-_V1\ExpertPlus.npy
Calculating features and placements for Jump Training - V1 on ExpertPlus difficulty. Exception: [Errno 2] No such file or directory: 'Saved_Features\\v1_(50ec)_Jump_Training_-_V1\\ExpertPlus.npy'
Looping over 402 batches
Time taken for song: 50.81 seconds
Saved_Features\v1_(b2f)_Koi_ha_tenshi_no_chaimu_kara_-_Yukari_Tamura\Expert.npy
Calculating features and placements for Koi ha tenshi no chaimu kara - Yukari Tamura on Expert difficulty. Exception: [Errno 2] No such file or directory: 'Saved_Features\\v1_(b2f)_Koi_ha_tenshi_no_chaimu_kara_-_Yukari_Tamura\\Expert.npy'
Looping over 202 batches
Time taken for song: 28.15 seconds
Saved_Features\v1_(b6a2)_DECO27_-_Nocturnal_Kids_ft\ExpertPlus.npy
Calculating features and placements for DECO*27 - Nocturnal Kids [ft. Hatsune Miku] on ExpertPlus di

KeyboardInterrupt: 

In [89]:
def get_features(song_path, bpm, placements=None, save_data=False, difficulty='expertPlus', version=features_version, folder_path=''):
    beat_frames, sr, melspectrogram, chromagram = get_audio_data(song_path, bpm)
    quarter_frames = np.array([]) # Array to store data for every quarter note
    index = 0
    while index < len(beat_frames) - 1:
        quarter_frames = np.append(quarter_frames, np.arange(beat_frames[index], beat_frames[index + 1], 
                                                            round((beat_frames[index + 1] - beat_frames[index]) / 4))[1:4])
        index += 1
    
    beat_frames_with_quarter = np.concatenate((np.array([0]), beat_frames, quarter_frames), axis=0)
    beat_frames_with_quarter.sort() # We appended the quarter notes so we need to sort them into right spots
    beat_frames_with_quarter = beat_frames_with_quarter.astype(int)

    beat_frames_with_mel = librosa.util.sync(melspectrogram, beat_frames_with_quarter, aggregate=np.median)
    beat_frames_with_chroma = librosa.util.sync(chromagram, beat_frames_with_quarter, aggregate=np.median)

    # Want to extend it to 1/16 beats as 1/4 beats limits the max NPS by a lot
    # Using pandas to easily extend to 1/16 beats
    beat_num_with_16th_notes = np.arange(0, len(beat_frames) - 1, 1.0 / 16.0)
    beat_num = np.arange(0, len(beat_frames), 1)
    beats_num_df = pd.concat([pd.Series(beat_num, name='beat_num', dtype=int), pd.Series(beat_frames, name='beat_frame', dtype=int)], axis=1)
    mel_beats_df = pd.concat([pd.Series(beat_frames_with_quarter, name='quater_beat_frame'), pd.DataFrame(beat_frames_with_mel.T)], axis=1)
    chroma_beats_df = pd.concat([pd.Series(beat_frames_with_quarter, name='quater_beat_frame'), pd.DataFrame(beat_frames_with_chroma.T)], axis=1)

    # Dataframe with the beat number, frame number, mel data, and chroma data for the beat
    beats_df = beats_num_df.merge(mel_beats_df, how='outer', left_on='beat_frame', right_on='quater_beat_frame', sort=True)

    # Only need the quarter beat frames
    beats_df = beats_df.drop(columns=['beat_frame']) 
    beats_df = beats_df.merge(chroma_beats_df, how='outer', on='quater_beat_frame', sort=True)

    # Removes the NaNs from the beat number column for next merge
    beats_df.interpolate(inplace=True) 

    # Expand it to be 1/16 beats. Doing it in 1/4 beats saves a lot of computation with sync
    beats_num_16th_notes_df = pd.DataFrame(beat_num_with_16th_notes, columns=['beat_num'])
    beats_df = beats_df.merge(beats_num_16th_notes_df, how='outer', on='beat_num', sort=True)

    # Interpolate the quarter beat frame so we can more accuractly place the placements
    beats_df['quater_beat_frame'].interpolate(inplace=True) 
    beats_df['quater_beat_frame'] = beats_df['quater_beat_frame'].round()
    beats_df = beats_df.fillna(method='pad') # Forward fill

    # Add placement column to store what note type is at that time
    beats_df.insert(1, 'placement', 0)
    # Add column for time since last note since this helps dicate what placements should be done
    beats_df.insert(3, 'time_since_last_note', 0.0)

    # Now the computed audio values for each quarter note are spread among 1/16 notes so we can use sequences of 1/16 notes
    for timing, placement in placements.items():
        time_in_frames = librosa.core.time_to_frames(beat_to_time(timing, bpm), sr=sr)
        try:
            matching_frames = beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement']
            if len(matching_frames) == 1:
                beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'] = placement
            else: # Must be more than one quater beat on that frame
                placement_arr = [placement]
                placement_arr.extend([0] * (len(matching_frames) - 1))
                beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'] = placement_arr
        except: # No exact note timing match
            # Subtract the value and find the one closest to 0
            closest_index = beats_df['quater_beat_frame'].sub(time_in_frames).abs().idxmin()
            beats_df[closest_index]['placement'] = placement
    # Set the placements with 16 to 0 as they represent the same board but are easier to distinguish
    beats_df.loc[beats_df['placement'] == 16, 'placement'] = 0

    # Extract from pandas into list which we can turn into tensor later
    features = []
    last_note_time = 0
    for i in range(len(beats_df.index)):
        # Set the time since the last note
        time_in_s = librosa.core.frames_to_time(beats_df.at[i, 'quater_beat_frame'], sr=sr)
        if beats_df.at[i, 'placement'] != 0 and beats_df.at[i, 'placement'] != 16:
            last_note_time = time_in_s
        if last_note_time != 0:
            beats_df.at[i, 'time_since_last_note'] = (time_in_s - last_note_time)
        # Extract the data from this row
        features.append(beats_df.iloc[i, 3:].tolist())

    # Convert to numpy arrays for easier slicing later
    features = np.asarray(features)
    # Placements made by the human mapper
    human_placements = beats_df['placement'].to_numpy()

    if save_data:
        feat_dict = {'features' : features, 'placements' : human_placements}
        folder_name = folder_path.rsplit('/', 1)[-1]
        folder_name = folder_name.split('.')[0]
        path = 'Saved_Features/v{}_{}/'.format(version, folder_name)
        if not os.path.exists(path):
            os.makedirs(path)
        np.save(os.path.join(path, '{}.npy'.format(difficulty)), feat_dict)

    return features, human_placements

In [11]:
def get_audio_data(song_path, bpm):
    y, sr = librosa.load(song_path)
    length = y.shape[0] / sr # Song length according to librosa in secs (doesn't match given length for some reason)
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, trim=False, bpm=bpm)
    # Reduce n_mels to avoid empty filters in mel frequency basis and to match the size of chrormagram
    melspectrogram = librosa.feature.melspectrogram(y=y_percussive, sr=sr, n_mels=12, fmax=65.4)
    chromagram = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
    return beat_frames, sr, melspectrogram, chromagram

In [12]:
def split_into_sequences(features, placements, seq_len):
    feature_sequences = []
    placements_for_seq = []
    for i in range(len(features)):
        end_index = i + seq_len
        if end_index > len(features):
            break
        feature_seq = features[i:end_index, :] 
        placement_for_seq = placements[i]
        feature_sequences.append(feature_seq)
        placements_for_seq.append(placement_for_seq)
    return np.array(feature_sequences), np.array(placements_for_seq)

In [13]:
def get_accuracy(net, data):
    correct, total = 0, 0
    for sms, labels in data:
        output = net(sms[0])
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += labels.shape[0]
    return correct / total

In [14]:
# Helper functions to do some conversions
# Time in seconds to beat number
def time_to_beat(note_time, bpm):
    return (note_time / 60) * bpm

# Beat number to seconds
def beat_to_time(beat_time, bpm):
    return (beat_time / bpm) * 60

In [137]:
# Returns file path to folder containing all files needed to play song made by model
def get_map_from_song(song_path, model, seq_len=512, output_file_path='Expert.dat', start_time=2, bpm=0):
    # Get the onset times where we will place notes
    onset_times = get_onset_times(song_path, min_sep=0.1)
    num_before = len(onset_times)
    onset_times = np.delete(onset_times, np.where(onset_times <= start_time))
    print("Removed {} onset times for being before the specified start time".format(num_before - len(onset_times)))
    # If the bpm is not provided then we calculate it ourselves
    if bpm == 0:
        y, samp_rate = librosa.load(song_file)
        bpm = librosa.beat.tempo(y=y, sr=samp_rate)
        print("Got a bpm of {}".format(bpm))
    # Determine the notes we should place
    beats_df, sr = generate_placements(song_path, model, bpm, onset_times, seq_len)

    notes_as_json = convert_model_placements_to_valid_json(beats_df, most_common_placements, sr, bpm)
    with open(output_file_path, 'w') as dat_file:
        dat_data = {"_version": "2.2.0",
                    "_customData": {
                        "_time": '',
                        "_BPMChanges": [],
                        "_bookmarks": []
                        },
                    "_events": [],
                    "_notes": notes_as_json,
                    "_obstacles": [],
                    "_waypoints": []
                    }
        json.dump(dat_data, dat_file)
    
    print("Number of notes placed: {}\nNumber of unique note placements: {}\nApprox. notes per second: {}".format(
            len(notes_as_json),
            len(set(notes_list)),
            len(notes_as_json) / np.amax(onset_times)
            )
        )


In [151]:
get_map_from_song('redo.egg', bm_gen, seq_len=seq_size, bpm=190)

Removed 12 onset times for being within 0.1s of the next note
Removed 1 onset times for being before the specified start time
At timing
torch.Size([1, 2001])


TypeError: argmax(): argument 'dim' (position 1) must be int, not Tensor

In [152]:
def generate_placements(song_path, model, bpm, onset_times, seq_len, difficulty='expertPlus', version=features_version):
    beat_frames, sr, melspectrogram, chromagram = get_audio_data(song_path, bpm)
    quarter_frames = np.array([]) # Array to store data for every quarter note
    index = 0
    while index < len(beat_frames) - 1:
        quarter_frames = np.append(quarter_frames, np.arange(beat_frames[index], beat_frames[index + 1], 
                                                            round((beat_frames[index + 1] - beat_frames[index]) / 4))[1:4])
        index += 1
    
    beat_frames_with_quarter = np.concatenate((np.array([0]), beat_frames, quarter_frames), axis=0)
    beat_frames_with_quarter.sort() # We appended the quarter notes so we need to sort them into right spots
    beat_frames_with_quarter = beat_frames_with_quarter.astype(int)

    beat_frames_with_mel = librosa.util.sync(melspectrogram, beat_frames_with_quarter, aggregate=np.median)
    beat_frames_with_chroma = librosa.util.sync(chromagram, beat_frames_with_quarter, aggregate=np.median)

    # Want to extend it to 1/16 beats as 1/4 beats limits the max NPS by a lot
    # Using pandas to "easily" extend to 1/16 beats
    beat_num_with_16th_notes = np.arange(0, len(beat_frames) - 1, 1.0 / 16.0)
    beat_num = np.arange(0, len(beat_frames), 1)
    beats_num_df = pd.concat([pd.Series(beat_num, name='beat_num', dtype=int), pd.Series(beat_frames, name='beat_frame', dtype=int)], axis=1)
    mel_beats_df = pd.concat([pd.Series(beat_frames_with_quarter, name='quater_beat_frame'), pd.DataFrame(beat_frames_with_mel.T)], axis=1)
    chroma_beats_df = pd.concat([pd.Series(beat_frames_with_quarter, name='quater_beat_frame'), pd.DataFrame(beat_frames_with_chroma.T)], axis=1)

    # Dataframe with the beat number, frame number, mel data, and chroma data for the beat
    beats_df = beats_num_df.merge(mel_beats_df, how='outer', left_on='beat_frame', right_on='quater_beat_frame', sort=True)

    # Only need the quarter beat frames
    beats_df = beats_df.drop(columns=['beat_frame']) 
    beats_df = beats_df.merge(chroma_beats_df, how='outer', on='quater_beat_frame', sort=True)

    # Removes the NaNs from the beat number column for next merge
    beats_df.interpolate(inplace=True) 

    # Expand it to be 1/16 beats. Doing it in 1/4 beats saves a lot of computation with sync
    beats_num_16th_notes_df = pd.DataFrame(beat_num_with_16th_notes, columns=['beat_num'])
    beats_df = beats_df.merge(beats_num_16th_notes_df, how='outer', on='beat_num', sort=True)

    # Interpolate the quarter beat frame so we can more accuractly place the placements
    beats_df['quater_beat_frame'].interpolate(inplace=True) 
    beats_df['quater_beat_frame'] = beats_df['quater_beat_frame'].round()
    beats_df = beats_df.fillna(method='pad') # Forward fill

    # Add placement column to store what note type is at that time
    beats_df.insert(1, 'placement', 0)
    # Add column for time since last note since this helps dicate what placements should be done
    beats_df.insert(3, 'time_since_last_note', 0.0)
    print("At timing")
    for timing in onset_times:
        time_in_frames = librosa.core.time_to_frames(beat_to_time(timing, bpm), sr=sr)
        try:
            matching_frames = beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement']
            if len(matching_frames) == 1:
                beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'] = 9999
            else: # Must be more than one quater beat on that frame
                placement_arr = [9999]
                placement_arr.extend([0] * (len(matching_frames) - 1))
                beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'] = placement_arr
        except: # No exact note timing match
            # Subtract the value and find the one closest to 0
            closest_index = beats_df['quater_beat_frame'].sub(time_in_frames).abs().idxmin()
            beats_df[closest_index]['placement'] = 9999            
    
    last_note_time = 0
    for i in range(len(beats_df.index)):
        # Set the time since the last note
        time_in_s = librosa.core.frames_to_time(beats_df.at[i, 'quater_beat_frame'], sr=sr)
        if beats_df.at[i, 'placement'] != 0 and beats_df.at[i, 'placement'] != 16:
            last_note_time = time_in_s
        if last_note_time != 0:
            beats_df.at[i, 'time_since_last_note'] = (time_in_s - last_note_time)
    
    model.init_hidden_layer(batch_size=1)
    placement_indicies = list(np.where(beats_df['placement'] == 9999)[0])
    for index in placement_indicies:
        seq = []
        start_index = index - seq_len
        # print(type(index))
        # print(type(placement_indicies))
        # print(type(start_index))
        # print(start_index)
        if start_index <= 0:
            num_padding = abs(start_index)
            seq.extend([[0] * len(beats_df.iloc[i, 3:].tolist())] * num_padding)
            start_index = 0
        relev_df = beats_df.iloc[start_index:index, 3:]
        for ind, d in relev_df.iterrows():
            seq.append(d.tolist())
        # print(seq)
        seq = np.array(seq)
        feature_tensor = torch.from_numpy(seq).float()
        if torch.cuda.is_available():
            feature_tensor = feature_tensor.cuda()
        output = model(feature_tensor.unsqueeze(0))
        print(output.shape)
        chosen_placement = torch.argmax(output, 1)
        beats_df.at[index, 'placement'] = chosen_placement

    return beats_df, sr

In [130]:
def convert_model_placements_to_valid_json(beats_df, most_common_placements, sr, bpm):
    list_of_jsons = []
    for beat in beats_df.loc[beats_df['placement'] != 0].iterrows():
        placement = beat['placement']
        time_in_beat = time_to_beat(librosa.core.frames_to_time(beat['quater_beat_frame'], sr=sr), bpm)
        placement_info = most_common_placements[placement]
        for i in range(len(placement_info)):
            if placement_info[i] != 0:
                val = placement_info[i]
                # 0 - Red, 1 - Blue 
                colour = 0 if val < 10 else 1
                note_dir = val - (colour * 9) - 1
                col = i % 4
                row = (i - col) / 4
                note_json = {"_time": time_in_beat,
                            "_lineIndex": col,
                            "_lineLayer": row,
                            "_type": colour,
                            "_cutDirection": note_dir}
                list_of_jsons.append(note_json)
    return list_of_jsons

In [101]:
print(most_common_placements[1])

(0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0)


In [16]:
# class Dict_Dataset(torch.utils.data.Dataset):
#     def __init__(self, features, targets):
#         self.targets = targets
#         self.features = features

#     def __getitem__(self, index):
#         print('index : {}'.format(index))
#         if index >= self.__len__():
#             raise IndexError
#         feature = self.features[index]
#         target = self.targets[index]
#         print('Index: {}. Got feature and target {}'.format(index, self.targets[index]))
#         return {'feature' : feature, 'target' : target}
    
#     def __len__(self):
#         return len(self.targets)

In [442]:

feature_batch = feature_sequences[0:512,:,:]
features_tensor = torch.tensor(feature_batch, dtype=torch.float32)
if torch.cuda.is_available():
    features_tensor = features_tensor.cuda()
print(features_tensor.size())

torch.Size([512, 512, 25])


In [444]:
%%time
bm_gen.init_hidden_layer(features_tensor.size(0))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(bm_gen.parameters(), lr=0.01)
output = bm_gen(features_tensor)

RuntimeError: CUDA out of memory. Tried to allocate 1.96 GiB (GPU 0; 8.00 GiB total capacity; 5.19 GiB already allocated; 873.07 MiB free; 5.33 GiB reserved in total by PyTorch)

In [None]:
# Test our model
song_file_name = "(706a)_Redo_(TV_Size)_ReZero_Opening_-_Konomi_Suzuki"
# (7067)_Sorairo_Days_(TV_Size)_Gurren_Lagann_Opening_-_Shoko_Nakagawa
song_info = maps_df.loc[maps_df['key'] == '706a']
# print(song_info)
bpm = (song_info['bpm'].values)[0]
print(bpm)

with ZipFile('../Data_Gather_Filter_Download/Zip_Songs_Data/{}.zip'.format(song_file_name)) as folder:
    with folder.open('ExpertPlus.dat') as dat_file:
        dat_json = json.load(dat_file)
        placements = get_note_placements_by_index(dat_json, most_common_placements)
        # print(placements)
    folder.extract('song.egg')

    # get_map_from_song('song.egg', start_time=0, bpm=190)
    # os.remove('song.egg')

In [None]:
y, sr = librosa.load('song.egg')
length = y.shape[0] / sr # Song length according to librosa in secs (doesn't match given length for some reason)
y_harmonic, y_percussive = librosa.effects.hpss(y)
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, trim=False, bpm=bpm)
# We are analyzing the song in time points of 16th beats
total_num_beats = time_to_beat(length, bpm) # Length in terms of seconds
length_one_beat = beat_to_time(1, bpm)
time_points = np.arange(0, length, length_one_beat / 16) # 16th beats

# print(total_num_beats)
# print(length_one_beat)
# print(length_one_beat / 16)
# print(tempo, len(y), len(beat_frames), "\n", beat_frames)

In [None]:
# Reduce n_mels to avoid empty filters in mel frequency basis and to match the size of chrormagram
melspectrogram = librosa.feature.melspectrogram(y=y_percussive, sr=sr, n_mels=12, fmax=65.4)
chromagram = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
# mfcc = get_mfcc('song.egg', beat_to_time(next_item[0], bpm), beat_to_time(1, bpm), n_mfcc=1)

# print(len(mfcc))
# print(len(melspectrogram))
# print(melspectrogram[4])
# print(len(chromagram))

In [310]:
%%time
# Need to do it this way as some songs start with no noise so we can't naively just assuming that beat 0 is at time 0
quarter_frames = np.array([])
index = 0
while index < len(beat_frames) - 1:
    quarter_frames = np.append(quarter_frames, np.arange(beat_frames[index], beat_frames[index + 1], 
                                                         round((beat_frames[index + 1] - beat_frames[index]) / 4))[1:4])
    index += 1
    # if index < 2:
    #     print(beat_frames[index])
    #     print((beat_frames[index + 1] - beat_frames[index])/4)
    #     print(np.arange(beat_frames[index], beat_frames[index+1], round((beat_frames[index+1]-beat_frames[index])/4))[1:4])

beat_frames_with_quarter = np.concatenate((np.array([0]), beat_frames, quarter_frames), axis=0)
beat_frames_with_quarter.sort() # We appended the quarter notes so we need to sort them into right spots
beat_frames_with_quarter = beat_frames_with_quarter.astype(int)

beat_frames_with_mel = librosa.util.sync(melspectrogram, beat_frames_with_quarter, aggregate=np.median)
beat_frames_with_chroma = librosa.util.sync(chromagram, beat_frames_with_quarter, aggregate=np.median)

# Want to extend it to 1/16 beats as 1/4 beats limits the max NPS by a lot
# Using pandas to easily extend to 1/16 beats
beat_num_with_16th_notes = np.arange(0, len(beat_frames) - 1, 1.0 / 16.0)
beat_num = np.arange(0, len(beat_frames), 1)
beats_num_df = pd.concat([pd.Series(beat_num, name='beat_num', dtype=int), pd.Series(beat_frames, name='beat_frame', dtype=int)], axis=1)
mel_beats_df = pd.concat([pd.Series(beat_frames_with_quarter, name='quater_beat_frame'), pd.DataFrame(beat_frames_with_mel.T)], axis=1)
chroma_beats_df = pd.concat([pd.Series(beat_frames_with_quarter, name='quater_beat_frame'), pd.DataFrame(beat_frames_with_chroma.T)], axis=1)
# Dataframe with the beat number, frame number, mel data, and chroma data for the beat
beats_df = beats_num_df.merge(mel_beats_df, how='outer', left_on='beat_frame', right_on='quater_beat_frame', sort=True)
# Only need the quarter beat frames
beats_df = beats_df.drop(columns=['beat_frame']) 
beats_df = beats_df.merge(chroma_beats_df, how='outer', on='quater_beat_frame', sort=True)
# Removes the NaNs from the beat number column for next merge
beats_df.interpolate(inplace=True) 
# Expand it to be 1/16 beats. Doing it in 1/4 beats saves a lot of computation with sync
beats_num_16th_notes_df = pd.DataFrame(beat_num_with_16th_notes, columns=['beat_num'])
beats_df = beats_df.merge(beats_num_16th_notes_df, how='outer', on='beat_num', sort=True)
# Interpolate the quarter beat frame so we can more accuractly place the placements
beats_df['quater_beat_frame'].interpolate(inplace=True) 
beats_df['quater_beat_frame'] = beats_df['quater_beat_frame'].round()
beats_df = beats_df.fillna(method='pad') # Forward fill
# Add placement column to store what note type is at that time
beats_df.insert(1, 'placement', 0)
# Add column for time since last note since this helps dicate what placements should be done
beats_df.insert(3, 'time_since_last_note', 0.0)
# print(beats_df.iloc[0:40, 0:3])
# Now the computed audio values for each quarter note are spread among 1/16 notes so we can use sequences of 1/16 notes
for timing, placement in placements.items():
    time_in_frames = librosa.core.time_to_frames(beat_to_time(timing, bpm), sr=sr)
    # print(timing, placement)
    # print(time_in_frames)
    try:
        matching_frames = beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement']
        if len(matching_frames) == 1:
            beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'] = placement
        else:
            placement_arr = [placement]
            placement_arr.extend([0] * (len(matching_frames) - 1))
            beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'] = placement_arr
        # print(beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement'])
        # print(len(beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement']))
        # print(type(beats_df.loc[beats_df['quater_beat_frame'] == time_in_frames, 'placement']))
        # print('\n')
    except: # No exact note timing match
        print("No exact match found")
        # Subtract the value and find the one closest to 0
        closest_index = beats_df['quater_beat_frame'].sub(time_in_frames).abs().idxmin()
        beats_df[closest_index]['placement'] = placement

beats_df.loc[beats_df['placement'] == 16, 'placement'] = 0
last_note_time = 0
features = []
for i in range(len(beats_df.index)):
    # Set the time since the last note
    time_in_s = librosa.core.frames_to_time(beats_df.at[i, 'quater_beat_frame'], sr=sr)
    if beats_df.at[i, 'placement'] != 0 and beats_df.at[i, 'placement'] != 16:
        last_note_time = time_in_s
        # print(last_note_time)
    if last_note_time != 0:
        beats_df.at[i, 'time_since_last_note'] = (time_in_s - last_note_time)
    # Extract the data from this row
    features.append(beats_df.iloc[i, 3:].tolist())

features = np.asarray(features)
human_placements = beats_df['placement'].to_numpy()

# output = np.concatenate((beat_frames_with_mel, beat_frames_with_chroma), axis=0)

# print(len(features))
# print(len(beats_df))
# print(len(human_placements))
# print(human_placements[:50])
# print(features[0:2])
# print(features)
# print(beats_df.iloc[0:40, 0:3])
# print(beats_df.dtypes)
# print(librosa.time_to_frames(length))
# print(len(beat_frames_with_quarter))    # Do match
# print(len(output[9]))                   # Do match
# print(len(beat_frames))      
# print(len(beat_frames_with_chroma))
# print(beat_frames_with_quarter[:18])
# print(beat_frames[:10])
# print(output[10][:16])

Wall time: 1.64 s


In [None]:
frame_points = librosa.core.time_to_frames(time_points, sr=sr)
beat_frames_with_mel = librosa.util.sync(melspectrogram, frame_points, aggregate=np.median)
beat_frames_with_chroma = librosa.util.sync(chromagram, frame_points, aggregate=np.median)
output = np.concatenate((beat_frames_with_mel, beat_frames_with_chroma), axis=0)
# print(len(beat_frames_with_mel))
# print(len(beat_frames_with_chroma))
print(len(frame_points))    # DONT MATCH????
print(len(output[9]))       # DONT MATCH????
print(output[10][:16])

In [353]:
feature_sequences, placements_for_seq = split_into_sequences(features, human_placements, 512)

In [465]:
print(len(features))
print(len(features[0]))
print(len(feature_sequences))
print(len(feature_sequences[0]))
print(len(feature_sequences[0][0]))
print(len(placements_for_seq))
print(feature_sequences[0])

4530
25
4019
512
25
4019
[[0.00000000e+00 0.00000000e+00 2.30849841e-23 ... 8.53028059e-01
  8.19255352e-01 8.36956620e-01]
 [0.00000000e+00 0.00000000e+00 2.30849841e-23 ... 8.53028059e-01
  8.19255352e-01 8.36956620e-01]
 [0.00000000e+00 0.00000000e+00 2.30849841e-23 ... 8.53028059e-01
  8.19255352e-01 8.36956620e-01]
 ...
 [4.64399093e-02 0.00000000e+00 2.60810107e-01 ... 9.08461094e-01
  8.72298896e-01 8.52924228e-01]
 [0.00000000e+00 0.00000000e+00 2.60810107e-01 ... 9.08461094e-01
  8.72298896e-01 8.52924228e-01]
 [2.32199546e-02 0.00000000e+00 2.60810107e-01 ... 9.08461094e-01
  8.72298896e-01 8.52924228e-01]]


In [355]:
print(feature_sequences.shape)

(4019, 512, 25)


In [454]:
with open('Saved_Features/test/features.txt', 'w') as f:
    f.write("Test")

FileNotFoundError: [Errno 2] No such file or directory: 'Saved_Features/test/features.txt'

In [455]:
np.save('test.npy', feature_sequences)

In [473]:
test_dict = OrderedDict({'features' : feature_sequences, 'target' : placements_for_seq})


OrderedDict([('features', array([[[ 0.00000000e+00,  0.00000000e+00,  2.30849841e-23, ...,
          8.53028059e-01,  8.19255352e-01,  8.36956620e-01],
        [ 0.00000000e+00,  0.00000000e+00,  2.30849841e-23, ...,
          8.53028059e-01,  8.19255352e-01,  8.36956620e-01],
        [ 0.00000000e+00,  0.00000000e+00,  2.30849841e-23, ...,
          8.53028059e-01,  8.19255352e-01,  8.36956620e-01],
        ...,
        [ 4.64399093e-02,  0.00000000e+00,  2.60810107e-01, ...,
          9.08461094e-01,  8.72298896e-01,  8.52924228e-01],
        [ 0.00000000e+00,  0.00000000e+00,  2.60810107e-01, ...,
          9.08461094e-01,  8.72298896e-01,  8.52924228e-01],
        [ 2.32199546e-02,  0.00000000e+00,  2.60810107e-01, ...,
          9.08461094e-01,  8.72298896e-01,  8.52924228e-01]],

       [[ 0.00000000e+00,  0.00000000e+00,  2.30849841e-23, ...,
          8.53028059e-01,  8.19255352e-01,  8.36956620e-01],
        [ 0.00000000e+00,  0.00000000e+00,  2.30849841e-23, ...,
          8.

In [None]:
seq = [[0] * 25 ]
print(seq)