In [5]:
import pandas as pd
import numpy as np
import pickle
import markovify
import os

# The HMM_modeling module is imported from ../src/. Temporarily switch the
# working directory there so the import below can find it (presumably ../src/
# is not otherwise on the import path -- confirm).
_starting_dir = os.getcwd()
os.chdir("../src/")
try:
    from HMM_modeling import HMM, make_sequence, train_HMM
finally:
    # Always return to the directory we started in, even if the import fails,
    # so the relative "../data/" and "../models/" paths used later stay valid
    # and re-running this cell does not chdir from the wrong place.
    os.chdir(_starting_dir)

# Hidden Markov Model Development

This notebook can be used to train the Hidden Markov Models (HMM) for BeatMapSynth. If you do not wish to train them yourself, you can download them [here](https://drive.google.com/open?id=1p7j0sENy0DzcMHd3iQ_LQw14j6OcKLsY), unzip the folder, and place them in the `models/` directory.

There are two sets of models, one for each of the two datasets described in the Data Acquisition and Processing notebook. When I developed these, I trained on the smaller dataset first, and then downloaded the larger, second dataset (which overlaps with the first).

__If you have downloaded the data for the first dataset following the instructions from the Data Acquisition and Processing notebook and came here to follow the instructions to train the first model, run the following code.__

In [None]:
#Run this code *only* if you downloaded and processed the first dataset using the code in the notebook, *NOT* if
#you downloaded the data with the download link!
#Make sure the output directory exists first, so saving cannot fail on a missing folder after training has finished
os.makedirs("../models", exist_ok=True)
difficulties = ['easy', 'normal', 'hard', 'expert', 'expertPlus']
for difficulty in difficulties:
    #HMM() comes from HMM_modeling; presumably it builds the corpus and trains the model for this difficulty - confirm there
    MC = HMM(difficulty)
    #Save one pickled model per difficulty level
    with open(f"../models/HMM_{difficulty}.pkl", 'wb') as f:
        pickle.dump(MC, f)

__If you downloaded the processed data with the link provided, follow these instructions instead:__

To train the first version yourself, we first have to subset the downloaded dataset and then generate the training corpus from that subset. The out-of-the-box functions I made won't work in this case.

In [7]:
#Open the metadata file with all of the download keys
#NOTE: pickle.load can execute arbitrary code, so only open a metadata.pkl obtained from a source you trust
with open('../data/metadata.pkl', 'rb') as f:
    metadata_total = pickle.load(f)
#Subset the metadata to only maps with a rating of at least 90%
first_set = [x for x in metadata_total if x['stats']['rating'] >= .9]
#Extract the download keys from the metadata
first_set_keys = [x['key'] for x in first_set]
#A set gives constant-time membership tests for the file filter below (assumes the keys are hashable, e.g. strings)
first_set_key_lookup = set(first_set_keys)
#Make a list of the processed files whose key (the part of the filename before the first '_') is in the >= 90% subset
filelist = os.listdir('../data/processed_data/')
first_set_filenames = [f for f in filelist if f.split('_')[0] in first_set_key_lookup]

In [None]:
#Now we can train the Hidden Markov Model for each difficulty level and save:
#Make sure the output directory exists before training, so the save step cannot fail on a missing folder
os.makedirs("../models", exist_ok=True)
difficulties = ['easy', 'normal', 'hard', 'expert', 'expertPlus']
for difficulty in difficulties:
    #Build the corpus: one sequence per processed file of this difficulty
    corpus = []
    for filename in first_set_filenames:
        if filename.endswith(f"{difficulty}.pkl"):
            #NOTE: pickle.load can execute arbitrary code; only use processed files from a trusted source
            with open(f"../data/processed_data/{filename}", 'rb') as d:
                df = pickle.load(d)
            corpus.append(make_sequence(df))
    #The 5 is passed straight through to train_HMM (presumably the order of the Markov chain - confirm in HMM_modeling)
    MC = train_HMM(corpus, 5)
    #Use a distinct name for the output handle so it does not reuse the filename loop variable
    with open(f"../models/HMM_{difficulty}.pkl", 'wb') as model_file:
        pickle.dump(MC, model_file)

Ok, whew, we made it through dealing with the "version 1" model. Now for the easy part! To train the second version of the HMM on the full dataset of maps with a >70% rating, just run the following code:

In [None]:
#Make sure the output directory exists first, so saving cannot fail on a missing folder after training has finished
os.makedirs("../models", exist_ok=True)
difficulties = ['easy', 'normal', 'hard', 'expert', 'expertPlus']
for difficulty in difficulties:
    #HMM() comes from HMM_modeling; presumably it builds the corpus and trains the model for this difficulty - confirm there
    MC = HMM(difficulty)
    #The _v2 suffix keeps these files separate from the first-version models
    with open(f"../models/HMM_{difficulty}_v2.pkl", 'wb') as f:
        pickle.dump(MC, f)