In [1]:
!pip3 install dynamax

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from sklearn.model_selection import train_test_split
from dynamax.hidden_markov_model import CategoricalHMM
from sklearn.neighbors import NearestCentroid
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import jax.numpy as jnp
import jax.random as jr
from jax import vmap
from matplotlib import pyplot as plt
import os
import pickle



In [3]:
def csv_to_numpy(composer,song):
  """Read one song CSV and return its note/duration columns as a 2-column array.

  The file is looked up at /content/Composers/<composer>-output-csvs/<song>.

  PARAMETERS:
    composer (str): composer name; the folder read is '<composer>-output-csvs'
    song (str): file name of the song's csv inside that folder

  RETURNS:
    numpy array whose columns are 'note_num' and 'normed_duration', in that order
  """
  composer_folder = composer + '-output-csvs'
  csv_path = '/'.join(("/content/Composers", composer_folder, song))
  song_frame = pd.read_csv(csv_path)
  wanted_columns = ['note_num','normed_duration']
  return song_frame.loc[:, wanted_columns].to_numpy()

In [13]:
def load_dictionaries(files = ['train.pkl','test.pkl']):
  """Unpickle the train and test dictionaries from disk.

  PARAMETERS:
    files (list): two paths; files[0] is the pickled train dictionary and
        files[1] is the pickled test dictionary

  RETURNS:
    (train_dict, test_dict) exactly as they were pickled

  Note: pickle.load can execute arbitrary code, so only point this at files you
  created yourself.
  """
  def _unpickle(path):
    # read a single pickled object from `path`
    with open(path,'rb') as handle:
      return pickle.load(handle)

  train_path = files[0]
  test_path = files[1]
  print(train_path)  # echo which train file is being read

  loaded_train = _unpickle(train_path)
  loaded_test = _unpickle(test_path)
  return loaded_train, loaded_test

In [14]:
# Load the train/test dictionaries from the default 'train.pkl' / 'test.pkl' files.
# These module-level names are also the default arguments of ComposerAnalysis below.
train_dict, test_dict = load_dictionaries()

train.pkl


In [16]:
def get_train_test(main_dir = "/content/Composers", percent_split = 0.8, out_dir = "/content"):
  """Randomly split every composer's songs into train/test lists and pickle both mappings.

  Each non-hidden entry of `main_dir` is treated as one composer folder; the composer
  name is the part of the folder name before the first '-'. The song file names in the
  folder are shuffled with np.random.shuffle (seed numpy beforehand for a reproducible
  split) and the first int(n * percent_split) go to the train list, the rest to test.

  PARAMETERS:
    main_dir (str): directory containing one folder of song csvs per composer
    percent_split (float): fraction of each composer's songs assigned to training
    out_dir (str): directory in which 'train.pkl' and 'test.pkl' are written

  RETURNS:
    NONE, but writes <out_dir>/train.pkl and <out_dir>/test.pkl

  RAISES:
    ValueError: if a composer would end up with no training songs
  """
  train_dict = {}
  test_dict = {}

  # NOTE: deleted debussy and balakir for lack of data
  for file_name in os.listdir(main_dir):
    if file_name.startswith('.'):  # skip hidden entries
      continue
    comp_list = os.listdir(os.path.join(main_dir, file_name))
    np.random.shuffle(comp_list)  # shuffle list in place
    # manual train-test split
    eighty = int(len(comp_list)*percent_split)
    if eighty < 1:
      raise ValueError("Need to rewrite percentsplit!")

    composer = file_name.split('-')[0]
    train_dict[composer] = comp_list[:eighty]
    test_dict[composer] = comp_list[eighty:]

  # The output paths are built outside the loop so they exist even when no composer
  # folder was found (previously that case raised NameError).
  for pickle_name, split_dict in (("train.pkl", train_dict), ("test.pkl", test_dict)):
    with open(os.path.join(out_dir, pickle_name), 'wb') as f:
      pickle.dump(split_dict, f)


In [17]:
def print_params(params):
    """Print an HMM parameter object's initial, transition and emission probabilities.

    Floats are shown with three decimals; the print options are set globally
    through jnp.set_printoptions, so they stay in effect after the call.

    PARAMETERS:
      params: object exposing initial.probs, transitions.transition_matrix
          and emissions.probs
    """
    def _three_decimals(value):
        return format(value, "0.3f")

    jnp.set_printoptions(formatter={'float': _three_decimals})

    # Each value is looked up only after its heading has been printed.
    sections = (
        ("initial probs:", lambda p: p.initial.probs),
        ("transition matrix:", lambda p: p.transitions.transition_matrix),
        ("emission probs:", lambda p: p.emissions.probs),
    )
    for heading, getter in sections:
        print(heading)
        print(getter(params))

Now let's work with the test set.

In [22]:
def _fit_song_hmms(comps_list, song_dict, comp_labels, hmm_dims, total_songs,
                   progress_prefix, verbose):
  """Fit one CategoricalHMM per song and return (params_list, labels) in matching order.

  Songs whose data contains NaN or Inf are reported and skipped; no label is recorded
  for a skipped song, so the two returned lists always have the same length.

  PARAMETERS:
    comps_list (list): composer names to process, in order
    song_dict (dictionary): composer name -> list of that composer's song csv names
    comp_labels (dictionary): composer name -> integer class label
    hmm_dims (tuple): (num_states, num_emmisions, num_classes) passed to CategoricalHMM
    total_songs (int): number of songs expected, used only in the progress message
    progress_prefix (str): text printed in front of the "k/total" progress counter
    verbose (bool): whether to print a progress line per song
  """
  num_states, num_emmisions, num_classes = hmm_dims
  fitted_params = []
  labels = []
  song_progress = 1

  for composer in comps_list:
    for song in song_dict[composer]:
      if verbose:
        print(f"{progress_prefix} {song_progress}/{total_songs}")
      song_progress += 1
      arr = csv_to_numpy(composer,song)
      has_nan = np.isnan(arr).any()
      has_inf = np.isinf(arr).any()
      if has_nan or has_inf:
          print("NaN or Inf")
          print(has_nan)
          print(has_inf)
          print(f" at composer = {composer}, song = {song}")
          continue
      # NOTE(review): arr holds the raw (note_num, normed_duration) values; confirm they
      # are valid category indices for CategoricalHMM.
      hmm = CategoricalHMM(num_states,num_emmisions,num_classes)        # create object
      params, props = hmm.initialize(method="prior")
      params, _ = hmm.fit_em(params,props,arr,num_iters=10)             # fitting this song
      fitted_params.append(params)
      labels.append(comp_labels[composer])                              # only for songs actually fitted

  return fitted_params, labels


def ComposerAnalysis(comps_list, train_dict = train_dict, test_dict = test_dict,verbose = True):
  """Analyze a list of composers using the data saved under /content/Composers.

  Each composer's song data should be saved as a csv of the format outputted by the
  midi-csv submodule script titled midi_to_csv.py. One HMM is fitted per song; its
  raveled transition matrix is the feature vector given to a NearestCentroid classifier
  and to KMeans.

  PARAMETERS:

    comps_list (list): list of the names of composers to run the analysis on as they appear in the
        train dictionary and test dictionary
    train_dict (dictionary): dictionary of composer names to a list of the names of the csvs that contain
        the composer's training songs. The default is the module-level train_dict as it
        was when this function was defined.
    test_dict (dictionary): dictionary of composer names to a list of the names of the csvs that contain that
        composer's test songs. The default is the module-level test_dict as it was when
        this function was defined.
    verbose (bool): whether or not to alert user to progress of program

  RETURNS:
    NONE, but prints the scoring of the test set for Nearest Neighbors and K-Means

  RAISES:
    ValueError: if every training song, or every test song, was skipped
    """
  # create labels to be used to identify composers
  comp_labels = {composer:i for i, composer in enumerate(comps_list)}

  # 1) go through all the songs we are working with to get the number of unique notes and durations
  # these will be used in the construction of the HMM model
  notes = set()
  durations = set()
  song_counts = {"test": 0, "train": 0}

  for composer in comps_list:
    for split_name, song_dict in (("test", test_dict), ("train", train_dict)):
      for song in song_dict[composer]:
        song_counts[split_name] += 1
        arr = csv_to_numpy(composer,song)
        notes.update(arr[:,0])
        durations.update(arr[:,1])

  test_songs = song_counts["test"]
  train_songs = song_counts["train"]

  if verbose:
    print(f"The total number of notes found was:\t\t{len(notes)}\nThe "
    f"total number of durations found was:\t{len(durations)}\n\n")

  # 2) Train an HMM model for each song to get the transition matrix for each song

  num_unique_notes = len(notes)                             # for calculating classes
  num_unique_durations = len(durations)                     # for calculating classes
  num_states = 3                                            # dimension of hidden state must be more than num_emmisions. eventually we will grid search this
  num_emmisions = 2                                         # dimension of observations, two because we have note value and durations
  num_classes = num_unique_notes*num_unique_durations       # total number of values the observation can take on
  hmm_dims = (num_states, num_emmisions, num_classes)

  # now training
  if verbose:
    print("NOW STARTING TRAINING\n")
    print(f"\tTotal training songs: {train_songs}\n")

  param_list_train, train_labels = _fit_song_hmms(
      comps_list, train_dict, comp_labels, hmm_dims, train_songs, "processing song", verbose)
  if not param_list_train:
    raise ValueError("No usable training songs: none were listed or all contained NaN/Inf")

  # 3) Processing training data to be used in training classification models
  if verbose:
    print("PROCESSING TRAINING DATA\n")

  # ravel each transition matrix so that it can act as a point in the clustering algorithm
  X_for_training = [np.ravel(params.transitions.transition_matrix) for params in param_list_train]

  # 4) Training classification models with processed training data
  if verbose:
    print("TRAINING CLASSIFICATION MODELS\n")

  NC = NearestCentroid()                                                # nearest centroid classifier
  NC.fit(np.array(X_for_training), train_labels)
  KM = KMeans(len(comps_list))                                          # one cluster per composer (was hard-coded to 3)
  KM.fit(np.array(X_for_training))                                      # KMeans is unsupervised: labels are not used


  # 5) Get the test data by getting transition matrices by running the HMM model on the test points

  if verbose:
    print(f"STARTING TEST DATA GENERATION\n")
    print(f"\tTotal test songs: {test_songs}\n")

  param_list_test, test_labels = _fit_song_hmms(
      comps_list, test_dict, comp_labels, hmm_dims, test_songs, "processing test song", verbose)
  if not param_list_test:
    raise ValueError("No usable test songs: none were listed or all contained NaN/Inf")

  if verbose:
    print("PROCESSING TEST DATA\n")

  # ravelled t-matrices of test data for scoring
  X_for_scoring = [np.ravel(params.transitions.transition_matrix) for params in param_list_test]

  # 6) Get scores based on the test data
  if verbose:
    print("SCORING PROCESSED TEST DATA\n")

  NN_score = NC.score(np.array(X_for_scoring), test_labels)             # mean accuracy on the test songs
  # KMeans.score is the negative K-means objective (inertia) on X, not an accuracy;
  # the labels argument is ignored by scikit-learn.
  KM_score = KM.score(np.array(X_for_scoring), test_labels)

  print(f"RESULTS:\n\nNearest Neighbors Score:\t{NN_score}\nK-Means Score:\t\t{KM_score}")


In [23]:
#comps_list = ['haydn','mozart','schubert']
#ComposerAnalysis(comps_list)

In [24]:
# Composers to analyse; each name is used as a key into train_dict / test_dict and as
# the prefix of its '<name>-output-csvs' data folder.
new_list = ['chopin','borodin','beeth','liszt','mendelssohn','schumann']

In [1]:
# Run the full pipeline on the composers in new_list. ComposerAnalysis and new_list
# come from the cells above and must already be defined in the current kernel session.
ComposerAnalysis(new_list)

NameError: ignored