Key,Velocity,Note Density,Note Length

In [1]:
import os
import pretty_midi
import pandas as pd
from tqdm.notebook import tqdm

## DataLoad

In [2]:
train = pd.read_csv("../dataset/split/train.csv", index_col=0)
val= pd.read_csv("../dataset/split/val.csv", index_col=0)
test= pd.read_csv("../dataset/split/test.csv", index_col=0)
train_val = pd.concat([train, val], axis=0)

In [3]:
def get_midi_df(fn):
    midi_data = pretty_midi.PrettyMIDI(fn)
    midi_data.remove_invalid_notes()
    midi_list = []
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            start = note.start
            end = note.end
            pitch = note.pitch
            velocity = note.velocity
            midi_list.append([start, end, pitch, velocity])
            
    midi_list = sorted(midi_list, key=lambda x: (x[0], x[2]))
    midi_df = pd.DataFrame(midi_list, columns=['start', 'end', 'pitch', 'velocity'])
    midi_df['note_length'] = midi_df['end'] - midi_df['start']
    return midi_df

In [4]:
def get_song_level_feature(fn):
    results = {}
    midi_df = get_midi_df(fn)
    results['note_length_mean'] = midi_df['note_length'].mean()
    results['velocity_mean'] = midi_df['velocity'].mean()
    return results

In [5]:
def _feature_extractor(train_val):
    feature_list = []
    for fname in train_val.index:
        midi_features = get_song_level_feature(os.path.join(midi_path, fname + ".mid"))
        matlab_features = pd.read_csv(os.path.join(matlab_path, fname + ".csv"))
        midi_features['keyname'] = matlab_features['keyname'].item()
        midi_features['beatNoteDensity'] = matlab_features['beatNoteDensity'].item()
        feature_list.append(midi_features)
    return feature_list

In [6]:
midi_path = "../../dataset/PEmoDataset/midis/"
matlab_path = "../dataset/matlab_feature/"
train_val_list = _feature_extractor(train_val)
test_list = _feature_extractor(test)

# ML Classifier

In [7]:
import numpy as np
import matplotlib.pyplot as plt  
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix

In [8]:
def _get_labels(label_quad, cls_type):
    if cls_type == "AV":
        results = np.array(label_quad)
    elif cls_type == "A":
        labels = []
        for label in label_quad:
            if label in ['Q1','Q2']:
                labels.append('HA')
            elif label in ['Q3','Q4']:
                labels.append('LA')
        results = np.array(labels)
    elif cls_type == "V":
        labels = []
        for label in label_quad:
            if label in ['Q1','Q4']:
                labels.append('HV')
            elif label in ['Q2','Q3']:
                labels.append('LV') 
        results = np.array(labels)
    return results

def _get_train_test(train_val_list, test_list, cls_type = "AV"):
    m_x_train = pd.DataFrame(train_val_list)
    m_x_train = pd.concat([m_x_train, pd.get_dummies(m_x_train.keyname)],axis=1).drop(columns=['keyname'])
    m_x_test = pd.DataFrame(test_list)
    m_x_test = pd.concat([m_x_test, pd.get_dummies(m_x_test.keyname)],axis=1).drop(columns=['keyname'])
    
    m_y_train = list(train_val['label'])
    m_y_test = list(test['label'])
    m_y_train = _get_labels(m_y_train, cls_type)  
    m_y_test = _get_labels(m_y_test, cls_type)  
    return m_x_train, m_y_train, m_x_test, m_y_test

In [9]:
m_x_train, m_y_train, m_x_test, m_y_test = _get_train_test(train_val_list, test_list, cls_type = "AV")
clf = LogisticRegression(random_state=42).fit(m_x_train, m_y_train)
clf.score(m_x_test, m_y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.660377358490566

In [10]:
m_x_train, m_y_train, m_x_test, m_y_test = _get_train_test(train_val_list, test_list, cls_type = "A")
clf = LogisticRegression(random_state=42).fit(m_x_train, m_y_train)
clf.score(m_x_test, m_y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9245283018867925

In [11]:
m_x_train, m_y_train, m_x_test, m_y_test = _get_train_test(train_val_list, test_list, cls_type = "V")
clf = LogisticRegression(random_state=42).fit(m_x_train, m_y_train)
clf.score(m_x_test, m_y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6509433962264151

In [13]:
m_x_train.columns

Index(['note_length_mean', 'velocity_mean', 'beatNoteDensity', 'A', 'A#', 'B',
       'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'a', 'a#', 'b', 'c',
       'c#', 'd', 'd#', 'e', 'f', 'f#', 'g', 'g#'],
      dtype='object')