In [2]:
from typing import List, Any, Tuple, Dict
import pandas as pd
import music21
import glob
import logging


logging.basicConfig(level=logging.INFO)
SEQ_LEN = 101
COMPOSITIONS_PER_CLASS = 75

In [3]:
path_to_files: List[str] = glob.glob("./data/**/*.mid")
midi_files: pd.DataFrame = pd.DataFrame.from_dict({
    "Path": path_to_files,
    "IsClassic": [path.split("/")[2] == "classic" for path in path_to_files]
})
midi_files

Unnamed: 0,Path,IsClassic
0,./data/classic/chpn-p20.mid,True
1,./data/classic/scn16_6.mid,True
2,./data/classic/chpn-p12.mid,True
3,./data/classic/bor_ps2.mid,True
4,./data/classic/scn15_1.mid,True
...,...,...
510,./data/undertale/Undertale - Bring It In Guys.mid,False
511,./data/undertale/Undertale - Trouble Dingle.mid,False
512,./data/undertale/Undertale - Can You Really Ca...,False
513,./data/undertale/Undertale - Uwa So HEATS.mid,False


In [4]:
classic_midi_files: pd.DataFrame = midi_files[midi_files["IsClassic"] == True].head(COMPOSITIONS_PER_CLASS)
not_classic_midi_files: pd.DataFrame = midi_files[midi_files["IsClassic"] == False].head(COMPOSITIONS_PER_CLASS)
midi_files = pd.concat([classic_midi_files, not_classic_midi_files])
midi_files

Unnamed: 0,Path,IsClassic
0,./data/classic/chpn-p20.mid,True
1,./data/classic/scn16_6.mid,True
2,./data/classic/chpn-p12.mid,True
3,./data/classic/bor_ps2.mid,True
4,./data/classic/scn15_1.mid,True
...,...,...
346,./data/anime/Lost My Pieces.mid,False
347,./data/anime/Kamado_Tanjirou_no_Uta_full.mid,False
348,./data/anime/Love_Dramatic_full.mid,False
349,./data/anime/[shinmai_maou_no_testament_burst_...,False


In [5]:
class MidiSeqProcessor:
    def __init__(self, seq_len: int):
        self._seq_len: int = seq_len

    def get_sequences(self, score: Any) -> Tuple[List[str], List[str]]:
        """
        :param score: midi score to parse
        :return: sequences list with chords representation and duration
        """
        chords: List[str] = []
        durations: List[str] = []
        try:
            s2 = music21.instrument.partitionByInstrument(score)
            score = s2.parts[0].recurse()
        except:
            score = score.flat.notes
        for element in score:
            if isinstance(element, music21.chord.Chord):
                chords.append("|".join([str(note) for note in element.normalOrder]))
                durations.append(str(element.duration.quarterLength))
            elif isinstance(element, music21.note.Note):
                chords.append(str(element.pitch))
                durations.append(str(element.duration.quarterLength))
            elif isinstance(element, music21.note.Rest):
                chords.append(str(element.name))
                durations.append(str(element.duration.quarterLength))
        chord_sequences: List[str] = []
        duration_sequences: List[str] = []
        for i in range(len(chords) - self._seq_len):
            chord_sequences.append(" ".join(chords[i:i+self._seq_len]))
            duration_sequences.append(" ".join(durations[i:i+self._seq_len]))
        return chord_sequences, duration_sequences

class MusicDataFrameGenerator:
    def __init__(self, seq_len: int):
        self._seq_len: int = seq_len
        self._seq_processor: MidiSeqProcessor = MidiSeqProcessor(seq_len)

    def __call__(self, midi_files: pd.DataFrame) -> pd.DataFrame:
        result: Dict[str: List] = {"ChordSeq": [], "DurationSeq": [], "IsClassic": []}
        for i, path_to_file, is_classic in zip(range(len(midi_files["Path"])), midi_files["Path"], midi_files["IsClassic"]):
            logging.info(f"Processed {i} files from {len(midi_files['Path'])}")
            score = music21.converter.parse(path_to_file).chordify()
            for chord_seq, duration_seq in zip(*self._seq_processor.get_sequences(score)):
                result["ChordSeq"].append(chord_seq)
                result["DurationSeq"].append(duration_seq)
                result["IsClassic"].append(1 if is_classic else 0)
        return pd.DataFrame.from_dict(result)

data_frame_generator: MusicDataFrameGenerator = MusicDataFrameGenerator(SEQ_LEN)
music_data: pd.DataFrame = data_frame_generator(midi_files)
music_data

INFO:root:Processed 0 files from 150
INFO:root:Processed 1 files from 150
INFO:root:Processed 2 files from 150
INFO:root:Processed 3 files from 150
INFO:root:Processed 4 files from 150
INFO:root:Processed 5 files from 150
INFO:root:Processed 6 files from 150
INFO:root:Processed 7 files from 150
INFO:root:Processed 8 files from 150
INFO:root:Processed 9 files from 150
INFO:root:Processed 10 files from 150
INFO:root:Processed 11 files from 150
INFO:root:Processed 12 files from 150
INFO:root:Processed 13 files from 150
INFO:root:Processed 14 files from 150
INFO:root:Processed 15 files from 150
INFO:root:Processed 16 files from 150
INFO:root:Processed 17 files from 150
INFO:root:Processed 18 files from 150
INFO:root:Processed 19 files from 150
INFO:root:Processed 20 files from 150
INFO:root:Processed 21 files from 150
INFO:root:Processed 22 files from 150
INFO:root:Processed 23 files from 150
INFO:root:Processed 24 files from 150
INFO:root:Processed 25 files from 150
INFO:root:Processed 26

Unnamed: 0,ChordSeq,DurationSeq,IsClassic
0,rest 2 2|5 9|0|3|5 2|3|5|9 9|0|2|3|5 9|0|2|3|5...,5.5 0.25 0.25 1.0 1/3 1/6 0.75 1/12 1/6 0.5 1....,1
1,2 2|5 9|0|3|5 2|3|5|9 9|0|2|3|5 9|0|2|3|5 9|0|...,0.25 0.25 1.0 1/3 1/6 0.75 1/12 1/6 0.5 1.5 1....,1
2,2|5 9|0|3|5 2|3|5|9 9|0|2|3|5 9|0|2|3|5 9|0|2|...,0.25 1.0 1/3 1/6 0.75 1/12 1/6 0.5 1.5 1.0 0.2...,1
3,9|0|3|5 2|3|5|9 9|0|2|3|5 9|0|2|3|5 9|0|2|3|5 ...,1.0 1/3 1/6 0.75 1/12 1/6 0.5 1.5 1.0 0.25 0.2...,1
4,2|3|5|9 9|0|2|3|5 9|0|2|3|5 9|0|2|3|5 2|3|5|9 ...,1/3 1/6 0.75 1/12 1/6 0.5 1.5 1.0 0.25 0.25 0....,1
...,...,...,...
134332,2|4 9|2 9|2 2 2|4 7|9 7|9 2|7 4|7 7|9 7|9 2|7 ...,0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0...,0
134333,9|2 9|2 2 2|4 7|9 7|9 2|7 4|7 7|9 7|9 2|7 4|7 ...,0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0...,0
134334,9|2 2 2|4 7|9 7|9 2|7 4|7 7|9 7|9 2|7 4|7 6|9 ...,0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0...,0
134335,2 2|4 7|9 7|9 2|7 4|7 7|9 7|9 2|7 4|7 6|9 6|9 ...,0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0...,0


In [6]:
music_data.to_csv("./data/music_samples.csv", index=False)

In [7]:
path_to_files: List[str] = glob.glob("./data/classic/beethoven/*.mid")
beethoven_files: pd.DataFrame = pd.DataFrame.from_dict({
    "Path": path_to_files,
    "IsClassic": [path.split("/")[2] == "classic" for path in path_to_files]
})


beethoven_data: pd.DataFrame = data_frame_generator(beethoven_files)
beethoven_data

INFO:root:Processed 0 files from 29
INFO:root:Processed 1 files from 29
INFO:root:Processed 2 files from 29
INFO:root:Processed 3 files from 29
INFO:root:Processed 4 files from 29
INFO:root:Processed 5 files from 29
INFO:root:Processed 6 files from 29
INFO:root:Processed 7 files from 29
INFO:root:Processed 8 files from 29
INFO:root:Processed 9 files from 29
INFO:root:Processed 10 files from 29
INFO:root:Processed 11 files from 29
INFO:root:Processed 12 files from 29
INFO:root:Processed 13 files from 29
INFO:root:Processed 14 files from 29
INFO:root:Processed 15 files from 29
INFO:root:Processed 16 files from 29
INFO:root:Processed 17 files from 29
INFO:root:Processed 18 files from 29
INFO:root:Processed 19 files from 29
INFO:root:Processed 20 files from 29
INFO:root:Processed 21 files from 29
INFO:root:Processed 22 files from 29
INFO:root:Processed 23 files from 29
INFO:root:Processed 24 files from 29
INFO:root:Processed 25 files from 29
INFO:root:Processed 26 files from 29
INFO:root:P

Unnamed: 0,ChordSeq,DurationSeq,IsClassic
0,rest 0 8 5 8 0 5 8 0 5 5|8 0|4|7 0|2|4|7 0|2|4...,4.5 1.25 0.25 4.5 1.25 0.25 1.5 1.25 0.25 1.5 ...,1
1,0 8 5 8 0 5 8 0 5 5|8 0|4|7 0|2|4|7 0|2|4|7 0|...,1.25 0.25 4.5 1.25 0.25 1.5 1.25 0.25 1.5 1.5 ...,1
2,8 5 8 0 5 8 0 5 5|8 0|4|7 0|2|4|7 0|2|4|7 0|4|...,0.25 4.5 1.25 0.25 1.5 1.25 0.25 1.5 1.5 2.5 0...,1
3,5 8 0 5 8 0 5 5|8 0|4|7 0|2|4|7 0|2|4|7 0|4|7 ...,4.5 1.25 0.25 1.5 1.25 0.25 1.5 1.5 2.5 0.25 1...,1
4,8 0 5 8 0 5 5|8 0|4|7 0|2|4|7 0|2|4|7 0|4|7 2|...,1.25 0.25 1.5 1.25 0.25 1.5 1.5 2.5 0.25 1/12 ...,1
...,...,...,...
63777,3|7|10 3|7|10 3|7 7|8|0|3 7|10 6|9 7|10 10|3 3...,0.25 0.25 1/12 1/6 0.25 0.25 0.25 0.25 0.25 0....,1
63778,3|7|10 3|7 7|8|0|3 7|10 6|9 7|10 10|3 3|7 rest...,0.25 1/12 1/6 0.25 0.25 0.25 0.25 0.25 0.25 0....,1
63779,3|7 7|8|0|3 7|10 6|9 7|10 10|3 3|7 rest 3|7|10...,1/12 1/6 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0....,1
63780,7|8|0|3 7|10 6|9 7|10 10|3 3|7 rest 3|7|10 8|0...,1/6 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0....,1


In [8]:
class SampleDatasetGenerator:
    def __call__(self, samples: pd.DataFrame) -> pd.DataFrame:
        res: Dict = {"ChordSeq": [], "DurationSeq": [], "NextChord": [], "NextDuration": []}
        for chord_seq, duration_seq in zip(samples["ChordSeq"], samples["DurationSeq"]):
            chords: List[str] = chord_seq.split(" ")
            durations: List[str] = duration_seq.split(" ")
            res["ChordSeq"].append(" ".join(chords[:-1]))
            res["DurationSeq"].append(" ".join(durations[:-1]))
            res["NextChord"].append(chords[-1])
            res["NextDuration"].append(durations[-1])
        return pd.DataFrame.from_dict(res)

class ChordDurationSampleDatasetGenerator:
    def __call__(self, samples: pd.DataFrame) -> pd.DataFrame:
        res: Dict = {"ChordDurationSeq": [], "NextChordDuration": []}
        for chord_seq, duration_seq in zip(samples["ChordSeq"], samples["DurationSeq"]):
            chords: List[str] = chord_seq.split(" ")
            durations: List[str] = duration_seq.split(" ")
            chord_durations: List[str] = []
            for chord, duration in zip(chords, durations):
                chord_durations.append(f"{chord}#{duration}")
            res["ChordDurationSeq"].append(" ".join(chord_durations))
        for chord, duration in zip(samples["NextChord"], samples["NextDuration"]):
            res["NextChordDuration"].append(f"{chord}#{duration}")
        return pd.DataFrame.from_dict(res)

sample_generator: SampleDatasetGenerator = SampleDatasetGenerator()
samples: pd.DataFrame = sample_generator(beethoven_data)
chd_sample_generator: ChordDurationSampleDatasetGenerator = ChordDurationSampleDatasetGenerator()
samples = chd_sample_generator(samples)
samples

Unnamed: 0,ChordDurationSeq,NextChordDuration
0,rest#4.5 0#1.25 8#0.25 5#4.5 8#1.25 0#0.25 5#1...,rest#0.25
1,0#1.25 8#0.25 5#4.5 8#1.25 0#0.25 5#1.5 8#1.25...,1#0.25
2,8#0.25 5#4.5 8#1.25 0#0.25 5#1.5 8#1.25 0#0.25...,rest#0.25
3,5#4.5 8#1.25 0#0.25 5#1.5 8#1.25 0#0.25 5#1.5 ...,1#0.25
4,8#1.25 0#0.25 5#1.5 8#1.25 0#0.25 5#1.5 5|8#1....,rest#0.25
...,...,...
63777,3|7|10#0.25 3|7|10#0.25 3|7#1/12 7|8|0|3#1/6 7...,10#0.25
63778,3|7|10#0.25 3|7#1/12 7|8|0|3#1/6 7|10#0.25 6|9...,3|7|10#1.0
63779,3|7#1/12 7|8|0|3#1/6 7|10#0.25 6|9#0.25 7|10#0...,rest#0.5
63780,7|8|0|3#1/6 7|10#0.25 6|9#0.25 7|10#0.25 10|3#...,2|5|8|10#1.0


In [9]:
samples.to_csv("./data/beethoven_samples.csv", index=False)