In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install Packages

In [None]:
!pip install --quiet music21

In [None]:
!apt-get update && apt-get install -qqy abcmidi lilypond timidity musescore3

# Import Libraries

In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from collections import Counter

import tqdm
import emoji
import music21

from music21 import converter, instrument, note, chord, stream

import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, regularizers, metrics, initializers, constraints
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

from IPython.display import Markdown, Image, Audio

def bold(string):
    """Display ``string`` in the notebook output as bold Markdown."""
    emphasized = "".join(("**", string, "**"))
    display(Markdown(emphasized))

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
root_dir = '/kaggle/input/classical-music-midi/'

In [None]:
# Collect every ".mid" file found one level below root_dir, labelled with the
# name of the folder that contains it.
midi_files = []
midi_labels = []
# sorted(): os.listdir returns entries in arbitrary, filesystem-dependent order,
# which would otherwise make the order of the corpus differ between runs.
for folder in tqdm.tqdm(sorted(os.listdir(root_dir))):
    folder_path = os.path.join(root_dir, folder)
    if not os.path.isdir(folder_path):
        # A stray file next to the label folders would make os.listdir raise
        # NotADirectoryError, so skip anything that is not a directory.
        continue
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".mid"):
            midi_files.append(os.path.join(folder_path, file))
            midi_labels.append(folder)

In [None]:
# One row per collected MIDI file: its full path and the name of the folder
# it was found in (used as the class label).
df = pd.DataFrame({
    "midi_file": midi_files,
    "label": midi_labels
})

# Preview the first rows.
df.head()

# EDA

In [None]:
# Share of MIDI files per label, as a pie chart with percentage annotations.
df["label"].value_counts().plot.pie(autopct="%.1f%%")

In [None]:
# Bar chart of the number of MIDI files per label, largest class first.
label_counter = Counter(df["label"])
tags = label_counter.keys()
tags_len = label_counter.values()
tag_df = (
    pd.DataFrame(zip(tags, tags_len), columns=["Class", "Count"])
    .sort_values(by='Count', ascending=False)
)
ax = tag_df.plot(x="Class", y="Count", kind="bar", legend=False, grid=False, figsize=(12, 5), cmap='viridis')
ax.set_title("Class / Count", fontsize=18)
ax.set_xlabel("Class", fontsize=15)
ax.set_ylabel("Count", fontsize=15)
plt.show()

In [None]:
def visualize_midi(file_path, max_notes=100):
    """Render the opening of a MIDI file as a score.

    Every note is encoded as its pitch name and every chord as its normal-order
    pitch classes joined by "."; the first ``max_notes`` symbols are then
    rebuilt as music21 objects, one per offset unit, and shown.

    Args:
        file_path: Path of the MIDI file to parse.
        max_notes: Maximum number of symbols to render (default 100).
    """
    midi_data = converter.parse(file_path)

    # partitionByInstrument may return None when the score carries no
    # instrument information (older music21 releases do this); in that case
    # recurse through the parsed score itself instead of failing on `.parts`.
    partitioned = instrument.partitionByInstrument(midi_data)
    parts = partitioned.parts if partitioned is not None else [midi_data]

    notes = []
    for part in parts:
        for element in part.recurse():
            if isinstance(element, note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                notes.append(".".join(str(n) for n in element.normalOrder))

    melody = []
    for offset, item in enumerate(notes[:max_notes]):
        # A symbol made only of digits/dots is a chord (a single-pitch-class
        # chord has no dot, hence the isdigit check); anything else is a pitch name.
        if "." in item or item.isdigit():
            chord_notes = [note.Note(int(n)) for n in item.split(".")]
            snippet = chord.Chord(chord_notes)
        else:
            snippet = note.Note(item)
        snippet.offset = offset
        melody.append(snippet)

    melody_stream = stream.Stream(melody)
    melody_stream.show()

In [None]:
# Draw one random file, print its label and file name, then render its score.
sample = df.sample()
sample_label = sample["label"].iloc[0]
sample_path = sample["midi_file"].iloc[0]
sample_title = sample_path.split('/')[-1].replace('.mid', '')
print(f"Notes on {sample_label} - {sample_title}")
visualize_midi(sample_path)

# Preprocessing

In [None]:
# Paths of all files whose label is "chopin".
chopin_midi_files = df.loc[df["label"] == "chopin", "midi_file"].tolist()

In [None]:
# Flatten every Chopin file into one long list of symbols: notes become their
# pitch name, chords become their normal-order pitch classes joined by ".".
chopin_notes = []

for midi_file in tqdm.tqdm(chopin_midi_files):
    midi_data = converter.parse(midi_file)
    # partitionByInstrument may return None when the score carries no
    # instrument information (older music21 releases do this); fall back to
    # the parsed score so one such file does not abort the whole loop.
    partitioned = instrument.partitionByInstrument(midi_data)
    parts = partitioned.parts if partitioned is not None else [midi_data]
    for part in parts:
        for element in part.recurse():
            if isinstance(element, note.Note):
                chopin_notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                chopin_notes.append(".".join(str(n) for n in element.normalOrder))

In [None]:
# Bar chart of the 25 most frequent symbols in the Chopin corpus.
note_counter = Counter(chopin_notes)
tags = note_counter.keys()
tags_len = note_counter.values()
tag_df = (
    pd.DataFrame(zip(tags, tags_len), columns=["Class", "Count"])
    .sort_values(by="Count", ascending=False)
    .head(25)
)
ax = tag_df.plot(x="Class", y="Count", kind="bar", legend=False, grid=False, figsize=(12, 5), cmap='viridis')
ax.set_title("Most Frequent Chords", fontsize=18)
ax.set_xlabel("Class", fontsize=15)
ax.set_ylabel("Count", fontsize=15)
plt.show()

In [None]:
# Bar chart of the 25 least frequent symbols in the Chopin corpus.
tag_df = (
    pd.DataFrame(zip(tags, tags_len), columns=["Class", "Count"])
    .sort_values(by="Count", ascending=True)
    .head(25)
)
ax = tag_df.plot(x="Class", y="Count", kind="bar", legend=False, grid=False, figsize=(12, 5), cmap='viridis')
ax.set_title("Least Frequent Chords", fontsize=18)
ax.set_xlabel("Class", fontsize=15)
ax.set_ylabel("Count", fontsize=15)
plt.show()

In [None]:
chord_counter = Counter(chopin_notes)

In [None]:
print('Before Cleaning:', len(chopin_notes))

In [None]:
# Symbols that occur fewer than 100 times in the corpus.
# Stored as a set: the corpus filter tests `note not in rare` once per note,
# which is a linear scan on a list but a constant-time lookup on a set.
rare = {key for key, value in chord_counter.items() if value < 100}

In [None]:
# Keep only the symbols that are not flagged as rare, preserving their order.
corpus = [symbol for symbol in chopin_notes if symbol not in rare]

In [None]:
print('After Cleaning:', len(corpus))

In [None]:
# Vocabulary of the cleaned corpus, plus lookup tables in both directions:
# symbol -> integer index and integer index -> symbol.
symbols = sorted(set(corpus))
num_symbols = len(symbols)
mapping = {symbol: index for index, symbol in enumerate(symbols)}
reverse_mapping = dict(enumerate(symbols))

In [None]:
print(f"Total number of characters: {len(corpus)}")
print(f"Number of unique characters: {num_symbols}")

In [None]:
# Slide a fixed-size window over the corpus: each window of `sequence_length`
# encoded symbols is a feature row, and the symbol right after it is the target.
sequence_length = 40
encoded_corpus = [mapping[char] for char in corpus]
window_starts = range(len(encoded_corpus) - sequence_length)
features = [encoded_corpus[start:start + sequence_length] for start in window_starts]
targets = [encoded_corpus[start + sequence_length] for start in window_starts]

In [None]:
features = np.array(features)
targets = np.array(targets)

In [None]:
# Reshape to (samples, timesteps, 1) and scale the symbol indices by the
# vocabulary size.
X = features.reshape((features.shape[0], features.shape[1], 1)) / float(num_symbols)
# Pin the one-hot width to the vocabulary size. Without num_classes the width
# is inferred as max(targets) + 1, which is narrower than the vocabulary if the
# highest-indexed symbol never occurs as a target.
y = to_categorical(targets, num_classes=num_symbols)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Report the shape of each train/test array.
for split_name, split_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_array.shape)

# Model

In [None]:
# Take the input and output sizes from the training arrays instead of
# hard-coding them: a fixed (40, 1) input and 127-way softmax only match the
# data when the window length is 40 and the one-hot targets have exactly 127
# columns; any other corpus makes `fit` fail with a shape mismatch.
n_timesteps, n_features = X_train.shape[1:]
n_classes = y_train.shape[1]

# Three stacked LSTM layers followed by a dense head that outputs a
# probability for every symbol class.
model = models.Sequential([
    layers.LSTM(512, input_shape=(n_timesteps, n_features), return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(512, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(256),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dense(n_classes, activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adamax(learning_rate=0.01))

In [None]:
model.summary()

In [None]:
plot_model(model, show_shapes=True, show_layer_names=True)

# Train

In [None]:
history = model.fit(X_train, y_train, batch_size=256, epochs=100)

# Results

In [None]:
history_df = pd.DataFrame(history.history)
history_df.head()

In [None]:
# Training loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history["loss"])
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend(["train"])
ax.set_title("Loss Curve")
plt.show()

# Generate

In [None]:
def music_generator(music_len):
    """Generate ``music_len`` symbols with the trained model and render them.

    A randomly chosen window from ``X_test`` seeds the model. At each step the
    most probable next symbol is appended to the output and its scaled index is
    pushed onto the end of the window while the oldest entry is dropped.

    Args:
        music_len: Number of symbols (notes/chords) to generate.

    Returns:
        A tuple ``(melody, melody_stream)``: the list of music21 note/chord
        objects and the ``stream.Stream`` built from them.
    """
    # randint's upper bound is exclusive, so passing len(X_test) lets every
    # window be chosen (the previous `len(X_test) - 1` bound could never pick
    # the last one and raised when X_test held a single window).
    seed = X_test[np.random.randint(len(X_test))].reshape(1, sequence_length, 1)

    music = []
    for _ in range(music_len):
        pred = model.predict(seed, verbose=0)[0]
        # Greedy decoding: take the most probable class. Re-normalising
        # exp(log(pred) / 1.0) first leaves the argmax unchanged and only
        # triggers log(0) warnings, so the raw probabilities are used directly.
        index = int(np.argmax(pred))
        music.append(reverse_mapping[index])
        # Slide the window: drop the oldest step, append the new symbol scaled
        # the same way as the training features.
        seed = np.append(seed[0][1:], index / float(len(symbols))).reshape(1, sequence_length, 1)

    melody = []
    for offset, item in enumerate(music):
        # A symbol made only of digits/dots is a chord (a single-pitch-class
        # chord has no dot, hence the isdigit check); anything else is a pitch name.
        if "." in item or item.isdigit():
            chord_notes = [note.Note(int(n)) for n in item.split(".")]
            element = chord.Chord(chord_notes)
        else:
            element = note.Note(item)
        # Set the offset as an attribute, as visualize_midi does, rather than
        # relying on the constructors accepting an `offset` keyword.
        element.offset = offset
        melody.append(element)

    melody_stream = stream.Stream(melody)
    melody_stream.show()

    return melody, melody_stream

In [None]:
melody, melody_stream = music_generator(250)

In [None]:
melody_stream.write('midi', 'generated.mid')

In [None]:
!timidity generated.mid -Ow generated.wav

In [None]:
Audio('generated.wav')