In [2]:
!pip install numpy pandas tensorflow music21 pydub librosa

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
import numpy as np
import pandas as pd
import os
from music21 import converter, instrument, note, chord, stream
from pydub import AudioSegment
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Activation
from tensorflow.keras.utils import to_categorical

In [5]:
def convert_mp3_to_wav(mp3_path, wav_path):
    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(wav_path, format="wav")

mp3_path = '/content/Kcee - Ojapiano (Official Video).mp3'
wav_path = '/content/temp_audio.wav'

convert_mp3_to_wav(mp3_path, wav_path)

In [6]:
def extract_features(wav_path):
    y, sr = librosa.load(wav_path)
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    features = np.concatenate((chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr, mfcc))
    return features

features = extract_features(wav_path)

In [7]:
sequence_length = 100
n_vocab = features.shape[0]

network_input = []
network_output = []

for i in range(len(features[0]) - sequence_length):
    seq_in = features[:, i:i + sequence_length]
    seq_out = features[:, i + sequence_length]
    network_input.append(seq_in)
    network_output.append(seq_out)

n_patterns = len(network_input)

network_input = np.reshape(network_input, (n_patterns, sequence_length, n_vocab))
network_input = network_input / np.max(network_input)
network_output = np.reshape(network_output, (n_patterns, n_vocab))
network_output = network_output / np.max(network_output)

In [8]:
model = Sequential()
model.add(LSTM(512, input_shape=(network_input.shape[1], network_input.shape[2]), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
model.compile(loss='mean_squared_error', optimizer='adam')

model.summary()

  super().__init__(**kwargs)


In [9]:
# Train the model
model.fit(network_input, network_output, epochs=10, batch_size=64)

# Save the model
model.save('music_generation_model.h5')

Epoch 1/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m909s[0m 8s/step - loss: 0.0056
Epoch 2/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m893s[0m 8s/step - loss: 0.0021
Epoch 3/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m922s[0m 8s/step - loss: 0.0037
Epoch 4/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m894s[0m 8s/step - loss: 0.0021
Epoch 5/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m908s[0m 8s/step - loss: 0.0020
Epoch 6/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m924s[0m 8s/step - loss: 0.0020
Epoch 7/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m920s[0m 8s/step - loss: 0.0020
Epoch 8/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m890s[0m 8s/step - loss: 0.0020
Epoch 9/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m907s[0m 8s/step - loss: 0.0020
Epoch 10/10
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m935s[0m 8s



In [12]:
# Generate new music
def generate_notes(model, network_input, n_vocab, sequence_length):
    start = np.random.randint(0, len(network_input)-1)
    pattern = network_input[start]
    prediction_output = []

    for note_index in range(500):
        prediction_input = np.reshape(pattern, (1, len(pattern), n_vocab))
        prediction = model.predict(prediction_input, verbose=0)
        prediction_output.append(prediction[0])
        pattern = np.vstack((pattern[1:], prediction))

    return prediction_output

# Example usage
prediction_output = generate_notes(model, network_input, n_vocab, sequence_length)

In [13]:
# Convert prediction to MIDI file
def create_midi(prediction_output, output_file):
    offset = 0
    output_notes = []

    for pattern in prediction_output:
        new_note = note.Note()
        new_note.offset = offset
        new_note.storedInstrument = instrument.Piano()
        output_notes.append(new_note)
        offset += 0.5

    midi_stream = stream.Stream(output_notes)
    midi_stream.write('midi', fp=output_file)

# Example usage
create_midi(prediction_output, 'generated_music.mid')