In [8]:
# Mount Google Drive at /content/drive; the output path configured below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import os

# Folder (under the /content/drive mount point) that receives the generated audio
root_path = '/content/drive/MyDrive/AIMusicGenerator/music_voice/'
# Full path of the WAV file that is later passed to synthesize_singing as output_file
song_file = os.path.join(root_path, "lalasong.wav")

In [6]:
!pip install pydub
!pip install gtts

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.4


In [10]:
import os
import tempfile

import librosa
import numpy as np
import soundfile as sf
from gtts import gTTS
from pydub import AudioSegment
from tqdm import tqdm

# Example melody and lyrics: three parallel lists, one entry per sung note
melody_notes = [60, 62, 64, 65, 67, 69, 71, 72]  # MIDI note numbers; synthesize_singing leaves note 60 unshifted
durations = [0.5, 0.5, 1, 1, 0.5, 0.5, 1, 1]    # Labelled as beats, but synthesize_singing computes sr * duration, i.e. uses them as seconds
lyrics = ["La", "la", "la", "la", "sing", "a", "song", "now"]

def prepare_input(melody_notes, durations, lyrics):
    """
    Bundle parallel note, duration and lyric sequences into per-note records.

    Parameters:
    - melody_notes: Sequence of note values, one per sung syllable.
    - durations: Sequence of durations, aligned with melody_notes.
    - lyrics: Sequence of lyric strings, aligned with melody_notes.

    Returns:
    - A list of dictionaries with the keys "note", "duration" and "lyric",
      in the same order as the inputs.

    Raises:
    - AssertionError: if the three sequences do not have the same length.
    """
    # All three sequences must describe the same number of notes
    assert len(melody_notes) == len(durations) == len(lyrics), "Input lengths must match."

    return [
        {"note": pitch, "duration": length, "lyric": syllable}
        for pitch, length, syllable in zip(melody_notes, durations, lyrics)
    ]

# Combine the example lists into per-note records and print them for inspection
input_data = prepare_input(melody_notes, durations, lyrics)
print("Prepared input data:", input_data)

def _fit_to_length(clip, num_samples):
    """Loop or truncate a 1-D clip to exactly num_samples samples (zeros if the clip is empty)."""
    if num_samples == 0 or len(clip) == 0:
        # An empty clip cannot be looped; this also avoids dividing by zero below
        return np.zeros(num_samples, dtype=clip.dtype)
    repeats = -(-num_samples // len(clip))  # ceiling division
    return np.tile(clip, repeats)[:num_samples]


def _load_tts_waveform(lyric, sr, workdir):
    """Speak one lyric with gTTS and return the waveform loaded by librosa at rate sr."""
    mp3_path = os.path.join(workdir, "tts.mp3")
    wav_path = os.path.join(workdir, "tts.wav")
    gTTS(lyric, lang="en").save(mp3_path)
    # Convert the saved MP3 to WAV before loading it
    AudioSegment.from_file(mp3_path, format="mp3").export(wav_path, format="wav")
    waveform, _ = librosa.load(wav_path, sr=sr)
    return waveform


def synthesize_singing(input_data, output_file="singing.wav", sr=16000, base_note=60):
    """
    Synthesize a singing voice by combining TTS for lyrics and pitch adjustment for melody.

    Each lyric is spoken with gTTS, pitch-shifted by (note - base_note)
    semitones, looped or truncated to the requested duration, and the
    resulting clips are concatenated into a single WAV file.

    Parameters:
    - input_data: Iterable of dictionaries with the keys "note", "duration"
      and "lyric". "duration" is multiplied by sr, i.e. it is interpreted
      in seconds.
    - output_file: Name of the output WAV file.
    - sr: Sampling rate for audio.
    - base_note: Note value that is left unshifted; other notes are shifted
      relative to it. Defaults to 60, the value that was previously hard-coded.

    Raises:
    - ValueError: if input_data is empty or a duration is negative.
    """
    entries = list(input_data)

    # Fail fast, before any TTS request is made
    if not entries:
        raise ValueError("input_data must contain at least one note.")
    for index, data in enumerate(entries):
        if data["duration"] < 0:
            raise ValueError(f"Note {index} has a negative duration: {data['duration']!r}")

    tts_cache = {}  # lyric -> unshifted waveform, so repeated lyrics are requested only once
    generated_audio = []

    # Intermediate MP3/WAV files live in a private directory that is removed afterwards
    with tempfile.TemporaryDirectory() as workdir:
        for data in tqdm(entries, desc="Generating singing audio"):
            lyric = data["lyric"]
            if lyric not in tts_cache:
                tts_cache[lyric] = _load_tts_waveform(lyric, sr, workdir)
            voice = tts_cache[lyric]

            # Adjust pitch to match the note (skipped when there is no audio to shift)
            if len(voice):
                voice = librosa.effects.pitch_shift(voice, sr=sr, n_steps=data["note"] - base_note)

            # Repeat or cut the clip so it lasts exactly the requested duration
            num_samples = int(sr * data["duration"])
            generated_audio.append(_fit_to_length(voice, num_samples))

    # Combine all audio clips
    final_audio = np.concatenate(generated_audio)
    sf.write(output_file, final_audio, samplerate=sr)
    print(f"Singing voice saved as {output_file}")

# Generate the singing audio and write it to the path held in song_file
output_file = song_file
synthesize_singing(input_data, output_file=output_file)


Prepared input data: [{'note': 60, 'duration': 0.5, 'lyric': 'La'}, {'note': 62, 'duration': 0.5, 'lyric': 'la'}, {'note': 64, 'duration': 1, 'lyric': 'la'}, {'note': 65, 'duration': 1, 'lyric': 'la'}, {'note': 67, 'duration': 0.5, 'lyric': 'sing'}, {'note': 69, 'duration': 0.5, 'lyric': 'a'}, {'note': 71, 'duration': 1, 'lyric': 'song'}, {'note': 72, 'duration': 1, 'lyric': 'now'}]


Generating singing audio: 100%|██████████| 8/8 [00:02<00:00,  3.80it/s]


Singing voice saved as /content/drive/MyDrive/AIMusicGenerator/music_voice/lalasong.wav
