In [1]:
import os, json
import numpy as np
from pathlib import Path
from time import time
import torch

from utils.audio_segment_utils import segment_audio
from utils.riffusion_utils import audio_array_to_image
from cnet_riff_preprocessing import create_prompt_file, append_to_prompt_file, generate_and_save_control
from utils import spleeter_utils


# ==========================================================================================================================================
# PARAMETERS (edit this section)
# ==========================================================================================================================================
# Every tunable setting of the preprocessing run is collected in the `opt` dictionary.
opt = {
    # control methods: choose any combination from
    # "fullspec", "canny", "thresh", "bpf", "sobel", "sobeldenoise"
    "control_methods": ["fullspec", "canny", "sobel", "sobeldenoise"],
    # root folder the input data is loaded from
    "root_data_dir": os.path.join('../', 'gtzan'),
}
# input locations derived from the root folder
opt["raw_audio_dir"] = os.path.join(opt["root_data_dir"], 'raw-audio')
opt["prompt_labels_filepath"] = os.path.join(opt["root_data_dir"], 'prompt_labels.json')

opt.update({
    # folder the preprocessed data is saved to
    "data_root": 'gtzan-preprocessed',
    # True to print information about preprocessing as the script runs
    "verbose": True,
    # True to wipe anything present in existing prompt files
    "new_prompt_files": False,

    # parameters for control generation (if needed)
    "fs": 44100,
    "canny_low_thresh": 150,
    "canny_high_thresh": 200,
    "denoise_h": 15,

    # parameters for framing
    "frame_overlap": 0,             # overlap between frames, as a proportion between 0 and 1
    "frame_len_seconds": 5.110,     # length of a frame, in seconds
    "frame_min_power_prop": 0.4,    # minimum ratio between power in frame and power of stem, between 0 and 1
})
# ==========================================================================================================================================

"""
From a source audio array and a target audio array, make the desired training example: the target spectrogram image plus one control image per control method.
"""
def make_train_example(source_arr, target_arr, prompt, audio_filename, ex_no, opt):
    """
    Write one training example to disk and return the next example number.

    The target audio is converted to a spectrogram image that is saved under
    opt["target_root"]. The source audio is converted to a spectrogram that is kept in
    memory; for every entry of opt["control_methods"] a control image is generated from
    it and the (source, target, prompt) triple is appended to that method's prompt file.

    Parameters:
        source_arr: audio handed to audio_array_to_image to build the source spectrogram.
        target_arr: audio handed to audio_array_to_image to build the saved target spectrogram.
        prompt: text prompt stored with every source/target pair.
        audio_filename: prefix used when naming the example's image files.
        ex_no: number of this example; becomes the "_e<ex_no>" suffix of the file name.
        opt: options dictionary. Keys read here: "target_root", "data_root", "fs",
            "device", "control_methods", "verbose".

    Returns:
        ex_no + 1, i.e. the number to use for the next example.
    """
    # all images of one example share the same file name across the target/source folders
    example_stem = f'{audio_filename}_e{ex_no}'
    train_example_name = example_stem + '.jpg'
    target_path = os.path.join(opt["target_root"], train_example_name)

    # settings common to both spectrogram conversions
    spectrogram_kwargs = {"sample_rate": opt["fs"], "device": opt["device"]}

    # target spectrogram: saved to disk. The output path is passed without the ".jpg"
    # suffix; the extension is given separately through image_extension.
    audio_array_to_image(target_arr,
                         save_img=True,
                         outpath=os.path.join(opt["target_root"], example_stem),
                         image_extension="jpg",
                         **spectrogram_kwargs)

    # source spectrogram: not saved, only used as input for the control images below
    source_spectrogram = audio_array_to_image(source_arr,
                                              save_img=False,
                                              outpath="",
                                              **spectrogram_kwargs)

    for method in opt["control_methods"]:
        # each control method has its own "source-<method>" folder and prompt file
        source_path = os.path.join(opt["data_root"], "source-" + method, train_example_name)
        generate_and_save_control(source_spectrogram, source_path, method, opt)
        append_to_prompt_file(opt["data_root"],
                              [source_path],
                              [target_path],
                              prompt,
                              prompt_filename=f"prompt-{method}.json",
                              verbose=False)

    if opt["verbose"]:
        print(f"     {ex_no} - prompt: {prompt}")
    return ex_no + 1

# Run setup: pick the compute device, create the output folders, list the input audio
# files and load the per-file prompts. Everything is stored in `opt` or in module-level
# variables that the following cells read.

# tracking
num_examples_total = 0
time_start = time()

# cuda if possible
opt["device"] = "cuda" if torch.cuda.is_available() else "cpu"

# control/target data folders: one "source-<method>" folder per control method, plus "target"
opt["control_roots"] = [os.path.join(opt["data_root"], "source-"+mthd) for mthd in opt["control_methods"]]
opt["target_root"] = os.path.join(opt["data_root"], 'target')

# make all directories needed
os.makedirs(opt["data_root"], exist_ok=True)
for control_root in opt["control_roots"]:
    os.makedirs(control_root, exist_ok=True)
os.makedirs(opt["target_root"], exist_ok=True)

# get all data examples
# os.listdir returns entries in arbitrary order and also lists non-audio entries (for
# example hidden files), so keep only ".wav" files and sort them: indexing into
# audio_files is then reproducible, and the file name handling in the next cell, which
# relies on the ".wav" extension, never sees a foreign entry.
audio_files = sorted(
    entry for entry in os.listdir(opt["raw_audio_dir"]) if entry.endswith(".wav")
)

# get all prompts in prompt_file as dictionary
# The label file holds one JSON object per line, each with a 'file' and a 'prompt' key.
prompt_dict = {}
p_count = 0
with open(opt["prompt_labels_filepath"], 'r', encoding="utf-8") as prompt_file:
    for line in prompt_file:
        if not line.strip():
            # a blank line (e.g. a trailing newline at the end of the file) is not a record
            continue
        data = json.loads(line)
        prompt_dict[data['file']] = data['prompt']
        p_count += 1
if opt["verbose"]:
    # report the file that was actually read
    print(f"Read {p_count} prompts from {opt['prompt_labels_filepath']}.")


  from .autonotebook import tqdm as notebook_tqdm


Read 178 prompts from prompt_file.json.


In [7]:
# Select one input file and split it into stems.
num_file = 0
audio_file = audio_files[num_file]

if opt["verbose"]:
    print(f"AUDIO FILE {num_file}/{len(audio_files)}:")

# File name without its extension, used as the prefix of every generated example.
# os.path.splitext removes only the final extension; slicing at the first ".wav"
# substring would raise ValueError for a name without ".wav" and cut too early for a
# name that contains ".wav" before its real extension.
audio_filename = os.path.splitext(audio_file)[0]

# audio splitting: the returned mapping is read for the accompaniment stem, the
# full audio and the vocal stem
splits = spleeter_utils.separate_audio(os.path.join(opt["raw_audio_dir"], audio_file), fs=opt["fs"], stem_num=2)
accompaniment_audio = splits['accompaniment']
full_audio = splits['full_audio']
vocal_audio = splits['vocals']

AUDIO FILE 0/178:
INFO:tensorflow:Using config: {'_model_dir': 'pretrained_models/2stems', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.7
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Apply unet for vocals_spectrogram


Exception ignored in: <generator object Estimator.predict at 0x17b6510b0>
Traceback (most recent call last):
  File "/Users/zachary/miniconda3/envs/mel-gen/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 650, in predict
    yield {
  File "/Users/zachary/miniconda3/envs/mel-gen/lib/python3.8/contextlib.py", line 131, in __exit__
    self.gen.throw(type, value, traceback)
  File "/Users/zachary/miniconda3/envs/mel-gen/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 5874, in get_controller
    yield g
  File "/Users/zachary/miniconda3/envs/mel-gen/lib/python3.8/contextlib.py", line 131, in __exit__
    self.gen.throw(type, value, traceback)
  File "/Users/zachary/miniconda3/envs/mel-gen/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 5684, in get_controller
    raise AssertionError(
AssertionError: Nesting violated for default stack of <class 'tensorflow.python.framework.ops.Graph'> objects


INFO:tensorflow:Apply unet for accompaniment_spectrogram
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from pretrained_models/2stems/model
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [9]:
# Segment the full audio, the accompaniment and the vocals with identical settings, in
# that order. The later cells pair the three results by index.
# NOTE(review): this presumes segment_audio cuts the same positions on every call —
# confirm it does not pick segment positions randomly per call.
full_audio_segments, accompaniment_audio_segments, vocal_audio_segments = (
    segment_audio(stem_audio, fs=opt["fs"], num_segments=5, pitch_augment=True)
    for stem_audio in (full_audio, accompaniment_audio, vocal_audio)
)

In [10]:
# Drop the segments in which the vocals are too quiet: a segment is kept only when the
# norm of its vocal part is larger than 10% of the norm of its accompaniment part.
acceptable_inds = []
for seg_idx in range(len(accompaniment_audio_segments)):
    vocal_norm = np.linalg.norm(vocal_audio_segments[seg_idx])
    accompaniment_norm = np.linalg.norm(accompaniment_audio_segments[seg_idx])
    if vocal_norm > accompaniment_norm*0.1:
        acceptable_inds.append(seg_idx)
        continue
    if opt["verbose"]:
        print(f"    Vocals not detected in segement {seg_idx}")

# apply the same selection to all three arrays so that they stay aligned by index
full_audio_segments, accompaniment_audio_segments, vocal_audio_segments = (
    full_audio_segments[acceptable_inds],
    accompaniment_audio_segments[acceptable_inds],
    vocal_audio_segments[acceptable_inds],
)

In [11]:
# Report how many segments of this audio file are left after the vocal filter.
if opt["verbose"]:
    print(f"  Total number of segments for {audio_filename}: {len(full_audio_segments)}")

  Total number of segments for rock.00011: 35


In [12]:
 # Prompt for this audio file: its labelled prompt when one exists, a generic one otherwise.
song_prompt = prompt_dict.get(audio_file, "Generate a vocal melody.")
# Labelled prompts only: 10% of the time, also mention the given background audio.
# (The random draw happens only when the file has a labelled prompt.)
if audio_file in prompt_dict and np.random.rand() < 0.1:
    song_prompt = "Given background audio, " + song_prompt

# One training example per remaining segment: the accompaniment segment is the source
# and the full-audio segment with the same index is the target.
ex_no = 0
segment_total = len(full_audio_segments)
for seg_idx in range(segment_total):
    if opt["verbose"]:
        print(f"  EX {seg_idx}/{segment_total}:")
    # make_train_example returns the number to use for the next example
    ex_no = make_train_example(
        source_arr=accompaniment_audio_segments[seg_idx],
        target_arr=full_audio_segments[seg_idx],
        prompt=song_prompt,
        audio_filename=audio_filename,
        ex_no=ex_no,
        opt=opt,
    )

  EX 0/35:
     0 - prompt: Generate an up-tempo female vocal rock melody.
  EX 1/35:
     1 - prompt: Generate an up-tempo female vocal rock melody.
  EX 2/35:
     2 - prompt: Generate an up-tempo female vocal rock melody.
  EX 3/35:
     3 - prompt: Generate an up-tempo female vocal rock melody.
  EX 4/35:
     4 - prompt: Generate an up-tempo female vocal rock melody.
  EX 5/35:
     5 - prompt: Generate an up-tempo female vocal rock melody.
  EX 6/35:
     6 - prompt: Generate an up-tempo female vocal rock melody.
  EX 7/35:
     7 - prompt: Generate an up-tempo female vocal rock melody.
  EX 8/35:
     8 - prompt: Generate an up-tempo female vocal rock melody.
  EX 9/35:
     9 - prompt: Generate an up-tempo female vocal rock melody.
  EX 10/35:
     10 - prompt: Generate an up-tempo female vocal rock melody.
  EX 11/35:
     11 - prompt: Generate an up-tempo female vocal rock melody.
  EX 12/35:
     12 - prompt: Generate an up-tempo female vocal rock melody.
  EX 13/35:
     13