In [None]:
import os
import torch
import librosa

import numpy as np

from scipy.stats import zscore

from IPython.display import Audio, display
from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration

In [None]:
# Dataset location. The defaults are the original cluster paths; set the
# SLAKH_PATH / SLAKH_TRACK environment variables to run the notebook against a
# different copy of the dataset or a different track without editing this cell.
slakh_path = os.environ.get("SLAKH_PATH", "/engram/naplab/shared/Slakh2100/slakh2100_flac_redux")
track_path = os.environ.get("SLAKH_TRACK", "test/Track01881")

In [None]:
# Load the track's full mix; sr=None asks librosa to keep the file's native
# sampling rate instead of resampling.
mix_file = os.path.join(slakh_path, track_path, "mix.flac")
audio, sr = librosa.load(mix_file, sr=None)

In [None]:
# Inline player for the mix loaded above.
Audio(data=audio, rate=sr)

In [None]:
# Play every .flac stem of the track, one player per stem.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# and the loop uses its own variable names so it does not overwrite the `sr`
# that other cells read.
stems_dir = os.path.join(slakh_path, track_path, 'stems')
for stem_file in sorted(name for name in os.listdir(stems_dir) if name.endswith('.flac')):
    print(stem_file)
    stem_audio, stem_sr = librosa.load(os.path.join(stems_dir, stem_file), sr=None)
    display(Audio(stem_audio, rate=stem_sr))

In [None]:
# Pretrained MusicGen-Melody processor and model, both pulled from the same
# hub identifier.
musicgen_id = "facebook/musicgen-melody"
processor = AutoProcessor.from_pretrained(musicgen_id)
model = MusicgenMelodyForConditionalGeneration.from_pretrained(musicgen_id)

In [None]:
# Checkpoint to load on top of the pretrained model.
checkpoint_path = "/home/sd3705/music_gen_2024f/audiocraft_output_sd3705/xps/both_indv_mix/checkpoint.th"
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# NOTE(review): the path suggests an audiocraft training checkpoint -- confirm
# it is a flat state dict with keys that match this model before relying on it.
# Tensors are mapped to CPU (use "cuda" instead if a GPU is available).
checkpoint = torch.load(checkpoint_path, map_location="cpu")

In [None]:
# strict=False skips every key that does not match without raising, so a
# checkpoint with a different layout would load nothing and go unnoticed.
# Summarise what was actually loaded instead of trusting the call.
load_result = model.load_state_dict(checkpoint, strict=False)
n_expected = len(model.state_dict())
n_loaded = n_expected - len(load_result.missing_keys)
print(
    f"Loaded {n_loaded}/{n_expected} model tensors from the checkpoint; "
    f"ignored {len(load_result.unexpected_keys)} unexpected checkpoint keys."
)
if n_loaded == 0:
    print(
        "WARNING: no checkpoint key matched the model, so the model still has "
        "its pretrained weights. Check the checkpoint's key layout."
    )

In [None]:
# Conditioning audio: a 10-second excerpt of stem S01, from 1.5 s to 11.5 s.
# The sampling rate is taken from this file itself rather than from whatever
# value `sr` happened to hold after earlier cells, so the slice boundaries and
# playback rate always match the loaded audio.
aud_input, sr = librosa.load(os.path.join(slakh_path, track_path, 'stems', 'S01.flac'), sr=None)
aud_input = aud_input[int(sr*1.5):int(sr*11.5)]
display(Audio(aud_input, rate=sr))

In [None]:
# Reference audio: the same 1.5 s - 11.5 s window taken from stem S02.
# The file's own sampling rate is kept in a dedicated variable so the slice is
# correct for this file and the shared `sr` is left untouched.
aud_output, aud_output_sr = librosa.load(os.path.join(slakh_path, track_path, 'stems', 'S02.flac'), sr=None)
aud_output = aud_output[int(aud_output_sr*1.5):int(aud_output_sr*11.5)]
display(Audio(aud_output, rate=aud_output_sr))

In [None]:
# Generate audio conditioned on the stem excerpt and the prompt "Acoustic Guitar".
# The stem is at its native rate `sr`, while the feature extractor computes the
# melody features at its own configured rate, so resample first and pass the
# rate that actually matches the audio.
target_sr = processor.feature_extractor.sampling_rate
melody_audio = librosa.resample(aud_input, orig_sr=sr, target_sr=target_sr)
inputs = processor(
    audio=melody_audio,
    sampling_rate=target_sr,
    text=["Acoustic Guitar"],
    padding=True,
    return_tensors="pt",
)
# do_sample=True makes the output stochastic: re-running gives different audio.
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)

In [None]:
# Play the first generated sample at the rate reported by the model's audio
# encoder config.
sampling_rate = model.config.audio_encoder.sampling_rate
generated_waveform = audio_values[0].numpy()
Audio(generated_waveform, rate=sampling_rate)

In [None]:
# Generate audio conditioned on the stem excerpt and the prompt
# "add piano to the track".
# The stem is at its native rate `sr`, while the feature extractor computes the
# melody features at its own configured rate, so resample first and pass the
# rate that actually matches the audio.
target_sr = processor.feature_extractor.sampling_rate
melody_audio = librosa.resample(aud_input, orig_sr=sr, target_sr=target_sr)
inputs = processor(
    audio=melody_audio,
    sampling_rate=target_sr,
    text=["add piano to the track"],
    padding=True,
    return_tensors="pt",
)
# do_sample=True makes the output stochastic: re-running gives different audio.
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)

In [None]:
# Play the first generated sample at the rate reported by the model's audio
# encoder config.
sampling_rate = model.config.audio_encoder.sampling_rate
generated_waveform = audio_values[0].numpy()
Audio(generated_waveform, rate=sampling_rate)

In [None]:
# Generate audio conditioned on the stem excerpt and the prompt
# "piano, electro guitar".
# The stem is at its native rate `sr`, while the feature extractor computes the
# melody features at its own configured rate, so resample first and pass the
# rate that actually matches the audio.
target_sr = processor.feature_extractor.sampling_rate
melody_audio = librosa.resample(aud_input, orig_sr=sr, target_sr=target_sr)
inputs = processor(
    audio=melody_audio,
    sampling_rate=target_sr,
    text=["piano, electro guitar"],
    padding=True,
    return_tensors="pt",
)
# do_sample=True makes the output stochastic: re-running gives different audio.
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)

In [None]:
# Play the first generated sample at the rate reported by the model's audio
# encoder config.
sampling_rate = model.config.audio_encoder.sampling_rate
generated_waveform = audio_values[0].numpy()
Audio(generated_waveform, rate=sampling_rate)

In [None]:
# Generate audio conditioned on the stem excerpt and the prompt "only piano";
# the result is kept in `only_piano` for the mixing cells below.
# The stem is at its native rate `sr`, while the feature extractor computes the
# melody features at its own configured rate, so resample first and pass the
# rate that actually matches the audio.
target_sr = processor.feature_extractor.sampling_rate
melody_audio = librosa.resample(aud_input, orig_sr=sr, target_sr=target_sr)
inputs = processor(
    audio=melody_audio,
    sampling_rate=target_sr,
    text=["only piano"],
    padding=True,
    return_tensors="pt",
)
# do_sample=True makes the output stochastic: re-running gives different audio.
only_piano = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)

In [None]:
# Play the first "only piano" sample; `sampling_rate` is the value set by an
# earlier playback cell.
Audio(data=only_piano[0].numpy(), rate=sampling_rate)

In [None]:
# Sample count of the conditioning excerpt (at its native rate).
np.shape(aud_input)

In [None]:
# Shape of the generated audio once singleton dimensions are dropped.
tuple(only_piano.squeeze().shape)

In [None]:
# Guitar excerpt matched to the generated piano clip.
# The sample count of the generated tensor is its LAST dimension; dimension 0 is
# the batch, so slicing with shape[0] would keep only about one sample.
# The stem is also resampled from its native rate to `sampling_rate` so that it
# lines up in time with the generated audio and plays back at the right pitch.
n_generated = only_piano.shape[-1]
only_guitar = librosa.resample(aud_input, orig_sr=sr, target_sr=sampling_rate)[:n_generated]
# `only_piano` is deliberately left as a tensor here: converting it in place
# would break the cells above that index it as a tensor when they are re-run.

In [None]:
# Mean amplitude (DC offset) of the guitar excerpt.
only_guitar.mean()

In [None]:
# Shape of the z-scored guitar excerpt.
np.shape(zscore(only_guitar))

In [None]:
# Play the z-scored guitar excerpt at the generated audio's rate.
guitar_normalized = zscore(only_guitar)
Audio(guitar_normalized, rate=sampling_rate)

In [None]:
# Mix the generated piano with the guitar stem, each z-scored so that neither
# dominates purely because of its level.
# The generated tensor is flattened to 1-D first: z-scoring it with its leading
# singleton dimensions would normalise along a length-1 axis and yield NaN.
# (Assumes a single mono sample was generated -- TODO confirm.)
piano_wave = only_piano.squeeze().numpy()
# The guitar is resampled from `aud_input` to `sampling_rate` here so both
# signals share one rate, then both are trimmed to a common length before
# summing.
guitar_wave = librosa.resample(aud_input, orig_sr=sr, target_sr=sampling_rate)
n_mix = min(len(piano_wave), len(guitar_wave))
Audio(zscore(piano_wave[:n_mix]) + zscore(guitar_wave[:n_mix]), rate=sampling_rate)