# humming2music Demo Pipeline
Linear notebook that stitches modules 01-09 together.

Adjust the variables in each cell to run the full pipeline. Dependencies: pydub, librosa, numpy, sounddevice (optional for recording), ffmpeg for pydub/mp3.


In [1]:
import sys
import platform

print("Python exe:", sys.executable)
print("Python version:", platform.python_version())

import sys
print("Using interpreter:", sys.executable)

# Âú®ÂΩìÂâçÂÜÖÊ†∏ÂØπÂ∫îÁöÑ python ÈáåË£ÖÂ∫ì
import subprocess
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "librosa", "pydub", "soundfile", "sounddevice",
])


Python exe: /opt/homebrew/Caskroom/miniconda/base/envs/music_gen/bin/python
Python version: 3.10.15
Using interpreter: /opt/homebrew/Caskroom/miniconda/base/envs/music_gen/bin/python


0

In [2]:
import importlib.metadata as md

print("librosa:", md.version("librosa"))
print("pydub:", md.version("pydub"))


librosa: 0.11.0
pydub: 0.25.1


In [3]:
# 0. Setup paths and imports
from pathlib import Path
import sys
import librosa, pydub

print("Python exe:", sys.executable)

PROJECT_ROOT = Path('..').resolve()
sys.path.append(str(PROJECT_ROOT))

from src.config import (
    GLOBAL_AUDIO_CONFIG,
    DEFAULT_PREPROCESSING_CONFIG,
    DEFAULT_MELODY_EXTRACTION_CONFIG,
    DEFAULT_MELODY_REPRESENTATION_CONFIG,
    GLOBAL_STYLE_CONFIG,
    DEFAULT_POSTPROCESSING_CONFIG,
    DEFAULT_SIMILARITY_CONFIG,
    RAW_AUDIO_DIR, PROCESSED_AUDIO_DIR, GENERATED_AUDIO_DIR, POSTPROCESSED_AUDIO_DIR, EVAL_OUTPUT_DIR,
)

from src.audio_input import AudioInputManager
from src.preprocessing import Preprocessor
from src.melody_extraction import MelodyExtractor
from src.melody_representation import MelodyRepresenter
from src.style_and_model_config import StyleConfigManager
from src.music_generation import MusicGenerator
from src.postprocessing_export import Postprocessor
from src.similarity_evaluation import SimilarityEvaluator

for d in [RAW_AUDIO_DIR, PROCESSED_AUDIO_DIR, GENERATED_AUDIO_DIR, POSTPROCESSED_AUDIO_DIR, EVAL_OUTPUT_DIR]:
    Path(d).mkdir(parents=True, exist_ok=True)

print('Project root:', PROJECT_ROOT)
print('Data/raw:', RAW_AUDIO_DIR)
print('Outputs/generated:', GENERATED_AUDIO_DIR)

Python exe: /opt/homebrew/Caskroom/miniconda/base/envs/music_gen/bin/python


scikit-learn version 1.7.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.5.1. Disabling scikit-learn conversion API.
Torch version 2.9.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.7.0 is the most recent version that has been tested.
  warn(
  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


Project root: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music
Data/raw: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/raw
Outputs/generated: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/outputs/generated


In [4]:
# 1. Audio input (upload or record)
from pathlib import Path

SESSION_ID = "demo_sine"

# Áî®ÁîüÊàêÁöÑÊµãËØïÊñá‰ª∂
UPLOAD_PATH = "/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/raw/womanhumming.mp3"

audio_manager = AudioInputManager()
audio_meta = audio_manager.ingest_upload(UPLOAD_PATH, session_id=SESSION_ID)

audio_meta_dict = audio_meta.to_dict()
audio_meta_dict


{'path': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/raw/raw_input_20251201_231802_sessiondemo_sine.wav',
 'duration_sec': 19.722,
 'sample_rate': 16000,
 'channels': 1,
 'format': 'wav',
 'source_type': 'uploaded'}

In [5]:
# 2. Preprocessing
preprocessor = Preprocessor()
pre_meta = preprocessor.preprocess(audio_meta.path)
pre_meta_dict = pre_meta.to_dict()
pre_meta_dict


{'path': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav',
 'original_duration_sec': 19.722,
 'processed_duration_sec': 18.415,
 'sample_rate': 16000,
 'applied_steps': ['trim_silence', 'highpass', 'normalize'],
 'notes': ''}

In [6]:
# 3. Melody extraction
extractor = MelodyExtractor()
contour = extractor.extract(pre_meta.path)
contour_dict = contour.to_dict()
print(f"Generated MIDI: {contour_dict['midi_path']}")
contour_dict['metadata']



Predicting MIDI for /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav...
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32


  Creating midi...
  üíÖ Saved to /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine_basic_pitch.mid
Generated MIDI: /Users/xijiecao/Desktop/course/s3

{'extractor': 'basic_pitch (onnx)',
 'source_audio': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav'}

In [7]:
# 4. Melody representation
representer = MelodyRepresenter()

rep = representer.represent(midi_path=contour.midi_path)
rep_dict = rep.to_dict()

rep_summary = {
    'notes': len(rep_dict['note_sequence']),
    'tempo_bpm': rep_dict['rhythm_profile'].get('estimated_tempo_bpm'),
}
rep_summary


{'notes': 52, 'tempo_bpm': 183.91386953768213}

In [8]:
# 5. Style selection
style_manager = StyleConfigManager()
available_styles = style_manager.list_styles()
STYLE_NAME = available_styles[1]  # pick first by default
style_config = style_manager.get_style(STYLE_NAME)
style_config.to_dict()


{'name': 'ambient',
 'description': 'Ethereal ambient soundscape with pads and evolving textures.',
 'mood': 'calm',
 'tempo_bpm': 70,
 'instruments': ['pads', 'drones', 'textures'],
 'model_configs': {'stub': {'model_name': 'stub',
   'prompt': 'A calm ambient soundscape with airy pads and evolving textures.',
   'max_duration_sec': 30}}}

In [9]:
# 6. Music generation 

generator = MusicGenerator(model_size='melody', device='cpu') 

if hasattr(style_config, 'to_dict'):
    style_data = style_config.to_dict()
else:
    style_data = style_config

prompt = style_data.get('description', f"A song in {STYLE_NAME} style")
print(f"Generating with prompt: {prompt}")

gen_result = generator.generate(
    melody_representation=rep_dict, 
    melody_audio_path=pre_meta.path, 
    style_name=STYLE_NAME,
    prompt_text=prompt,
    duration_sec=10 
)

gen_result.to_dict()

You are using a model of type musicgen_melody to instantiate a model of type musicgen. This is not supported for all configurations of models and can yield errors.
  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:11<00:00,  5.58s/it]
Some weights of MusicgenForConditionalGeneration were not initialized from the model checkpoint at facebook/musicgen-melody and are newly initialized: ['decoder.model.decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.model.decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.model.decoder.layers.1.encoder_attn.k_proj.weight', 'deco

Generating with prompt: Ethereal ambient soundscape with pads and evolving textures.




{'model_name': 'musicgen-transformers',
 'style_name': 'ambient',
 'audio_path': 'outputs/generated/gen_ambient_20251201_232336.wav',
 'duration_sec': 10,
 'generation_metadata': {'prompt': 'Ethereal ambient soundscape with pads and evolving textures.'}}

In [10]:
# 7. Post-processing & export
postprocessor = Postprocessor()
post_result = postprocessor.process(gen_result.audio_path, style_name=STYLE_NAME, model_name=gen_result.model_name)
post_result_dict = post_result.to_dict()
post_result_dict


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'final_audio_path': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/outputs/final/gen_ambient_20251201_232336_final.wav',
 'final_audio_path_mp3': None,
 'duration_sec': 29.22,
 'sample_rate': 16000,
 'postprocessing_applied': ['normalize', 'fade_in', 'fade_out'],
 'style_name': 'ambient',
 'model_name': 'musicgen-transformers'}

In [11]:
# 8. Similarity evaluation (original vs generated)

evaluator = SimilarityEvaluator()

print(f"Comparing Original: {pre_meta.path}")
print(f"     vs Generated: {post_result.final_audio_path}")

sim_report = evaluator.evaluate(
    original_processed_audio=pre_meta.path,       
    generated_audio=post_result.final_audio_path, 
    style_name=STYLE_NAME,
    model_name=gen_result.model_name,
)

sim_report.to_dict()


Comparing Original: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav
     vs Generated: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/outputs/final/gen_ambient_20251201_232336_final.wav


{'style_name': 'ambient',
 'model_name': 'musicgen-transformers',
 'pitch_similarity': 0.8784857799686372,
 'rhythm_similarity': 0.8784857799686372,
 'overall_similarity': 0.8784857799686372,
 'metadata': {'method': 'Audio Chroma DTW (Cosine)'}}