# humming2music Demo Pipeline
Linear notebook that stitches modules 01-09 together.

Adjust the variables in each cell to run the full pipeline. Dependencies: pydub, librosa, numpy, sounddevice (optional for recording), ffmpeg for pydub/mp3.


In [3]:
import sys
import platform

print("Python exe:", sys.executable)
print("Python version:", platform.python_version())

import sys
print("Using interpreter:", sys.executable)

# 在当前内核对应的 python 里装库
import subprocess
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "librosa", "pydub", "soundfile", "sounddevice",
])


Python exe: /opt/miniconda3/envs/api/bin/python
Python version: 3.10.19
Using interpreter: /opt/miniconda3/envs/api/bin/python
Collecting pydub
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting sounddevice
  Using cached sounddevice-0.5.3-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl.metadata (1.6 kB)
Collecting numba>=0.51.0 (from librosa)
  Using cached numba-0.62.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.8 kB)
Collecting llvmlite<0.46,>=0.45.0dev0 (from numba>=0.51.0->librosa)
  Using cached llvmlite-0.45.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.8 kB)
Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Using cached sounddevice-0.5.3-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl (108 kB)
Using cached numba-0.62.1-cp310-cp310-macosx_11_0_arm64.whl (2.7 MB)
Using cached llvmlite-0.45.1-cp310-cp310-macosx_11_0_arm64.whl (37.3 MB)
Installing collected packages: pydub, llvmlite, sounddevice, numba
[2K   [90m━━━━━━━━━━━━━━━━━━

0

In [5]:
import importlib.metadata as md

print("librosa:", md.version("librosa"))
print("pydub:", md.version("pydub"))


librosa: 0.10.1
pydub: 0.25.1


In [None]:
# 0. Setup paths and imports
from pathlib import Path
import sys
import librosa, pydub

print("Python exe:", sys.executable)

PROJECT_ROOT = Path('..').resolve()
sys.path.append(str(PROJECT_ROOT))

from src.config import (
    GLOBAL_AUDIO_CONFIG,
    DEFAULT_PREPROCESSING_CONFIG,
    DEFAULT_MELODY_EXTRACTION_CONFIG,
    DEFAULT_MELODY_REPRESENTATION_CONFIG,
    GLOBAL_STYLE_CONFIG,
    DEFAULT_POSTPROCESSING_CONFIG,
    DEFAULT_SIMILARITY_CONFIG,
    RAW_AUDIO_DIR, PROCESSED_AUDIO_DIR, GENERATED_AUDIO_DIR, POSTPROCESSED_AUDIO_DIR, EVAL_OUTPUT_DIR,
)

from src.audio_input import AudioInputManager
from src.preprocessing import Preprocessor
from src.melody_extraction import MelodyExtractor
from src.melody_representation import MelodyRepresenter
from src.style_and_model_config import StyleConfigManager
from src.music_generation import MusicGenerator
from src.postprocessing_export import Postprocessor
from src.similarity_evaluation import SimilarityEvaluator

for d in [RAW_AUDIO_DIR, PROCESSED_AUDIO_DIR, GENERATED_AUDIO_DIR, POSTPROCESSED_AUDIO_DIR, EVAL_OUTPUT_DIR]:
    Path(d).mkdir(parents=True, exist_ok=True)

print('Project root:', PROJECT_ROOT)
print('Data/raw:', RAW_AUDIO_DIR)
print('Outputs/generated:', GENERATED_AUDIO_DIR)


Python exe: /opt/miniconda3/envs/api/bin/python
Project root: /Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music
Data/raw: /Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music/data/raw
Outputs/generated: /Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music/outputs/generated


In [7]:
# 1. Audio input (upload or record)
from pathlib import Path

SESSION_ID = "demo_sine"

# 用生成的测试文件
UPLOAD_PATH = RAW_AUDIO_DIR / "test_sine_440.wav"

audio_manager = AudioInputManager()
audio_meta = audio_manager.ingest_upload(UPLOAD_PATH, session_id=SESSION_ID)

audio_meta_dict = audio_meta.to_dict()
audio_meta_dict


{'path': '/Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music/data/raw/raw_input_20251130_204558_sessiondemo_sine.wav',
 'duration_sec': 3.0,
 'sample_rate': 16000,
 'channels': 1,
 'format': 'wav',
 'source_type': 'uploaded'}

In [8]:
# 2. Preprocessing
preprocessor = Preprocessor()
pre_meta = preprocessor.preprocess(audio_meta.path)
pre_meta_dict = pre_meta.to_dict()
pre_meta_dict


{'path': '/Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music/data/processed/processed_input_20251130_204558_sessiondemo_sine.wav',
 'original_duration_sec': 3.0,
 'processed_duration_sec': 3.0,
 'sample_rate': 16000,
 'applied_steps': ['trim_silence', 'highpass', 'normalize'],
 'notes': ''}

In [9]:
# 3. Melody extraction
extractor = MelodyExtractor()
contour = extractor.extract(pre_meta.path)
contour_dict = contour.to_dict()
{k: contour_dict[k] if k != 'f0_hz' else f'<{len(contour_dict[k])} values>' for k in contour_dict}


  from pkg_resources import resource_filename


{'time': [0.0,
  0.016,
  0.032,
  0.048,
  0.064,
  0.08,
  0.096,
  0.112,
  0.128,
  0.144,
  0.16,
  0.176,
  0.192,
  0.208,
  0.224,
  0.24,
  0.256,
  0.272,
  0.288,
  0.304,
  0.32,
  0.336,
  0.352,
  0.368,
  0.384,
  0.4,
  0.416,
  0.432,
  0.448,
  0.464,
  0.48,
  0.496,
  0.512,
  0.528,
  0.544,
  0.56,
  0.576,
  0.592,
  0.608,
  0.624,
  0.64,
  0.656,
  0.672,
  0.688,
  0.704,
  0.72,
  0.736,
  0.752,
  0.768,
  0.784,
  0.8,
  0.816,
  0.832,
  0.848,
  0.864,
  0.88,
  0.896,
  0.912,
  0.928,
  0.944,
  0.96,
  0.976,
  0.992,
  1.008,
  1.024,
  1.04,
  1.056,
  1.072,
  1.088,
  1.104,
  1.12,
  1.136,
  1.152,
  1.168,
  1.184,
  1.2,
  1.216,
  1.232,
  1.248,
  1.264,
  1.28,
  1.296,
  1.312,
  1.328,
  1.344,
  1.36,
  1.376,
  1.392,
  1.408,
  1.424,
  1.44,
  1.456,
  1.472,
  1.488,
  1.504,
  1.52,
  1.536,
  1.552,
  1.568,
  1.584,
  1.6,
  1.616,
  1.632,
  1.648,
  1.664,
  1.68,
  1.696,
  1.712,
  1.728,
  1.744,
  1.76,
  1.776,
  1.792,
  1

In [10]:
# 4. Melody representation
representer = MelodyRepresenter()
rep = representer.represent(time=contour.time, f0_midi=contour.f0_midi, voiced=contour.voiced)
rep_dict = rep.to_dict()
rep_summary = {
    'notes': len(rep_dict['note_sequence']),
    'tempo_bpm': rep_dict['rhythm_profile'].get('estimated_tempo_bpm'),
}
rep_summary


{'notes': 1, 'tempo_bpm': 0}

In [11]:
# 5. Style selection
style_manager = StyleConfigManager()
available_styles = style_manager.list_styles()
STYLE_NAME = available_styles[0]  # pick first by default
style_config = style_manager.get_style(STYLE_NAME)
style_config.to_dict()


{'name': '8bit',
 'description': 'Retro chiptune with bright square leads and arpeggios.',
 'mood': 'playful',
 'tempo_bpm': 130,
 'instruments': ['square lead', 'noise snare', 'arp'],
 'model_configs': {'stub': {'model_name': 'stub',
   'prompt': 'A playful 8-bit chiptune with bright square leads and retro arpeggios.',
   'max_duration_sec': 20}}}

In [12]:
# 6. Music generation (stub sine generator)
generator = MusicGenerator()
gen_result = generator.generate(melody_representation=rep_dict, style_name=STYLE_NAME)
gen_result_dict = gen_result.to_dict()
gen_result_dict


{'audio_path': '/Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music/outputs/generated/gen_8bit_stub_20251130_204903.wav',
 'model_name': 'stub',
 'style_name': '8bit',
 'duration_sec': 4.008000000000003,
 'sample_rate': 16000,
 'generation_metadata': {'prompt': 'A playful 8-bit chiptune with bright square leads and retro arpeggios.',
  'seed': None,
  'timestamp': '20251130_204903'}}

In [13]:
# 7. Post-processing & export
postprocessor = Postprocessor()
post_result = postprocessor.process(gen_result.audio_path, style_name=STYLE_NAME, model_name=gen_result.model_name)
post_result_dict = post_result.to_dict()
post_result_dict


{'final_audio_path': '/Users/bemmgr/pythonProjects/academiaArchive/API_Assignment/humming2music/outputs/final/gen_8bit_stub_20251130_204903_final.wav',
 'final_audio_path_mp3': None,
 'duration_sec': 4.008,
 'sample_rate': 16000,
 'postprocessing_applied': ['normalize', 'fade_in', 'fade_out'],
 'style_name': '8bit',
 'model_name': 'stub'}

In [14]:
# 8. Similarity evaluation (original vs generated)
evaluator = SimilarityEvaluator()
sim_report = evaluator.evaluate(
    original_processed_audio=pre_meta.path,
    generated_audio=post_result.final_audio_path,
    style_name=STYLE_NAME,
    model_name=gen_result.model_name,
)
sim_report.to_dict()


{'style_name': '8bit',
 'model_name': 'stub',
 'pitch_similarity': 1.0,
 'rhythm_similarity': 1.0,
 'overall_similarity': 1.0,
 'metadata': {'dtw_cost': 'euclidean',
  'pitch_weight': 0.7,
  'rhythm_weight': 0.3}}