# humming2music Demo Pipeline
Linear notebook that stitches modules 01-09 together.

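Run the cells from top to bottom, adjusting the variables in each cell as needed. Dependencies: pydub, librosa, numpy, soundfile, matplotlib, pandas, pretty_midi, sounddevice (optional, for recording), and ffmpeg (used by pydub for compressed formats such as mp3).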


In [None]:
# Environment bootstrap: report which interpreter backs this kernel, then
# install the audio dependencies into that same interpreter.
import platform
import subprocess
import sys

print("Python exe:", sys.executable)
print("Python version:", platform.python_version())

print("Using interpreter:", sys.executable)

# Running pip as a module of sys.executable installs into the interpreter
# this kernel is using.
REQUIRED_PACKAGES = ["librosa", "pydub", "soundfile", "sounddevice"]
pip_command = [sys.executable, "-m", "pip", "install", *REQUIRED_PACKAGES]
subprocess.check_call(pip_command)

In [2]:
# Report the installed versions of the two core audio libraries.
from importlib import metadata as md

for package_name in ("librosa", "pydub"):
    print(f"{package_name}:", md.version(package_name))


librosa: 0.11.0
pydub: 0.25.1


In [3]:
# 0. Setup paths and imports
from pathlib import Path
import sys
# librosa/pydub are not referenced again in this cell; presumably imported
# here so a missing audio stack fails early -- TODO confirm.
import librosa, pydub

print("Python exe:", sys.executable)

# The project root is the parent of the current working directory, so the
# kernel must be started from a direct sub-directory of the project.
PROJECT_ROOT = Path('..').resolve()

# Make `src` importable. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)

from src.config import (
    GLOBAL_AUDIO_CONFIG,
    DEFAULT_PREPROCESSING_CONFIG,
    DEFAULT_MELODY_EXTRACTION_CONFIG,
    DEFAULT_MELODY_REPRESENTATION_CONFIG,
    GLOBAL_STYLE_CONFIG,
    DEFAULT_POSTPROCESSING_CONFIG,
    DEFAULT_SIMILARITY_CONFIG,
    RAW_AUDIO_DIR, PROCESSED_AUDIO_DIR, GENERATED_AUDIO_DIR, POSTPROCESSED_AUDIO_DIR, EVAL_OUTPUT_DIR,
)

from src.audio_input import AudioInputManager
from src.preprocessing import Preprocessor
from src.melody_extraction import MelodyExtractor
from src.melody_representation import MelodyRepresenter
from src.style_and_model_config import StyleConfigManager
from src.music_generation import MusicGenerator
from src.postprocessing_export import Postprocessor
from src.similarity_evaluation import SimilarityEvaluator

# Create every data/output directory up front so later stages can write
# their files without checking for the directory first.
for d in [RAW_AUDIO_DIR, PROCESSED_AUDIO_DIR, GENERATED_AUDIO_DIR, POSTPROCESSED_AUDIO_DIR, EVAL_OUTPUT_DIR]:
    Path(d).mkdir(parents=True, exist_ok=True)

print('Project root:', PROJECT_ROOT)
print('Data/raw:', RAW_AUDIO_DIR)
print('Outputs/generated:', GENERATED_AUDIO_DIR)

Python exe: /opt/homebrew/Caskroom/miniconda/base/envs/music_gen/bin/python


scikit-learn version 1.7.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.5.1. Disabling scikit-learn conversion API.
Torch version 2.9.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.7.0 is the most recent version that has been tested.
  warn(
  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


Project root: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music
Data/raw: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/raw
Outputs/generated: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/outputs/generated


In [None]:
# 1. Audio input (upload or record)
# Only the upload path is exercised in this cell: an existing file under
# RAW_AUDIO_DIR is handed to the input manager. No recording call is made.
from pathlib import Path

# Passed to ingest_upload() as session_id.
SESSION_ID = "demo_melody"

# Use test audio file
UPLOAD_PATH = RAW_AUDIO_DIR / "Lraw.m4a"

audio_manager = AudioInputManager()
# audio_meta.path is the input to the preprocessing cell.
audio_meta = audio_manager.ingest_upload(UPLOAD_PATH, session_id=SESSION_ID)

# Last expression: rendered as the cell output.
audio_meta_dict = audio_meta.to_dict()
audio_meta_dict

In [5]:
# 2. Preprocessing
# Runs the preprocessor on the ingested file. The returned metadata's .path
# is the audio that melody extraction, generation and similarity evaluation
# all read in later cells.
preprocessor = Preprocessor()
pre_meta = preprocessor.preprocess(audio_meta.path)
pre_meta_dict = pre_meta.to_dict()
# Last expression: rendered as the cell output.
pre_meta_dict


{'path': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav',
 'original_duration_sec': 19.722,
 'processed_duration_sec': 18.415,
 'sample_rate': 16000,
 'applied_steps': ['trim_silence', 'highpass', 'normalize'],
 'notes': ''}

In [6]:
# 3. Melody extraction
# Extracts the melody from the preprocessed audio. The result exposes a MIDI
# file path (contour.midi_path) that the representation and plotting cells
# read later.
extractor = MelodyExtractor()
contour = extractor.extract(pre_meta.path)
contour_dict = contour.to_dict()
print(f"Generated MIDI: {contour_dict['midi_path']}")
# Last expression: the extractor metadata is rendered as the cell output.
contour_dict['metadata']



Predicting MIDI for /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav...
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32


  Creating midi...
  💅 Saved to /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine_basic_pitch.mid
Generated MIDI: /Users/xijiecao/Desktop/course/s3

{'extractor': 'basic_pitch (onnx)',
 'source_audio': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav'}

In [7]:
# 4. Melody representation
# Turn the extracted MIDI into the structured representation, then show a
# two-field summary: number of notes and the estimated tempo.
representer = MelodyRepresenter()
rep = representer.represent(midi_path=contour.midi_path)
rep_dict = rep.to_dict()

note_count = len(rep_dict['note_sequence'])
estimated_tempo = rep_dict['rhythm_profile'].get('estimated_tempo_bpm')
rep_summary = dict(notes=note_count, tempo_bpm=estimated_tempo)
rep_summary


{'notes': 52, 'tempo_bpm': 183.91386953768213}

In [8]:
# 5. Style selection
# STYLE_NAME and style_config drive generation, post-processing and
# similarity evaluation in the following cells.
style_manager = StyleConfigManager()
available_styles = style_manager.list_styles()
STYLE_NAME = available_styles[1]  # index 1 = the SECOND listed style ('ambient' in the recorded run); needs at least two styles
style_config = style_manager.get_style(STYLE_NAME)
style_config.to_dict()


{'name': 'ambient',
 'description': 'Ethereal ambient soundscape with pads and evolving textures.',
 'mood': 'calm',
 'tempo_bpm': 70,
 'instruments': ['pads', 'drones', 'textures'],
 'model_configs': {'stub': {'model_name': 'stub',
   'prompt': 'A calm ambient soundscape with airy pads and evolving textures.',
   'max_duration_sec': 30}}}

In [9]:
# 6. Music generation

generator = MusicGenerator(model_size='melody', device='cpu')

# Work with a plain mapping whether or not the style object offers to_dict().
style_data = style_config.to_dict() if hasattr(style_config, 'to_dict') else style_config

# The style's description is the text prompt; a generic sentence is the
# fallback when the style carries no description.
prompt = style_data.get('description', f"A song in {STYLE_NAME} style")
print(f"Generating with prompt: {prompt}")

generation_kwargs = dict(
    melody_representation=rep_dict,
    melody_audio_path=pre_meta.path,
    style_name=STYLE_NAME,
    prompt_text=prompt,
    duration_sec=10,
)
gen_result = generator.generate(**generation_kwargs)

gen_result.to_dict()

You are using a model of type musicgen_melody to instantiate a model of type musicgen. This is not supported for all configurations of models and can yield errors.
  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.58s/it]
Some weights of MusicgenForConditionalGeneration were not initialized from the model checkpoint at facebook/musicgen-melody and are newly initialized: ['decoder.model.decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.model.decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.model.decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.model.decoder.layers.1.encoder_attn.k_proj.weight', 'deco

Generating with prompt: Ethereal ambient soundscape with pads and evolving textures.




{'model_name': 'musicgen-transformers',
 'style_name': 'ambient',
 'audio_path': 'outputs/generated/gen_ambient_20251201_232336.wav',
 'duration_sec': 10,
 'generation_metadata': {'prompt': 'Ethereal ambient soundscape with pads and evolving textures.'}}

In [10]:
# 7. Post-processing & export
# Post-processes the generated clip. post_result.final_audio_path is the file
# that the similarity evaluation and the waveform plots load afterwards.
postprocessor = Postprocessor()
post_result = postprocessor.process(gen_result.audio_path, style_name=STYLE_NAME, model_name=gen_result.model_name)
post_result_dict = post_result.to_dict()
# Last expression: rendered as the cell output.
post_result_dict


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'final_audio_path': '/Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/outputs/final/gen_ambient_20251201_232336_final.wav',
 'final_audio_path_mp3': None,
 'duration_sec': 29.22,
 'sample_rate': 16000,
 'postprocessing_applied': ['normalize', 'fade_in', 'fade_out'],
 'style_name': 'ambient',
 'model_name': 'musicgen-transformers'}

In [11]:
# 8. Similarity evaluation (original vs generated)

evaluator = SimilarityEvaluator()

# Reference = preprocessed input audio; candidate = post-processed generation.
reference_audio = pre_meta.path
candidate_audio = post_result.final_audio_path

print(f"Comparing Original: {reference_audio}")
print(f"     vs Generated: {candidate_audio}")

evaluation_kwargs = dict(
    original_processed_audio=reference_audio,
    generated_audio=candidate_audio,
    style_name=STYLE_NAME,
    model_name=gen_result.model_name,
)
sim_report = evaluator.evaluate(**evaluation_kwargs)

sim_report.to_dict()


Comparing Original: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/data/processed/processed_input_20251201_231802_sessiondemo_sine.wav
     vs Generated: /Users/xijiecao/Desktop/course/s3/API/API_Fin-main/humming2music/outputs/final/gen_ambient_20251201_232336_final.wav


{'style_name': 'ambient',
 'model_name': 'musicgen-transformers',
 'pitch_similarity': 0.8784857799686372,
 'rhythm_similarity': 0.8784857799686372,
 'overall_similarity': 0.8784857799686372,
 'metadata': {'method': 'Audio Chroma DTW (Cosine)'}}

# Visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pretty_midi

# Figures are written under <project>/latex/figures; create it if missing.
FIGURE_DIR = PROJECT_ROOT / "latex" / "figures"
FIGURE_DIR.mkdir(parents=True, exist_ok=True)

# Read the extracted MIDI and collect (start, end, pitch) for every note of
# every non-drum instrument, ordered by start time.
pm = pretty_midi.PrettyMIDI(contour.midi_path)
all_notes = sorted(
    (
        (midi_note.start, midi_note.end, midi_note.pitch)
        for track in pm.instruments
        if not track.is_drum
        for midi_note in track.notes
    ),
    key=lambda triple: triple[0],
)

# Each note contributes two samples (its start and its end) at the same
# pitch, so the even indices of the arrays are the note onsets.
if all_notes:
    time_arr = np.array([t for onset, offset, _ in all_notes for t in (onset, offset)])
    f0_midi_arr = np.repeat([triple[2] for triple in all_notes], 2)
else:
    # No pitched notes found: fall back to a single placeholder point.
    time_arr = np.array([0])
    f0_midi_arr = np.array([60])

# Draw the contour as a line through all samples plus a marker per onset.
fig, ax = plt.subplots(figsize=(10, 4))

onset_times = time_arr[::2]
onset_pitches = f0_midi_arr[::2]
ax.plot(time_arr, f0_midi_arr, 'b-', linewidth=2, label='Detected Notes')
ax.scatter(onset_times, onset_pitches, c='blue', s=20, alpha=0.7, label='Note Onsets')

ax.set_title('Extracted Pitch Contour from Input Audio (via Basic Pitch MIDI)', fontsize=14)
ax.set_xlabel('Time (s)', fontsize=12)
ax.set_ylabel('MIDI Note Number', fontsize=12)
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)
if time_arr.size > 0:
    ax.set_xlim([0, max(time_arr) + 0.5])

plt.tight_layout()

# Save a vector and a raster copy of the same figure.
pitch_contour_pdf = FIGURE_DIR / 'pitch_contour.pdf'
pitch_contour_png = FIGURE_DIR / 'pitch_contour.png'
plt.savefig(pitch_contour_pdf, dpi=300, bbox_inches='tight')
plt.savefig(pitch_contour_png, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved to {pitch_contour_pdf}")

In [None]:
import librosa.display


def _draw_waveform(ax, samples, sample_rate, color, title):
    """Plot one waveform against time in seconds and return the time axis.

    The x-axis is limited to [0, last sample time]; only the y-label, title
    and grid are set here, so the caller decides whether to add an x-label.
    """
    times = np.arange(len(samples)) / sample_rate
    ax.plot(times, samples, color=color, linewidth=0.5)
    ax.set_ylabel('Amplitude', fontsize=11)
    ax.set_title(title, fontsize=12)
    ax.set_xlim([0, max(times)])
    ax.grid(True, alpha=0.3)
    return times


# Load both clips with sr=None (no target sample rate requested).
y_input, sr_input = librosa.load(pre_meta.path, sr=None)
y_output, sr_output = librosa.load(post_result.final_audio_path, sr=None)

# Two stacked panels with independent x-axes, since the clips differ in length.
fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=False)

# Input waveform
time_input = _draw_waveform(
    axes[0], y_input, sr_input,
    color='steelblue',
    title='(a) Preprocessed Input Audio Waveform',
)

# Output waveform (only the bottom panel carries the x-axis label)
time_output = _draw_waveform(
    axes[1], y_output, sr_output,
    color='darkorange',
    title=f'(b) Generated Output Audio Waveform (Style: {post_result.style_name})',
)
axes[1].set_xlabel('Time (s)', fontsize=11)

plt.tight_layout()

waveform_pdf = FIGURE_DIR / 'waveform_comparison.pdf'
waveform_png = FIGURE_DIR / 'waveform_comparison.png'
plt.savefig(waveform_pdf, dpi=300, bbox_inches='tight')
plt.savefig(waveform_png, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved to {waveform_pdf}")

In [None]:
# Batch run: for every available style, generate a clip, post-process it and
# score it against the preprocessed input. Results accumulate in
# `similarity_results` (one dict per style) for the table cell below.
all_styles = style_manager.list_styles()
print(f"Available styles: {all_styles}")

# Same clip length as the single-style generation in step 6, so the
# per-style scores are produced under the same conditions as that run.
BATCH_DURATION_SEC = 10

similarity_results = []

for style_name in all_styles:
    print(f"\n{'='*50}")
    print(f"Processing style: {style_name}")

    # Build the text prompt exactly as step 6 does: the style's description,
    # with a generic sentence as fallback.
    style_cfg = style_manager.get_style(style_name)
    style_info = style_cfg.to_dict() if hasattr(style_cfg, 'to_dict') else style_cfg
    style_prompt = style_info.get('description', f"A song in {style_name} style")

    # Generate with the same arguments as step 6. The original batch call
    # passed only melody_representation and style_name, leaving the melody
    # audio, prompt and duration to generate()'s defaults.
    # NOTE(review): those defaults are not visible in this notebook -- confirm
    # that aligning with step 6 is the intended evaluation setup.
    gen_res = generator.generate(
        melody_representation=rep_dict,
        melody_audio_path=pre_meta.path,
        style_name=style_name,
        prompt_text=style_prompt,
        duration_sec=BATCH_DURATION_SEC,
    )
    print(f"  Generated: {gen_res.audio_path}")

    # Post-process
    post_res = postprocessor.process(gen_res.audio_path, style_name=style_name, model_name=gen_res.model_name)
    print(f"  Post-processed: {post_res.final_audio_path}")

    # Evaluate similarity against the preprocessed input audio
    sim_rep = evaluator.evaluate(
        original_processed_audio=pre_meta.path,
        generated_audio=post_res.final_audio_path,
        style_name=style_name,
        model_name=gen_res.model_name,
    )

    similarity_results.append({
        'style': style_name,
        'pitch_sim': sim_rep.pitch_similarity,
        'rhythm_sim': sim_rep.rhythm_similarity,
        'overall': sim_rep.overall_similarity,
    })
    print(f"  Pitch Similarity: {sim_rep.pitch_similarity:.3f}")
    print(f"  Rhythm Similarity: {sim_rep.rhythm_similarity:.3f}")
    print(f"  Overall Similarity: {sim_rep.overall_similarity:.3f}")

print(f"\n{'='*50}")
print("All styles processed!")

In [None]:
import pandas as pd

# Record key -> display header. Selecting and renaming by key (rather than
# assigning df.columns positionally) does not depend on dict key order and
# yields a well-formed empty table when `similarity_results` is empty,
# where the positional assignment raised a length-mismatch ValueError.
COLUMN_LABELS = {
    'style': 'Style',
    'pitch_sim': 'Pitch Similarity',
    'rhythm_sim': 'Rhythm Similarity',
    'overall': 'Overall Score',
}

# Create DataFrame
df = (
    pd.DataFrame(similarity_results, columns=list(COLUMN_LABELS))
    .rename(columns=COLUMN_LABELS)
)

# Capitalize style names for display
df['Style'] = df['Style'].str.capitalize()

print("=" * 60)
print("SIMILARITY EVALUATION RESULTS")
print("=" * 60)
print(df.to_string(index=False))

# Generate LaTeX table code
print("\n" + "=" * 60)
print("LATEX TABLE CODE (copy this to main.tex)")
print("=" * 60)

latex_rows = []
for record in df.to_dict('records'):
    # '8bit' is unchanged by capitalize() (it starts with a digit), so this
    # comparison still matches; it is typeset as '8-bit' in the table.
    style = '8-bit' if record['Style'] == '8bit' else record['Style']
    latex_rows.append(
        f"{style} & {record['Pitch Similarity']:.2f} & {record['Rhythm Similarity']:.2f} & {record['Overall Score']:.2f} \\\\"
    )

# NOTE(review): the caption says "stub generator baseline", but the recorded
# outputs of this notebook report model_name 'musicgen-transformers' --
# confirm which generator produced the numbers before pasting into main.tex.
print("""\\begin{table}[H]
\\centering
\\caption{Similarity evaluation results across musical styles (stub generator baseline)}
\\label{tab:similarity}
\\begin{tabular}{lccc}
\\toprule
Style & Pitch Similarity & Rhythm Similarity & Overall Score \\\\
\\midrule""")
for row in latex_rows:
    print(row)
print("""\\bottomrule
\\end{tabular}
\\end{table}""")

In [None]:
# Four-panel overview of the pipeline: (a) input waveform, (b) MIDI pitch
# contour, (c) note segmentation, (d) generated output waveform.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (a) Input waveform
ax = axes[0, 0]
y_in, sr_in = librosa.load(pre_meta.path, sr=None)
t_in = np.arange(len(y_in)) / sr_in
ax.plot(t_in, y_in, color='steelblue', linewidth=0.5)
ax.set(xlabel='Time (s)', ylabel='Amplitude', title='(a) Preprocessed Input Waveform')
ax.set_xlim([0, max(t_in)])
ax.grid(True, alpha=0.3)

# (b) Pitch contour from MIDI: one horizontal segment per note plus a marker
# at its onset. Drum instruments are skipped.
ax = axes[0, 1]
pm_viz = pretty_midi.PrettyMIDI(contour.midi_path)
viz_notes = sorted(
    (
        (midi_note.start, midi_note.end, midi_note.pitch)
        for track in pm_viz.instruments
        if not track.is_drum
        for midi_note in track.notes
    ),
    key=lambda triple: triple[0],
)

for onset, offset, midi_pitch in viz_notes:
    ax.plot([onset, offset], [midi_pitch, midi_pitch], 'b-', linewidth=3, alpha=0.7)
    ax.scatter([onset], [midi_pitch], c='blue', s=30, zorder=5)
# With no notes, fall back to a 1-second axis so the empty panel still renders.
max_time = max(triple[1] for triple in viz_notes) if viz_notes else 1.0
ax.set(xlabel='Time (s)', ylabel='MIDI Note Number', title='(b) Extracted Pitch Contour (from MIDI)')
ax.set_xlim([0, max_time + 0.5])
ax.grid(True, alpha=0.3)

# (c) Note segmentation: one horizontal bar per note of the representation,
# positioned at its start time and as wide as its duration.
ax = axes[1, 0]
notes = rep.note_sequence
colors = plt.cm.Set2(np.linspace(0, 1, len(notes) if notes else 1))
for idx, seg in enumerate(notes):
    bar_color = colors[idx % len(colors)]
    ax.barh(seg.pitch_midi, seg.duration, left=seg.start,
            height=0.8, color=bar_color, edgecolor='black', linewidth=0.5)
ax.set(xlabel='Time (s)', ylabel='MIDI Note Number', title=f'(c) Note Segmentation ({len(notes)} notes)')
if notes:
    last_note_end = max(seg.start + seg.duration for seg in notes)
    ax.set_xlim([0, last_note_end + 0.5])
ax.grid(True, alpha=0.3, axis='x')

# (d) Generated output waveform
ax = axes[1, 1]
y_out, sr_out = librosa.load(post_result.final_audio_path, sr=None)
t_out = np.arange(len(y_out)) / sr_out
ax.plot(t_out, y_out, color='darkorange', linewidth=0.5)
ax.set(
    xlabel='Time (s)',
    ylabel='Amplitude',
    title=f'(d) Generated Output Waveform (Style: {post_result.style_name})',
)
ax.set_xlim([0, max(t_out)])
ax.grid(True, alpha=0.3)

plt.tight_layout()

# Save a vector and a raster copy of the same figure.
pipeline_pdf = FIGURE_DIR / 'pipeline_demo.pdf'
pipeline_png = FIGURE_DIR / 'pipeline_demo.png'
plt.savefig(pipeline_pdf, dpi=300, bbox_inches='tight')
plt.savefig(pipeline_png, dpi=300, bbox_inches='tight')
plt.show()
print(f"Saved to {pipeline_pdf}")