In [7]:
import json
import os
import pandas as pd
import librosa
import numpy as np
from IPython.display import Audio

In [8]:
# Dataset layout: <base_dir>/examples.json holds the metadata, <base_dir>/audio the wav files.
base_dir = "nsynth-train"
json_path = os.path.join(base_dir, "examples.json")
audio_dir = os.path.join(base_dir, "audio")

# The metadata file is a single JSON object mapping note id -> feature dict.
with open(json_path, "r") as f:
    data_dict = json.load(f)

# One record per note; each feature dict is tagged (in place) with its own key
# so the id survives as a regular column.
data = list(data_dict.values())
for note_str, features in zip(data_dict, data):
    features["note_str"] = note_str

df = pd.DataFrame(data)
# Path of the wav file that belongs to each note id.
df["audio_path"] = [os.path.join(audio_dir, name + ".wav") for name in df["note_str"]]

print("âœ… Loaded", len(df), "examples")


âœ… Loaded 289205 examples


In [9]:
# Distinct instrument families present in the metadata.
df["instrument_family_str"].unique()

array(['guitar', 'bass', 'organ', 'keyboard', 'vocal', 'string', 'reed',
       'flute', 'mallet', 'brass', 'synth_lead'], dtype=object)

In [10]:
# Number of notes per instrument family (descending), to see the class balance.
df["instrument_family_str"].value_counts()

instrument_family_str
bass          65474
keyboard      51821
organ         34477
mallet        34201
guitar        32690
string        19474
reed          13911
brass         12675
vocal         10208
flute          8773
synth_lead     5501
Name: count, dtype: int64

In [11]:
# Display the metadata DataFrame (one row per note, including the derived audio_path column).
df

Unnamed: 0,note,sample_rate,pitch,instrument_source,instrument_family_str,instrument_str,note_str,qualities_str,instrument_source_str,velocity,instrument_family,instrument,qualities,audio_path
0,16629,16000,82,0,guitar,guitar_acoustic_001,guitar_acoustic_001-082-050,[percussive],acoustic,50,3,39,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",nsynth-train/audio/guitar_acoustic_001-082-050...
1,168243,16000,108,2,bass,bass_synthetic_120,bass_synthetic_120-108-050,[percussive],synthetic,50,0,881,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",nsynth-train/audio/bass_synthetic_120-108-050.wav
2,88495,16000,50,1,organ,organ_electronic_120,organ_electronic_120-050-127,[],electronic,127,6,979,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",nsynth-train/audio/organ_electronic_120-050-12...
3,146382,16000,26,1,guitar,guitar_electronic_021,guitar_electronic_021-026-025,"[fast_decay, percussive]",electronic,25,3,346,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0]",nsynth-train/audio/guitar_electronic_021-026-0...
4,244275,16000,79,1,organ,organ_electronic_011,organ_electronic_011-079-075,[bright],electronic,75,6,188,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",nsynth-train/audio/organ_electronic_011-079-07...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289200,190815,16000,39,1,organ,organ_electronic_083,organ_electronic_083-039-100,[dark],electronic,100,6,808,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",nsynth-train/audio/organ_electronic_083-039-10...
289201,81228,16000,53,1,bass,bass_electronic_024,bass_electronic_024-053-025,[distortion],electronic,25,0,847,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",nsynth-train/audio/bass_electronic_024-053-025...
289202,20732,16000,58,0,string,string_acoustic_082,string_acoustic_082-058-100,[reverb],acoustic,100,8,699,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",nsynth-train/audio/string_acoustic_082-058-100...
289203,28856,16000,56,0,reed,reed_acoustic_020,reed_acoustic_020-056-100,[],acoustic,100,7,200,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",nsynth-train/audio/reed_acoustic_020-056-100.wav


In [12]:
# Keep only acoustic guitar and acoustic keyboard notes: the two domains to pair.
subset = df[df["instrument_family_str"].isin(["guitar", "keyboard"])].copy()
subset = subset[subset["instrument_source_str"].isin(["acoustic"])].copy()
# A guitar note and a keyboard note form a pair when all of these columns agree.
match_cols = ["pitch", "velocity", "sample_rate"]

# Source column -> suffix of the output column ("guitar_<suffix>" / "keyboard_<suffix>").
pair_fields = {
    "audio_path": "audio",
    "instrument_str": "inst",
    "note_str": "note",
    "instrument_source_str": "source",
    "qualities_str": "qualities",
}


def _family_side(family, order_col):
    """Return the `family` rows of `subset`, ready to be joined on `match_cols`.

    The per-note columns are renamed to "<family>_<suffix>" and `order_col`
    records each row's position so the joined result can be ordered
    deterministically.
    """
    side = subset.loc[
        subset["instrument_family_str"] == family, match_cols + list(pair_fields)
    ].rename(columns={src: f"{family}_{suffix}" for src, suffix in pair_fields.items()})
    side[order_col] = range(len(side))
    return side


# One vectorised inner join replaces the nested iterrows() loops: every guitar
# note is combined with every keyboard note sharing the same match_cols values.
# Sorting on (match_cols, guitar order, keyboard order) reproduces the order the
# loops produced (groups by ascending key, guitar-major within each group), so
# positional lookups such as pairs_df.iloc[i] select the same pair as before.
pairs_df = (
    _family_side("guitar", "_guitar_order")
    .merge(_family_side("keyboard", "_keyboard_order"), on=match_cols, how="inner")
    .sort_values(match_cols + ["_guitar_order", "_keyboard_order"])
    .drop(columns=["_guitar_order", "_keyboard_order"])
    .reset_index(drop=True)
)
# Same column layout as before: shared columns first, then guitar/keyboard interleaved.
pairs_df = pairs_df[
    match_cols
    + [
        f"{family}_{suffix}"
        for suffix in pair_fields.values()
        for family in ("guitar", "keyboard")
    ]
]

print(f"âœ… Created {len(pairs_df)} guitarâ€“keyboard pairs")
pairs_df.head()

âœ… Created 209750 guitarâ€“keyboard pairs


Unnamed: 0,pitch,velocity,sample_rate,guitar_audio,keyboard_audio,guitar_inst,keyboard_inst,guitar_note,keyboard_note,guitar_source,keyboard_source,guitar_qualities,keyboard_qualities
0,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_014-021-0...,guitar_acoustic_009,keyboard_acoustic_014,guitar_acoustic_009-021-025,keyboard_acoustic_014-021-025,acoustic,acoustic,"[dark, percussive]",[]
1,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_005-021-0...,guitar_acoustic_009,keyboard_acoustic_005,guitar_acoustic_009-021-025,keyboard_acoustic_005-021-025,acoustic,acoustic,"[dark, percussive]","[long_release, reverb]"
2,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_000-021-0...,guitar_acoustic_009,keyboard_acoustic_000,guitar_acoustic_009-021-025,keyboard_acoustic_000-021-025,acoustic,acoustic,"[dark, percussive]",[reverb]
3,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_002-021-0...,guitar_acoustic_009,keyboard_acoustic_002,guitar_acoustic_009-021-025,keyboard_acoustic_002-021-025,acoustic,acoustic,"[dark, percussive]","[dark, reverb]"
4,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_019-021-0...,guitar_acoustic_009,keyboard_acoustic_019,guitar_acoustic_009-021-025,keyboard_acoustic_019-021-025,acoustic,acoustic,"[dark, percussive]","[dark, long_release, reverb]"


In [13]:
# Display the guitar-keyboard pair table (one row per matched pair).
pairs_df

Unnamed: 0,pitch,velocity,sample_rate,guitar_audio,keyboard_audio,guitar_inst,keyboard_inst,guitar_note,keyboard_note,guitar_source,keyboard_source,guitar_qualities,keyboard_qualities
0,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_014-021-0...,guitar_acoustic_009,keyboard_acoustic_014,guitar_acoustic_009-021-025,keyboard_acoustic_014-021-025,acoustic,acoustic,"[dark, percussive]",[]
1,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_005-021-0...,guitar_acoustic_009,keyboard_acoustic_005,guitar_acoustic_009-021-025,keyboard_acoustic_005-021-025,acoustic,acoustic,"[dark, percussive]","[long_release, reverb]"
2,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_000-021-0...,guitar_acoustic_009,keyboard_acoustic_000,guitar_acoustic_009-021-025,keyboard_acoustic_000-021-025,acoustic,acoustic,"[dark, percussive]",[reverb]
3,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_002-021-0...,guitar_acoustic_009,keyboard_acoustic_002,guitar_acoustic_009-021-025,keyboard_acoustic_002-021-025,acoustic,acoustic,"[dark, percussive]","[dark, reverb]"
4,21,25,16000,nsynth-train/audio/guitar_acoustic_009-021-025...,nsynth-train/audio/keyboard_acoustic_019-021-0...,guitar_acoustic_009,keyboard_acoustic_019,guitar_acoustic_009-021-025,keyboard_acoustic_019-021-025,acoustic,acoustic,"[dark, percussive]","[dark, long_release, reverb]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209745,108,127,16000,nsynth-train/audio/guitar_acoustic_007-108-127...,nsynth-train/audio/keyboard_acoustic_006-108-1...,guitar_acoustic_007,keyboard_acoustic_006,guitar_acoustic_007-108-127,keyboard_acoustic_006-108-127,acoustic,acoustic,"[fast_decay, percussive]",[reverb]
209746,108,127,16000,nsynth-train/audio/guitar_acoustic_007-108-127...,nsynth-train/audio/keyboard_acoustic_007-108-1...,guitar_acoustic_007,keyboard_acoustic_007,guitar_acoustic_007-108-127,keyboard_acoustic_007-108-127,acoustic,acoustic,"[fast_decay, percussive]","[fast_decay, percussive]"
209747,108,127,16000,nsynth-train/audio/guitar_acoustic_007-108-127...,nsynth-train/audio/keyboard_acoustic_014-108-1...,guitar_acoustic_007,keyboard_acoustic_014,guitar_acoustic_007-108-127,keyboard_acoustic_014-108-127,acoustic,acoustic,"[fast_decay, percussive]",[percussive]
209748,108,127,16000,nsynth-train/audio/guitar_acoustic_007-108-127...,nsynth-train/audio/keyboard_acoustic_001-108-1...,guitar_acoustic_007,keyboard_acoustic_001,guitar_acoustic_007-108-127,keyboard_acoustic_001-108-127,acoustic,acoustic,"[fast_decay, percussive]","[fast_decay, percussive]"


In [14]:
# Listen to one pair: select a row by position, then play its guitar recording.
sample = pairs_df.iloc[150000]
guitar_path = sample["guitar_audio"]

print("ðŸŽ¸ Guitar:", guitar_path)
Audio(filename=guitar_path, rate=16000)


ðŸŽ¸ Guitar: nsynth-train/audio/guitar_acoustic_018-080-025.wav


In [15]:
# Play the keyboard recording of the pair held in `sample`.
keyboard_path = sample["keyboard_audio"]
print("ðŸŽ¹ Keyboard:", keyboard_path)
Audio(filename=keyboard_path, rate=16000)


ðŸŽ¹ Keyboard: nsynth-train/audio/keyboard_acoustic_017-080-025.wav


In [16]:
# Path of the Python interpreter that is running this kernel.
import sys
sys.executable

'/projectnb/ec523bn/projects/music_project/project/projenv/bin/python'

In [22]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn
import librosa

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader



class GuitarKeyboardDataset(Dataset):
    """Paired guitar/keyboard clips served as dB-scaled mel spectrogram tensors.

    Each item is a ``(guitar, keyboard)`` tuple; both tensors carry a leading
    channel axis of size 1 so they can be fed straight into 2-D convolutions.
    """

    def __init__(self, df, sr=16000, n_mels=128, hop_length=256, duration=2.0):
        """
        df: DataFrame with columns ['guitar_audio', 'keyboard_audio'] (file paths)
        sr: sample rate the audio is loaded at
        n_mels: number of mel bins
        hop_length: hop length of the mel spectrogram
        duration: clip length in seconds; every clip is cut or zero-padded to it
        """
        self.df = df.reset_index(drop=True)
        self.sr = sr
        self.n_mels = n_mels
        self.hop_length = hop_length
        # Every waveform is forced to exactly this many samples.
        self.num_samples = int(duration * sr)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Guitar first, keyboard second; unsqueeze(0) adds the channel axis.
        g_mel, k_mel = (
            torch.tensor(self.audio_to_mel(row[column])).unsqueeze(0)
            for column in ("guitar_audio", "keyboard_audio")
        )
        return g_mel, k_mel

    def audio_to_mel(self, path):
        """Load `path` and return its fixed-length mel spectrogram in dB as float32."""
        y, _ = librosa.load(path, sr=self.sr)

        # Positive shortfall -> zero-pad the tail; negative -> drop the excess samples.
        shortfall = self.num_samples - len(y)
        if shortfall < 0:
            y = y[:self.num_samples]
        elif shortfall > 0:
            y = np.pad(y, (0, shortfall))

        mel = librosa.feature.melspectrogram(
            y=y,
            sr=self.sr,
            n_mels=self.n_mels,
            hop_length=self.hop_length,
        )
        # ref=np.max: dB values are relative to this clip's own peak.
        mel_db = librosa.power_to_db(mel, ref=np.max)
        return mel_db.astype(np.float32)


In [29]:
# Wrap the pair table in the dataset (2-second clips) and batch it, reshuffled every epoch.
dataset = GuitarKeyboardDataset(pairs_df, duration=2.0)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Smoke test: fetch a single batch and report the tensor shapes.
for g, k in loader:
    for label, batch in (("Guitar batch:", g), ("Keyboard batch:", k)):
        print(label, batch.shape)
    break


Guitar batch: torch.Size([8, 1, 128, 126])
Keyboard batch: torch.Size([8, 1, 128, 126])


In [None]:
class RectifiedFlowModel(nn.Module):
    """Small CNN that predicts a velocity field for a spectrogram at flow time ``t``.

    The flow time is supplied to the network as a second input channel holding
    the value of ``t`` at every position, so the prediction can vary along the
    flow path instead of ignoring ``t``.
    """

    def __init__(self, n_mels=128):
        # n_mels is kept for interface compatibility; the network is fully
        # convolutional and does not depend on the number of mel bins.
        super().__init__()
        self.conv = nn.Sequential(
            # 2 input channels: the spectrogram plus the constant time plane.
            nn.Conv2d(2, 16, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 1, 3, padding=1),
        )

    def forward(self, x, t=None):
        """Return the predicted velocity, same shape as ``x``.

        x: (batch, 1, n_mels, time)
        t: flow time. Either a scalar (shared by the whole batch) or one value
           per sample in any shape with ``batch`` elements, e.g. (batch,),
           (batch, 1, 1) or (batch, 1, 1, 1). ``None`` is treated as t = 0.
        """
        batch, _, height, width = x.shape
        if t is None:
            t = x.new_zeros(1)
        # Normalise every accepted layout to (1 or batch, 1, 1, 1) ...
        t = torch.as_tensor(t, dtype=x.dtype, device=x.device).reshape(-1, 1, 1, 1)
        # ... and broadcast it to a full plane matching the spectrogram.
        t_plane = t.expand(batch, 1, height, width)
        return self.conv(torch.cat([x, t_plane], dim=1))

# Instantiate model and optimizer
# The device must be chosen before the model is moved onto it (it used to be
# assigned after its first use, which fails on a fresh kernel). Fall back to
# the CPU when CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instantiate model and optimizer
model = RectifiedFlowModel(n_mels=128).to(device)
opt = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Training loop
epochs = 10

for epoch in range(epochs):
    running_loss = 0.0
    for g, k in loader:  # g: guitar batch, k: keyboard batch
        g = g.to(device)
        k = k.to(device)

        # Rectified-flow training pair: one random time per sample, a point on
        # the straight line from guitar (t=0) to keyboard (t=1), and the
        # constant velocity along that line as the regression target.
        t = torch.rand(g.size(0), 1, 1, 1, device=device)
        x_t = (1 - t) * g + t * k
        target_v = (k - g)

        # Forward; t is handed over as (batch, 1, 1).
        v_pred = model(x_t, t.squeeze(1))

        # Safety net: if the model changed the time-axis length, compare only
        # the overlapping frames.
        if v_pred.shape != target_v.shape:
            min_time = min(v_pred.shape[-1], target_v.shape[-1])
            v_pred = v_pred[..., :min_time]
            target_v = target_v[..., :min_time]

        loss = loss_fn(v_pred, target_v)

        # Backprop
        opt.zero_grad()
        loss.backward()
        opt.step()

        running_loss += loss.item()

    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(loader):.6f}")


Epoch 1/10, Loss: 337.858746


In [None]:
def translate_guitar_to_keyboard(model, g, steps=32):
    """Integrate the model's velocity field from t=0 to t=1 with forward Euler.

    model: callable as ``model(x, t)`` returning a velocity shaped like ``x``.
    g: 2-D tensor (n_mels, time) holding the guitar spectrogram.
    steps: number of Euler steps; must be at least 1.

    Returns the transported spectrogram as a 2-D numpy array shaped like ``g``.
    Raises ValueError if ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError("steps must be a positive integer")

    # Run on the device the model lives on rather than on a notebook global;
    # a model without parameters falls back to the input's device.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else g.device

    # (n_mels, time) -> (1, 1, n_mels, time)
    x = g.unsqueeze(0).unsqueeze(1).to(run_device)
    dt = 1 / steps
    # Left endpoints of the `steps` sub-intervals of [0, 1]: 0, dt, ..., 1 - dt.
    # The grid spacing equals dt, so the steps cover the unit interval exactly
    # and the field is never evaluated at t = 1.
    t_vals = torch.arange(steps, device=run_device) * dt
    with torch.no_grad():
        for t in t_vals:
            v = model(x, t)
            x = x + v * dt  # Euler integration
    # Drop only the batch and channel axes that were added on entry.
    return x[0, 0].cpu().numpy()


In [None]:
# Mel spectrogram (dB) of the guitar clip selected in `sample`, computed with the
# same settings the training data used. `g_mel` was previously never defined at
# notebook scope.
g_mel = dataset.audio_to_mel(sample["guitar_audio"])
pred_mel = translate_guitar_to_keyboard(model, torch.tensor(g_mel))
# Invert with the analysis parameters; librosa's defaults (sr=22050,
# hop_length=n_fft // 4) do not match the 16 kHz / hop-256 spectrograms.
# NOTE: the dB values are relative to each clip's peak (ref=np.max), so the
# absolute level of the reconstructed audio is not meaningful.
pred_audio = librosa.feature.inverse.mel_to_audio(
    librosa.db_to_power(pred_mel),
    sr=dataset.sr,
    hop_length=dataset.hop_length,
)
Audio(pred_audio, rate=dataset.sr)
