In [None]:
import librosa
import librosa.display
import numpy as np
import json
import os
import tensorflow as tf
import matplotlib.pyplot as plt

# Load the dataset metadata: a JSON object keyed by note id, one attribute dict per note.
# JSON is UTF-8 by specification; the encoding is passed explicitly so the read
# does not depend on the Windows locale code page.
with open("C:/Users/M S I/Desktop/Autopitch/dataset/nsynth-train/examples.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Load an audio file and convert it to a normalized mel-spectrogram
def load_audio(file_path):
    """Load an audio file and return its min-max normalized log-mel spectrogram.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        tuple: ``(mel_spec_db, sr)``. ``mel_spec_db`` is a float32 array with
        128 mel bands on the first axis and values scaled to [0, 1]; ``sr`` is
        the sample rate the audio was resampled to (16000).
    """
    y, sr = librosa.load(file_path, sr=16000)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Min-max normalization to [0, 1].
    spec_min = np.min(mel_spec_db)
    spec_range = np.max(mel_spec_db) - spec_min
    if spec_range > 0:
        mel_spec_db = (mel_spec_db - spec_min) / spec_range
    else:
        # A constant spectrogram (e.g. a silent clip) has zero range; dividing
        # would produce 0/0 = NaN and poison training, so return zeros instead.
        mel_spec_db = np.zeros_like(mel_spec_db)

    # Cast explicitly so the array always matches a float32 tensor signature.
    return mel_spec_db.astype(np.float32), sr

# Load one sample clip
file_path = "C:/Users/M S I/Desktop/Autopitch/dataset/nsynth-train/audio/vocal_synthetic_015-095-075.wav"
mel_spec, sr = load_audio(file_path)

# Display the mel-spectrogram. load_audio returns values min-max scaled to
# [0, 1], so the colorbar is labelled as a normalized level rather than dB.
plt.figure(figsize=(10, 4))
librosa.display.specshow(mel_spec, sr=sr, x_axis="time", y_axis="mel")
plt.colorbar(label="Normalized level (0-1)")
plt.title("Mel Spectrogram")
plt.show()


In [None]:
# Generator that feeds the dataset: one (mel-spectrogram, pitch) pair per metadata entry
def data_generator():
    """Yield ``(mel_spec, pitch)`` for every entry in the global ``metadata``.

    Yields:
        tuple: the normalized mel-spectrogram returned by ``load_audio`` for the
        entry's audio clip, and the entry's ``"pitch"`` value.
    """
    audio_dir = "C:/Users/M S I/Desktop/Autopitch/dataset/nsynth-train/audio"
    for key, entry in metadata.items():
        # NSynth's examples.json carries no "audio_path" field; each clip is
        # stored as "<note id>.wav", where the note id is the metadata key.
        # An explicit "audio_path" is still honoured if the entry provides one.
        file_name = entry.get("audio_path", f"{key}.wav")
        file_path = os.path.join(audio_dir, file_name)
        mel_spec, _ = load_audio(file_path)
        yield mel_spec, entry["pitch"]

# Wrap the generator in a TensorFlow Dataset.
# Each element is a (128, time) float32 spectrogram and a scalar int32 pitch label.
dataset = tf.data.Dataset.from_generator(
    data_generator,
    output_signature=(
        tf.TensorSpec(shape=(128, None), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
)

# Conv2D layers need a channel axis: turn each (128, time) spectrogram into
# (128, time, 1) so that batches are 4-D (batch, 128, time, 1).
dataset = dataset.map(
    lambda mel, pitch: (tf.expand_dims(mel, axis=-1), pitch),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Shuffle, group into batches, and prefetch so input loading overlaps training.
batch_size = 32
dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [None]:
from tensorflow import keras
from keras import layers

# Input shape is (mel bands, time, channels); the time axis is left unspecified
# so spectrograms of any length are accepted.
input_shape = (128, None, 1)

# Labels are used directly as class indices by sparse categorical cross-entropy.
# NSynth pitches are MIDI note numbers (the sample clip in this notebook is
# pitch 95), so the output layer needs one unit per MIDI note (0-127); an
# 88-unit layer cannot represent labels >= 88.
num_pitch_classes = 128

# Build the CNN
model = keras.Sequential([
    keras.Input(shape=input_shape),

    layers.Conv2D(32, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(128, (3, 3), activation="relu", padding="same"),
    layers.MaxPooling2D((2, 2)),

    # Flatten would produce an undefined feature size because the time axis is
    # None, and Dense cannot be built on that. Global average pooling collapses
    # the frequency/time axes into a fixed 128-dimensional vector instead.
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation="relu"),
    layers.Dense(num_pitch_classes, activation="softmax"),  # one class per MIDI pitch
])

# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Print the model architecture
model.summary()


In [None]:
# Train the model on the prepared dataset for 10 epochs
model.fit(x=dataset, epochs=10)


In [None]:
# Load a new audio clip to run inference on
test_file = "C:/Users/M S I/Desktop/Autopitch/dataset/nsynth-train/audio/vocal_synthetic_020-080-060.wav"
test_mel, _ = load_audio(test_file)

# Add a leading batch axis and a trailing channel axis: (128, time) -> (1, 128, time, 1)
test_mel = test_mel[np.newaxis, :, :, np.newaxis]

# Predict the note: the index of the highest-scoring class
prediction = model.predict(test_mel)
predicted_pitch = prediction.argmax()

print(f"Predicted Pitch: {predicted_pitch}")
