In [5]:
import numpy as np
import librosa
import soundfile as sf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Input

# Upmix a mono recording to stereo with an (untrained) 1-D convolutional
# network and write the result to disk.
#
# The network is fully convolutional: it maps (batch, time, 1) to
# (batch, time, 2), i.e. one left/right pair per input sample. Because no
# layer depends on the length of the time axis, the same model can process
# chunks of any size, including a shorter final chunk, and the output keeps
# the duration of the input.


def _peak_normalize(signal):
    """Scale *signal* so its largest absolute sample is 1.0.

    A silent (all-zero) signal is returned unchanged instead of being
    divided by zero, which would fill it with NaNs.
    """
    peak = np.max(np.abs(signal))
    if peak > 0:
        return signal / peak
    return signal


# Load mono audio file (sr=None keeps the file's native sample rate)
mono_audio_path = "../data/mono_audio_16k_short.wav"
audio, sr = librosa.load(mono_audio_path, sr=None, mono=True)
if audio.size == 0:
    raise ValueError(f"No audio samples found in {mono_audio_path}")

# Normalize audio
audio = _peak_normalize(audio)

# Whole signal shaped as (batch_size, time_steps, features); kept for
# reference, the model itself is fed chunk by chunk below.
audio_input = audio.reshape(1, -1, 1)

# Number of samples handed to the model per predict() call
CHUNK_SIZE = 8192

# Define a simple generative model for upmixing.
# The time dimension is left as None so the model accepts any chunk length,
# and the final 1x1 convolution emits two values (left, right) per time step.
model = Sequential([
    Input(shape=(None, 1)),
    Conv1D(64, kernel_size=3, activation='relu', padding='same'),
    Conv1D(128, kernel_size=3, activation='relu', padding='same'),
    Conv1D(64, kernel_size=3, activation='relu', padding='same'),
    Conv1D(2, kernel_size=1, activation='tanh', padding='same'),
])

# For simplicity, we use random weights (no training)
model.compile(optimizer='adam', loss='mse')

# Process audio in chunks.
# NOTE: 'same' padding zero-pads every chunk independently, so the few
# samples next to each chunk boundary see zeros instead of the neighbouring
# chunk's samples.
num_chunks = (len(audio) + CHUNK_SIZE - 1) // CHUNK_SIZE
stereo_chunks = []
for chunk_number, start in enumerate(range(0, len(audio), CHUNK_SIZE), start=1):
    chunk_input = audio[start:start + CHUNK_SIZE].reshape(1, -1, 1)

    # Generate stereo for this chunk; drop the batch axis -> (chunk_len, 2)
    chunk_output = model.predict(chunk_input)
    stereo_chunks.append(chunk_output[0])

    # Optional: Print progress
    print(f"Processed chunk {chunk_number}/{num_chunks}")

# Combine all chunks along the time axis -> (num_samples, 2)
stereo_output = np.concatenate(stereo_chunks, axis=0)

# Extract left and right channels, normalizing each to avoid clipping
left_channel = _peak_normalize(stereo_output[:, 0])
right_channel = _peak_normalize(stereo_output[:, 1])

# Combine channels into stereo, shape (num_samples, 2)
stereo_audio = np.column_stack((left_channel, right_channel))

# Save the stereo audio file
stereo_audio_path = "../output/stereo_audio_CNN_generated.wav"
sf.write(stereo_audio_path, stereo_audio, sr)

print(f"Generative stereo audio saved to {stereo_audio_path}")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 108ms/step
Processed chunk 1/1
Generative stereo audio saved to ../output/stereo_audio_CNN_generated.wav
