In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleARModel(nn.Module):
    """Minimal autoregressive model: a GRU followed by a linear read-out.

    The GRU consumes a batch-first sequence and the linear layer maps the
    GRU output at the final time step to a single scalar, interpreted as the
    prediction for the sample that follows the input sequence.
    """

    def __init__(self, input_size, hidden_size):
        """Build the recurrent layer and the scalar output head.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of the GRU hidden state.
        """
        super().__init__()
        # batch_first=True -> tensors are laid out (batch, time, features).
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per sequence in the batch.

        Args:
            x: Input tensor laid out (batch, time, input_size).

        Returns:
            Tensor of shape (batch, 1) computed from the last time step.
        """
        hidden_states, _ = self.rnn(x)
        # Only the final time step feeds the output head.
        final_state = hidden_states[:, -1, :]
        return self.fc(final_state)

def generate_sine_wave(frequency, sample_rate, duration):
    """Create synthetic training data: a unit-amplitude sine wave.

    Args:
        frequency: Tone frequency in Hz.
        sample_rate: Samples per second.
        duration: Length of the signal in seconds.

    Returns:
        1-D NumPy array holding ``int(sample_rate * duration)`` samples.
    """
    num_samples = int(sample_rate * duration)
    # endpoint=False keeps the spacing at duration / num_samples, so the
    # instant ``duration`` itself is not sampled.
    timestamps = np.linspace(0, duration, num_samples, endpoint=False)
    angular_frequency = 2 * np.pi * frequency
    return np.sin(angular_frequency * timestamps)

# Training parameters
sample_rate = 16000
duration = 1.0  # 1 second
frequency = 440.0
hidden_size = 32
learning_rate = 0.001
epochs = 100
context_len = 64  # number of past samples the model sees for each prediction
batch_size = 256  # windows per optimizer step

# Generate training data
audio_data = generate_sine_wave(frequency, sample_rate, duration)
if len(audio_data) <= context_len:
    raise ValueError(
        f"Need more than {context_len} samples to build training windows, "
        f"got {len(audio_data)}."
    )

# Whole-signal input/target tensors, kept under their original names and
# shapes (1, T-1, 1) in case other cells rely on them.
input_data = torch.tensor(audio_data[:-1], dtype=torch.float32).view(1, -1, 1)
target_data = torch.tensor(audio_data[1:], dtype=torch.float32).view(1, -1, 1)

# Sliding (context -> next sample) pairs. The model only emits a prediction
# for the step after its input, so scoring a single 1-second sequence yields
# exactly one training example (the last sample) and the network just learns
# to output that constant. Windowing turns every position of the signal into
# a supervised example instead.
samples = torch.tensor(audio_data, dtype=torch.float32)
windows = samples.unfold(0, context_len + 1, 1)  # (num_windows, context_len + 1)
train_inputs = windows[:, :-1].unsqueeze(-1)  # (num_windows, context_len, 1)
train_targets = windows[:, -1:]  # (num_windows, 1)
num_windows = train_inputs.size(0)

# Initialize the model, loss function, and optimizer
model = SimpleARModel(input_size=1, hidden_size=hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(epochs):
    model.train()
    # Fresh shuffle each epoch so mini-batches are not phase-correlated.
    order = torch.randperm(num_windows)
    epoch_loss = 0.0
    for start in range(0, num_windows, batch_size):
        batch_idx = order[start:start + batch_size]
        optimizer.zero_grad()
        output = model(train_inputs[batch_idx])
        loss = criterion(output, train_targets[batch_idx])
        loss.backward()
        optimizer.step()
        # Weight by batch size so the reported value is a true per-window mean.
        epoch_loss += loss.item() * batch_idx.numel()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss / num_windows:.4f}")

# Generate new audio samples
model.eval()
max_context = 64  # most recent samples fed to the model at each step
num_samples = sample_rate  # exactly 1 second of audio, prompt included

# Prime with real samples: one isolated sample cannot tell the model whether
# the wave is rising or falling, and feeding only the previous prediction with
# a fresh hidden state makes every step a memoryless function of one value.
generated_audio = [float(s) for s in audio_data[:min(max_context, num_samples)]]

with torch.no_grad():
    while len(generated_audio) < num_samples:
        # Sliding window over the most recent samples, laid out (1, time, 1).
        context = generated_audio[-max_context:]
        input_seq = torch.tensor(context, dtype=torch.float32).view(1, -1, 1)
        next_sample = model(input_seq).item()
        generated_audio.append(next_sample)

# Save the generated audio as a .wav file
from scipy.io.wavfile import write
output_path = "../data/generated_audio_AUTOREG.wav"
write(output_path, sample_rate, np.array(generated_audio, dtype=np.float32))

# Report the path that was actually written.
print(f"Audio generation complete. Saved as '{output_path}'.")

Epoch [10/100], Loss: 0.0000
Epoch [20/100], Loss: 0.0000
Epoch [30/100], Loss: 0.0000
Epoch [40/100], Loss: 0.0000
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Audio generation complete. Saved as 'generated_audio.wav'.
