In [2]:
using Pkg

Pkg.add("WAV")
Pkg.add("DSP")

using Flux
using Flux: onehotbatch, onecold, crossentropy, @epochs
using Statistics: mean
using WAV         # For loading WAV files
using DSP         # For computing spectrograms



[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m WAV ─ v1.2.0
[32m[1m    Updating[22m[39m `~/.julia/environments/v1.11/Project.toml`
  [90m[8149f6b0] [39m[92m+ WAV v1.2.0[39m
[32m[1m    Updating[22m[39m `~/.julia/environments/v1.11/Manifest.toml`
  [90m[8149f6b0] [39m[92m+ WAV v1.2.0[39m
[92m[1mPrecompiling[22m[39m project...
   6077.1 ms[32m  ✓ [39mWAV
   8249.8 ms[32m  ✓ [39mPlots → UnitfulExt
         [91m  ✗ [39mCUDA
         [91m  ✗ [39mCUDA → EnzymeCoreExt
         [91m  ✗ [39m[90mAtomix → AtomixCUDAExt[39m
         [91m  ✗ [39m[90mArrayInterface → ArrayInterfaceCUDAExt[39m
         [91m  ✗ [39m[90mStridedViews → StridedViewsCUDAExt[39m
         [91m  ✗ [39m[90mNNlib → NNlibCUDAExt[39m
         [91m  ✗ [39mCUDA → SpecialFunctionsExt
         [91m  ✗ [39m[90mLinearSolve → LinearSolveCUDAExt[39m
         [91m  ✗ [39mCUDA → ChainRulesCoreExt


In [None]:
# --- Download and unpack the Free Spoken Digit Dataset (FSDD) ---
# Requires the ZipFile package to be installed (e.g. `Pkg.add("ZipFile")`).
using Downloads, ZipFile

url = "https://github.com/Jakobovski/free-spoken-digit-dataset/archive/refs/heads/master.zip"
zip_file = "fsdd.zip"
extract_dir = "fsdd"

Downloads.download(url, zip_file)

# ZipFile.jl does not provide an `extract` function; the archive has to be
# walked entry by entry through a `ZipFile.Reader`.
reader = ZipFile.Reader(zip_file)
try
    for entry in reader.files
        target = joinpath(extract_dir, entry.name)
        if endswith(entry.name, "/")
            # Directory entries carry a trailing slash and have no payload.
            mkpath(target)
        else
            # Make sure the parent directory exists even when the archive
            # lists a file before (or without) its directory entry.
            mkpath(dirname(target))
            write(target, read(entry))
        end
    end
finally
    # Always release the file handle, even if extraction fails midway.
    close(reader)
end

In [None]:
# --- Helper function to load audio and compute a spectrogram ---
"""
    load_spectrogram(filepath; nfft=1024, hop=512)

Load the WAV file at `filepath`, mix it down to mono and return the magnitude
of its short-time Fourier transform (STFT).

# Keywords
- `nfft`: window length in samples, also used as the FFT size.
- `hop`: number of samples between the starts of consecutive windows.
  Must satisfy `0 < hop <= nfft`.

# Returns
A real matrix holding `abs.(stft(...))`; for a real input signal DSP.jl
produces one row per frequency bin and one column per time frame.

# Throws
- `ArgumentError` if `hop` is outside `1:nfft`.
"""
function load_spectrogram(filepath; nfft=1024, hop=512)
    0 < hop <= nfft || throw(
        ArgumentError("hop must satisfy 0 < hop <= nfft, got hop=$hop, nfft=$nfft"),
    )

    # Only the samples are needed; the sample rate and the remaining return
    # values of `wavread` are discarded.
    samples, _ = wavread(filepath)

    # Average across channels so stereo (or multi-channel) audio becomes mono.
    # For single-channel audio this is a no-op apart from the reshape.
    y = vec(mean(samples; dims=2))

    # A signal shorter than one window would yield zero STFT frames;
    # zero-pad it so that at least one frame is produced.
    if length(y) < nfft
        y = vcat(y, zeros(eltype(y), nfft - length(y)))
    end

    # DSP.stft takes the window length and the *overlap* as positional
    # arguments (it has no `hop` keyword), so convert hop -> overlap.
    S = stft(y, nfft, nfft - hop; nfft=nfft, window=hanning(nfft))

    # Take magnitude (you might also want to convert to a Mel scale).
    # Optionally: resize or normalize the spectrogram before returning.
    return abs.(S)
end



In [None]:
# --- CNN Model Definition ---
# The network expects every audio file to be converted into a fixed-size
# spectrogram "image", e.g. 128×128 with a single (grayscale) channel.
const IMG_H = 128
const IMG_W = 128
const CHANNELS = 1      # e.g., grayscale spectrogram

# For FSDD, there are 10 classes (digits 0-9)
const num_classes = 10

# Three conv + 2×2 max-pool stages; each pool halves both spatial dimensions,
# so the feature map entering the dense head is (IMG_H ÷ 8) × (IMG_W ÷ 8) × 64.
model = Chain(
    # Input: (IMG_H, IMG_W, CHANNELS, batch)
    Conv((3, 3), CHANNELS => 16, relu; pad=(1, 1)),
    MaxPool((2, 2)),
    Conv((3, 3), 16 => 32, relu; pad=(1, 1)),
    MaxPool((2, 2)),
    Conv((3, 3), 32 => 64, relu; pad=(1, 1)),
    MaxPool((2, 2)),
    # Collapse (H, W, C) into one feature vector per example.
    Flux.flatten,
    # `in => out` pair syntax matches the Conv layers above; the positional
    # `Dense(in, out, σ)` form is a legacy spelling.
    Dense((IMG_H ÷ 8) * (IMG_W ÷ 8) * 64 => 128, relu),
    Dense(128 => num_classes),
    # Outputs class probabilities, to be paired with `crossentropy`.
    softmax,
)



In [None]:
# --- Dummy Data for Demonstration ---
# Replace this with your actual data loader that:
#   - Iterates over FSDD audio files,
#   - Computes their spectrogram using load_spectrogram,
#   - Resizes/normalizes to (128, 128, 1),
#   - And converts labels (digits) into one-hot vectors.
dummy_input = rand(Float32, IMG_H, IMG_W, CHANNELS, 16)  # 16 dummy examples

# FSDD labels are the digits 0-9, so encode against 0:(num_classes - 1)
# rather than 1:num_classes; real labels can then be dropped in unchanged.
digit_labels = 0:(num_classes - 1)
dummy_labels = onehotbatch(rand(digit_labels, 16), digit_labels)

# Forward pass example:
output = model(dummy_input)
println("Model output size: ", size(output))

# --- Training Loop Skeleton ---
# Explicit-parameter form of the loss, used for differentiation below.
loss(m, x, y) = crossentropy(m(x), y)
# Convenience method that evaluates the loss of the global `model`.
loss(x, y) = loss(model, x, y)

# `Adam` is the current name of the optimiser formerly spelled `ADAM`.
opt = Adam()
# Optimiser state (moment estimates etc.) is kept in a tree mirroring `model`.
opt_state = Flux.setup(opt, model)

train_data = [(dummy_input, dummy_labels)]  # Replace with your actual data iterator
n_epochs = 5

# Plain loop instead of the `@epochs` macro, which is no longer part of Flux.
for epoch in 1:n_epochs
    @info "Epoch $epoch"
    for (x, y) in train_data
        # Differentiate with respect to the model itself (explicit gradients)
        # instead of the deprecated implicit `Flux.params` collection.
        grads = Flux.gradient(m -> loss(m, x, y), model)
        Flux.update!(opt_state, model, grads[1])
    end
end

println("Training complete.")
