# Binary Files for Audio in Python (WAV, PCM) — Read/Write/Process

In this section you will learn:
- What PCM/WAV audio is (sample rate, channels, sample width)
- Reading and writing binary audio with the standard-library `wave` module, plus the third-party `numpy` and `scipy` packages
- Chunked reading, RMS power, normalization, downmixing (stereo→mono)
- Basic resampling and silence trimming
- Optional: handling non-WAV formats (MP3/FLAC) with `soundfile`, `librosa`, `pydub`

**Key ideas**
- WAV is a container; for uncompressed PCM, samples are typically 16-bit **signed** little-endian (`s16le`).
- **Sample rate** (e.g., 44_100 Hz) = samples per second, per channel.
- **Channels**: 1 (mono), 2 (stereo), etc.
- **Sample width**: bytes per sample (2 bytes for 16-bit).

We’ll generate a small test tone so you can run everything anywhere.


In [4]:
# 0) Imports & output paths (no audioop here)
from pathlib import Path
import math, struct, contextlib
import wave
import numpy as np

# Optional dependency: SciPy's polyphase resampler. HAVE_SCIPY records whether
# the import worked so later cells can skip resampling instead of failing.
try:
    from scipy.signal import resample_poly
except Exception:
    HAVE_SCIPY = False
else:
    HAVE_SCIPY = True

# Every file written by this notebook goes under DATA_DIR (created on first run).
DATA_DIR = Path("data_audio")
TEST_WAV = DATA_DIR.joinpath("test_tone_440Hz_mono_16bit_44k.wav")
DATA_DIR.mkdir(exist_ok=True)


In [5]:
HAVE_SCIPY

True

## 1) Generate a 1-second 440 Hz test tone (PCM 16-bit, mono, 44.1 kHz)

In [6]:
def write_sine_wav(path: Path, freq=440.0, duration_sec=1.0, sr=44_100, amp=0.5):
    """Synthesize a sine tone and store it as a mono, 16-bit PCM WAV file.

    Args:
        path: Destination file; it is opened in write mode, so existing
            content is replaced.
        freq: Tone frequency in Hz.
        duration_sec: Tone length in seconds; the frame count is
            ``int(sr * duration_sec)``.
        sr: Sample rate in Hz.
        amp: Linear gain applied to the unit sine. Each scaled sample is
            clamped to [-1, 1] before conversion to int16.
    """
    full_scale = 32767
    total_frames = int(sr * duration_sec)
    pack_sample = struct.Struct("<h").pack  # little-endian int16

    def quantize(index):
        # Scale the sine, clamp to the unit range, then truncate to an integer.
        value = amp * math.sin(2 * math.pi * freq * (index / sr))
        clamped = max(-1.0, min(1.0, value))
        return int(clamped * full_scale)

    with wave.open(str(path), "wb") as wav_out:
        wav_out.setnchannels(1)
        wav_out.setsampwidth(2)      # 2 bytes = 16-bit
        wav_out.setframerate(sr)
        payload = b"".join(pack_sample(quantize(i)) for i in range(total_frames))
        wav_out.writeframes(payload)

# Write the tone using the function's default parameters; the trailing tuple is
# the cell's displayed value: (path, exists?, size in bytes).
write_sine_wav(TEST_WAV)
TEST_WAV, TEST_WAV.exists(), TEST_WAV.stat().st_size


(PosixPath('data_audio/test_tone_440Hz_mono_16bit_44k.wav'), True, 88244)

## 2) Inspect WAV metadata

In [7]:
def wav_info(path: Path):
    """Read the header of a WAV file and summarize it.

    Args:
        path: WAV file to inspect.

    Returns:
        dict with keys ``channels``, ``sampwidth`` (bytes per sample),
        ``samplerate`` (Hz), ``frames`` and ``duration`` (seconds, computed
        as frames / samplerate).
    """
    with wave.open(str(path), "rb") as reader:
        header = reader.getparams()
        seconds = header.nframes / float(header.framerate)
    return {
        "channels": header.nchannels,
        "sampwidth": header.sampwidth,
        "samplerate": header.framerate,
        "frames": header.nframes,
        "duration": seconds,
    }

# Display the header summary of the tone file written above.
wav_info(TEST_WAV)


{'channels': 1,
 'sampwidth': 2,
 'samplerate': 44100,
 'frames': 44100,
 'duration': 1.0}

## 3) Read WAV into NumPy and write back

We’ll support **16-bit PCM** (most common).  
The array shape will be `(frames, channels)` with dtype `int16`.


In [8]:
def read_wav_np(path: Path) -> tuple[np.ndarray, int]:
    """Load a 16-bit PCM WAV file into a NumPy array.

    Args:
        path: WAV file to read.

    Returns:
        ``(audio, samplerate)`` where ``audio`` is a writable ``int16`` array
        of shape ``(frames, channels)`` (mono files give ``(frames, 1)``).

    Raises:
        ValueError: If the file's sample width is not 2 bytes. This is a real
            exception rather than an ``assert`` so the check still runs under
            ``python -O``; without it other widths would be decoded as garbage.
    """
    with wave.open(str(path), "rb") as wf:
        ch = wf.getnchannels()
        sw = wf.getsampwidth()
        sr = wf.getframerate()
        if sw != 2:
            raise ValueError(f"Only 16-bit WAV supported here; got sampwidth={sw}")
        raw = wf.readframes(wf.getnframes())
    # Samples are interleaved per frame, so one reshape covers mono and multi-channel.
    x = np.frombuffer(raw, dtype=np.int16).reshape(-1, ch)
    return x.copy(), sr  # frombuffer yields a read-only view; copy makes it writable

def write_wav_np(path: Path, x: np.ndarray, sr: int):
    """Write an array to a 16-bit PCM WAV file.

    Args:
        path: Destination file; opened in write mode, replacing any content.
        x: Audio of shape ``(frames,)`` or ``(frames, channels)``.
            Float arrays are clipped to [-1, 1] and scaled by 32767.
            Integer arrays of other widths are saturated to the int16 range
            (instead of wrapping around) before conversion.
        sr: Sample rate in Hz written to the header.

    Raises:
        ValueError: If ``x`` is not 1-D or 2-D.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    if x.ndim != 2:
        raise ValueError(
            f"Expected shape (frames,) or (frames, channels); got {x.shape}"
        )
    if x.dtype.kind == "f":
        x = np.clip(x, -1.0, 1.0)
        x = (x * 32767.0).astype(np.int16)
    elif x.dtype != np.int16:
        if x.dtype.kind in "iu":
            # A plain astype() wraps out-of-range values (40000 -> -25536).
            # Bounds are limited to what the source dtype can represent so
            # np.clip never receives a limit outside that dtype.
            src = np.iinfo(x.dtype)
            dst = np.iinfo(np.int16)
            x = np.clip(x, max(src.min, dst.min), min(src.max, dst.max))
        x = x.astype(np.int16)

    ch = x.shape[1]
    with wave.open(str(path), "wb") as wf:
        wf.setnchannels(ch)
        wf.setsampwidth(2)
        wf.setframerate(sr)
        wf.writeframes(x.tobytes())

# Load the generated tone; `x_mono` and `sr` are reused by the cells below.
# The trailing tuple is the cell's displayed value: (shape, sample rate, dtype).
x_mono, sr = read_wav_np(TEST_WAV)
x_mono.shape, sr, x_mono.dtype


((44100, 1), 44100, dtype('int16'))

## 4) RMS power (overall and per-channel)


In [9]:
def rms_np(x: np.ndarray) -> float:
    """Return the root-mean-square level over every sample of every channel.

    The input is promoted to float64 first so squaring int16 data cannot
    overflow. The result is in the same units as the input samples.
    """
    samples = x.astype(np.float64)
    mean_square = np.mean(samples * samples)
    return float(np.sqrt(mean_square))

def rms_per_channel(x: np.ndarray) -> np.ndarray:
    """Return the root-mean-square level of each channel separately.

    The mean is taken along axis 0 (frames), so a ``(frames, channels)`` input
    gives an array of shape ``(channels,)``. Samples are promoted to float64
    before squaring.
    """
    samples = x.astype(np.float64)
    squared = samples * samples
    return np.sqrt(squared.mean(axis=0))

# Measure the tone both ways; the trailing tuple is the cell's displayed value.
# With a single channel the overall and per-channel figures coincide.
rms_overall = rms_np(x_mono)
rms_channels = rms_per_channel(x_mono)
rms_overall, rms_channels


(11584.434884910057, array([11584.43488491]))

## 5) Peak normalization (no clipping)

Scale so the maximum absolute sample reaches a target (e.g., 0.95 of full-scale).


In [10]:
def normalize_peak_int16(x: np.ndarray, target: float = 0.95) -> np.ndarray:
    """Peak-normalize audio so its largest absolute sample is ``target * 32767``.

    Args:
        x: int16 or float samples, any shape.
        target: Desired peak as a fraction of int16 full scale.

    Returns:
        int16 array with the same shape as ``x``. Empty input and all-zero
        input are returned unscaled (converted to int16), since there is no
        peak to scale from. Scaled values are truncated toward zero by the
        final integer conversion.
    """
    x_f = np.asarray(x).astype(np.float64)
    # np.max raises on a zero-size array, so handle empty audio up front.
    if x_f.size == 0:
        return x_f.astype(np.int16)
    peak = np.max(np.abs(x_f))
    if peak == 0:
        return x_f.astype(np.int16)
    scale = (target * 32767.0) / peak
    # Clip guards against target > 1 pushing samples past the int16 range.
    y = np.clip(x_f * scale, -32768, 32767)
    return y.astype(np.int16)

# Normalize the tone to 95% of full scale and save it; the trailing tuple is
# the cell's displayed value: (header of the new file, RMS after scaling).
x_norm = normalize_peak_int16(x_mono, target=0.95)
out_norm = DATA_DIR / "test_tone_normalized_np.wav"
write_wav_np(out_norm, x_norm, sr)
wav_info(out_norm), rms_np(x_norm)


({'channels': 1,
  'sampwidth': 2,
  'samplerate': 44100,
  'frames': 44100,
  'duration': 1.0},
 22010.65738641881)

## 6) Stereo/Mono conversions (downmix & upmix)

In [11]:
def to_stereo_from_mono(x_mono: np.ndarray) -> np.ndarray:
    """Duplicate a mono signal into two identical channels.

    Args:
        x_mono: Samples of shape ``(frames,)`` or ``(frames, 1)``.

    Returns:
        Array of shape ``(frames, 2)`` with the input dtype.

    Raises:
        ValueError: If the input is not single-channel. This is a real
            exception rather than an ``assert`` so the check survives
            ``python -O``; unchecked, a stereo input would come back with
            four channels.
    """
    x_mono = np.asarray(x_mono)
    if x_mono.ndim == 1:
        x_mono = x_mono[:, None]
    if x_mono.ndim != 2 or x_mono.shape[1] != 1:
        raise ValueError("Input must be mono")
    return np.repeat(x_mono, 2, axis=1)

def downmix_stereo_to_mono(x_stereo: np.ndarray, method: str = "avg") -> np.ndarray:
    """Collapse a two-channel int16 signal to one channel.

    Args:
        x_stereo: Samples of shape ``(frames, 2)``.
        method: ``"sum"`` adds the channels and saturates to the int16 range.
            Any other value averages them, which cannot overflow.

    Returns:
        int16 array of shape ``(frames, 1)``. Averages are truncated toward
        zero by the final integer conversion.

    Raises:
        ValueError: If the input is not ``(frames, 2)``. This is a real
            exception rather than an ``assert`` so the check survives
            ``python -O``.
    """
    x_stereo = np.asarray(x_stereo)
    if x_stereo.ndim != 2 or x_stereo.shape[1] != 2:
        raise ValueError("Input must be stereo")
    if method == "sum":
        # Accumulate in int64 so L+R cannot wrap before it is clipped.
        total = x_stereo.sum(axis=1, dtype=np.int64)
        return np.clip(total, -32768, 32767).astype(np.int16)[:, None]
    # default: average
    averaged = x_stereo.astype(np.int32).mean(axis=1)
    return averaged.astype(np.int16)[:, None]

# Make a stereo file from mono (duplicate channels)
x_stereo = to_stereo_from_mono(x_mono)
stereo_wav = DATA_DIR / "test_tone_stereo_np.wav"
write_wav_np(stereo_wav, x_stereo, sr)

# Downmix back to mono by averaging the two (identical) channels
x_mono2 = downmix_stereo_to_mono(x_stereo, method="avg")
downmix_wav = DATA_DIR / "test_tone_downmixed_mono_np.wav"
write_wav_np(downmix_wav, x_mono2, sr)
# Displayed value: the header summaries of both files, side by side.
wav_info(stereo_wav), wav_info(downmix_wav)


({'channels': 2,
  'sampwidth': 2,
  'samplerate': 44100,
  'frames': 44100,
  'duration': 1.0},
 {'channels': 1,
  'sampwidth': 2,
  'samplerate': 44100,
  'frames': 44100,
  'duration': 1.0})

## 7) Resampling with `scipy.signal.resample_poly` (quality & speed)

If `scipy` is unavailable, show a message to install it.


In [12]:
def resample_np(x: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
    """Convert audio from one sample rate to another with SciPy's polyphase filter.

    Args:
        x: Samples with frames along axis 0, e.g. ``(frames, channels)``.
        src_sr: Current sample rate in Hz.
        dst_sr: Desired sample rate in Hz.

    Returns:
        int16 array resampled along axis 0, clipped to the int16 range.

    Raises:
        RuntimeError: If SciPy could not be imported.
    """
    if not HAVE_SCIPY:
        raise RuntimeError("scipy not available. Install with `pip install scipy`.")
    # Filter in float64 so intermediate values are not quantized.
    as_float = x.astype(np.float64)
    # Reduce dst/src to lowest terms to get the up/down factors.
    common = math.gcd(src_sr, dst_sr)
    up_factor = dst_sr // common
    down_factor = src_sr // common
    resampled = resample_poly(as_float, up_factor, down_factor, axis=0)
    # Filter overshoot can exceed full scale; clip before the integer cast.
    return np.clip(resampled, -32768, 32767).astype(np.int16)

# Halve the sample rate (44.1 kHz -> 22.05 kHz) when SciPy is available.
if HAVE_SCIPY:
    x_22k = resample_np(x_mono, sr, 22_050)
    out_22k = DATA_DIR / "test_tone_22k_mono_np.wav"
    write_wav_np(out_22k, x_22k, 22_050)
    # A bare expression inside an `if` body is never echoed by the notebook,
    # so the summary has to be printed explicitly.
    print(wav_info(out_22k))
else:
    print("Install scipy for resampling: pip install scipy")


## 8) Trim leading/trailing silence (threshold on absolute amplitude)

Works for mono or stereo. Threshold is in **int16 units** (0–32767).  
We scan in frames of `frame_len` samples for speed.


In [13]:
def _frame_peak(frame: np.ndarray):
    """Return the largest absolute sample in a non-empty frame as a Python number."""
    # np.abs() on int16 maps -32768 back to -32768 (overflow), which would make
    # a full-scale negative peak look silent. Comparing the two extremes as
    # Python numbers avoids the wraparound without copying the frame.
    return max(frame.max().item(), -frame.min().item())


def trim_silence_int16(x: np.ndarray, threshold: int = 500, frame_len: int = 1024) -> np.ndarray:
    """Trim leading and trailing silence from a mono or multi-channel waveform.

    The signal is scanned in blocks of ``frame_len`` frames. A block counts as
    silent when no sample in any channel exceeds ``threshold`` in absolute
    value. Cuts land on block boundaries, so up to ``frame_len - 1`` quiet
    frames can remain at each end.

    Args:
        x: Samples of shape ``(frames,)`` or ``(frames, channels)``.
        threshold: Silence threshold in sample units.
        frame_len: Block size in frames; must be at least 1.

    Returns:
        int16 array of shape ``(kept_frames, channels)``. If no block exceeds
        the threshold, the whole input is returned untrimmed.

    Raises:
        ValueError: If ``frame_len`` is less than 1.
    """
    if frame_len < 1:
        raise ValueError(f"frame_len must be >= 1; got {frame_len}")
    if x.ndim == 1:
        x = x[:, None]
    n = x.shape[0]

    # First block, scanning forward, that contains audible signal.
    start = 0
    for i in range(0, n, frame_len):
        if _frame_peak(x[i:i + frame_len]) > threshold:
            start = i
            break

    # Last audible block, scanning backward in blocks aligned to the end.
    end = n
    for j in range(n, 0, -frame_len):
        if _frame_peak(x[max(0, j - frame_len):j]) > threshold:
            end = j
            break

    return x[start:end].astype(np.int16)

# Trim with a low threshold and save the result. The displayed header still
# reports 44100 frames: the tone is audible in its first and last block, so
# nothing is removed.
x_trim = trim_silence_int16(x_mono, threshold=200)
trim_wav = DATA_DIR / "test_tone_trimmed_np.wav"
write_wav_np(trim_wav, x_trim, sr)
wav_info(trim_wav)


{'channels': 1,
 'sampwidth': 2,
 'samplerate': 44100,
 'frames': 44100,
 'duration': 1.0}

## 9) Raw binary copy (any file) — no audio-specific APIs needed


In [14]:
src = TEST_WAV
dst = DATA_DIR / "test_tone_copy_np.wav"

# Stream the bytes across in 8 KiB blocks so the whole file is never held in
# memory; read() returns b"" at end of file, which ends the loop.
with open(src, "rb") as reader, open(dst, "wb") as writer:
    while chunk := reader.read(8192):
        writer.write(chunk)

# Displayed value: (copy exists?, sizes match?)
dst.exists(), dst.stat().st_size == src.stat().st_size


(True, True)

## 10) Optional: broader format support (MP3/FLAC/OGG)

The standard library handles WAV easily via `wave` (the AIFF module `aifc` was deprecated in Python 3.11 and removed in 3.13). For compressed formats:

- **soundfile** (`pip install soundfile`) — reads/writes WAV, FLAC, OGG (MP3 only when built against libsndfile ≥ 1.1.0)
- **librosa** (`pip install librosa`) — convenient loading/resampling to float arrays
- **pydub** (`pip install pydub`, requires ffmpeg) — easy convert/export MP3↔WAV

Run the cells below only if you have these installed (or install first).


In [16]:
# soundfile example (WAV/FLAC/OGG; MP3 availability depends on the libsndfile build).
# !pip install soundfile
try:
    import soundfile as sf
    import numpy as np

    # Build a 1-second stereo signal: 440 Hz on the left, 660 Hz on the right.
    # A dedicated name keeps the notebook-wide `sr` from being overwritten.
    sf_sr = 48_000
    t = np.linspace(0, 1.0, sf_sr, endpoint=False)
    tone_l = 0.3*np.sin(2*np.pi*440*t)
    tone_r = 0.3*np.sin(2*np.pi*660*t)
    stereo = np.stack([tone_l, tone_r], axis=1)  # shape (n, 2)
    sf_path = DATA_DIR / "sf_stereo_float.wav"
    sf.write(sf_path, stereo, sf_sr, subtype="PCM_16")  # float input stored as 16-bit PCM
    print("Wrote:", sf_path)

    # Read back (works for FLAC, OGG, WAV)
    data_arr, sr_read = sf.read(sf_path, always_2d=True)
    # Printed explicitly: a bare expression inside `try` is never echoed by the notebook.
    print("Read back:", data_arr.shape, sr_read)
except Exception as e:
    # Best-effort demo: a missing package or codec skips the cell instead of aborting the run.
    print("soundfile example skipped:", e)


soundfile example skipped: No module named 'soundfile'


In [17]:
# librosa example (convenient loader + resampling)
# !pip install librosa soundfile
try:
    import librosa, soundfile as sf

    # sr=None keeps the file's native rate. The result goes into its own name
    # so the notebook-wide `sr` is not rebound by this optional cell.
    y, sr_native = librosa.load(str(TEST_WAV), sr=None, mono=True)  # y: float32 [-1,1]
    y_22k = librosa.resample(y, orig_sr=sr_native, target_sr=22_050)
    out_lr = DATA_DIR / "librosa_resampled_22k.wav"
    sf.write(out_lr, y_22k, 22_050, subtype="PCM_16")
    # Printed explicitly: a bare expression inside `try` is never echoed by the notebook.
    print("Wrote:", out_lr, "| samples before/after:", len(y), len(y_22k))
except Exception as e:
    # Best-effort demo: a missing package skips the cell instead of aborting the run.
    print("librosa example skipped:", e)


librosa example skipped: No module named 'librosa'


In [18]:
# pydub example (needs ffmpeg installed in system PATH)
# !pip install pydub
try:
    from pydub import AudioSegment

    # Convert WAV -> MP3 -> WAV (if ffmpeg available)
    wav_path = TEST_WAV
    mp3_path = DATA_DIR / "tone.mp3"
    wav2_path = DATA_DIR / "tone_back_to_wav.wav"

    seg = AudioSegment.from_wav(wav_path)
    seg.export(mp3_path, format="mp3", bitrate="192k")
    seg2 = AudioSegment.from_mp3(mp3_path)
    seg2.export(wav2_path, format="wav")
    # Printed explicitly: a bare expression inside `try` is never echoed by the notebook.
    print("MP3 written:", mp3_path.exists(), "| WAV written:", wav2_path.exists())
except Exception as e:
    # Best-effort demo: a missing package or ffmpeg skips the cell instead of aborting the run.
    print("pydub example skipped:", e)


pydub example skipped: No module named 'pydub'
