# Torchaudio
Advantages over other audio libraries:
- Feature extraction can run on the GPU, which makes it much more **efficient**

In [1]:
import torch
import torchaudio

# Record the library versions this notebook was run with (one per line)
print(torch.__version__, torchaudio.__version__, sep="\n")

1.13.0.dev20221003
0.13.0.dev20221003


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import requests
import shutil

## Downloading and Saving Audio Files

In [3]:
# Local folder that holds every downloaded sample
SAMPLE_DIR = '../datasets/audio_datasets'

# Remote locations of the sample audio files
SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav"
SAMPLE_MP3_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.mp3"
SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav"

# Local destinations inside SAMPLE_DIR, one per URL above
SAMPLE_WAV_PATH = os.path.join(SAMPLE_DIR, "steam.wav")
SAMPLE_MP3_PATH = os.path.join(SAMPLE_DIR, "steam.mp3")
SAMPLE_WAV_SPEECH_PATH = os.path.join(SAMPLE_DIR, "speech.wav")
SAMPLE_NOISE_PATH = os.path.join(SAMPLE_DIR, "bg.wav")

In [4]:
# Create the SAMPLE_DIR folder (and any missing parents); a no-op when it already exists
os.makedirs(name=SAMPLE_DIR, exist_ok=True)

In [5]:
def fetch_audio_file(url, path, timeout=30):
    """Download the file at ``url`` and save it to ``path``.

    The request is completed and its status checked *before* the destination
    is opened, so a failed download never leaves an empty or bogus file
    (e.g. an HTML error page saved as ``.wav``) behind.

    Args:
        url: HTTP(S) address of the file to download.
        path: Local file path to write; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up
            (passed to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Without a timeout, requests can block forever on an unresponsive server
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of saving the error body
    response.raise_for_status()
    with open(path, 'wb') as file_:
        file_.write(response.content)

#### What does `fetch_audio_file` do?
- Sends an HTTP GET request to the URL
- Downloads the audio file and writes it to the specified path
- The file is stored locally at the given path/folder

In [6]:
# Download every sample (train whistle WAV/MP3, speech, background noise) to its local path
for sample_url, sample_path in (
    (SAMPLE_WAV_URL, SAMPLE_WAV_PATH),
    (SAMPLE_MP3_URL, SAMPLE_MP3_PATH),
    (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH),
    (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH),
):
    fetch_audio_file(sample_url, sample_path)

### Metadata collection
#### From downloaded path

In [8]:
# Query the metadata (sample rate, frame count, channels, encoding) of the downloaded WAV file
metadata = torchaudio.info(SAMPLE_WAV_PATH)
print(metadata)

AudioMetaData(sample_rate=44100, num_frames=109368, num_channels=2, bits_per_sample=16, encoding=PCM_S)


#### From raw file itself

In [11]:
# Query audio metadata directly from the URL, without saving the file to disk
with requests.get(SAMPLE_WAV_URL, stream=True, timeout=30) as response:
    # Fail fast on an HTTP error instead of handing an error body to the audio decoder
    response.raise_for_status()
    # With stream=True the body is not downloaded up front; `raw` exposes it as a file-like object
    filedata = response.raw
    metadata = torchaudio.info(filedata, format="wav")
    # tell() shows how many bytes were read from the stream to obtain the metadata
    print(f'Fetched {filedata.tell()} bytes')
print(metadata)

Fetched 8192 bytes
AudioMetaData(sample_rate=44100, num_frames=109368, num_channels=2, bits_per_sample=16, encoding=PCM_S)


## Loading Audio Files
By default:
- The returned tensor has `dtype = torch.float32`
- Sample values are **normalised** to the range [-1.0, 1.0]

In [12]:
# Load the speech sample; torchaudio.load returns the audio tensor and its sample rate
waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)
# Display the tensor shape, (channels, frames), together with the sample rate
waveform.shape, sample_rate

(torch.Size([1, 54400]), 16000)

**1**: Number of channels

**54400**: Number of frames/samples

**16000**: Sample rate 