In [1]:
%%script echo
%pip install transformers
%pip install tf-keras
%pip uninstall -y tensorflow tensorflow-macos tensorflow-metal keras
%pip install --upgrade pip
%pip install tensorflow==2.16.1 tensorflow-metal==1.1.0 keras==3.0.0 
%pip install pydub
# Also install ffmpeg (a system package, not a pip package): pydub relies on its ffmpeg/ffprobe binaries to decode audio files such as MP3




#### Library Imports

In [2]:
import os                                        # Operating system-related functions (file, directory operations)
import sys                                       # Provides access to system-specific parameters and functions
import platform                                  # Provides functions and information about the operating system and hardware

import numpy as np                               # numerical operations & array manipulation
import pickle                                    # for object serialization & deserialization
import random                                    # generates random numbers
import math
import time

import matplotlib.pyplot as plt                  # creating plots & visualizations

# import cv2                                       # computer vision & image processing
import tensorflow as tf                          # deep learning framework
import keras                                     # training & evaluating deep learning models
import transformers

from tqdm import tqdm                            # creating progress bars in loops
from typing import Dict, List, Tuple, Optional   # type hinting

In [3]:
#%conda --version
%pip --version

pip 24.3.1 from /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pip (python 3.11)
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Report the host platform and interpreter so results can be tied to an environment
os_name, os_release = platform.system(), platform.release()
print('Operating System:', os_name, os_release)
print('Python Platform:', platform.platform())
print('Python Version:', sys.version)
print()

# Report the versions of the key libraries (OpenCV is not imported in this notebook)
library_versions = (
    ('NumPy Version:', np.__version__),
    ('TensorFlow Version:', tf.__version__),
    ('Keras Version:', keras.__version__),
    ('Transformers Version:', transformers.__version__),
)
for version_label, version_value in library_versions:
    print(version_label, version_value)


Operating System: Darwin 24.0.0
Python Platform: macOS-15.0-arm64-arm-64bit
Python Version: 3.11.4 (v3.11.4:d2340ef257, Jun  6 2023, 19:15:51) [Clang 13.0.0 (clang-1300.0.29.30)]

NumPy Version: 1.26.3
TensorFlow Version: 2.16.1
Keras Version: 3.0.0
Transformers Version: 4.47.0


#### Task 2

Create a hosted microservice to deploy an Automatic Speech Recognition (ASR) AI model that can be used to transcribe any audio files.

**2a**

AI model to use: wav2vec2-large-960h
https://huggingface.co/facebook/wav2vec2-large-960h
This model was developed by Facebook; it was pretrained and fine-tuned on 960 hours of the LibriSpeech dataset, using speech audio sampled at 16 kHz. Please ensure that your speech input is also sampled at 16 kHz. The reference link (above) includes the model card and its usage code.

In [5]:
# Load the processor and CTC model classes directly
# (the higher-level transformers `pipeline` helper is an alternative that is not used here)
from transformers import AutoProcessor, AutoModelForCTC

# Local directory that receives a copy of the processor and model files
model_dir = 'model'

# Fetch the processor and model by their Hugging Face Hub identifier
checkpoint = 'facebook/wav2vec2-large-960h'
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForCTC.from_pretrained(checkpoint)

# Persist both so later cells can load them from `model_dir`
for component in (processor, model):
    component.save_pretrained(model_dir)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from pydub import AudioSegment
import subprocess

def get_sample_rate(file_path: str) -> int:
    '''
    Get the sample rate of the first audio stream of a file using ffprobe.
    Args:
        file_path (str): Path to the audio file.
    Returns:
        int: Sample rate in Hz, or -1 if it could not be determined
             (ffprobe missing, ffprobe failed, or no numeric rate reported).
    '''
    command = [
        'ffprobe',
        '-v', 'error',
        '-select_streams', 'a:0',
        '-show_entries', 'stream=sample_rate',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        file_path
    ]

    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except FileNotFoundError:
        # Raised when the ffprobe executable itself cannot be found
        print('Error retrieving sample rate: ffprobe executable not found (is ffmpeg installed?)')
        return -1
    except Exception as e:
        # Deliberate best-effort: callers rely on the -1 sentinel rather than an exception
        print(f'Error retrieving sample rate: {e}')
        return -1

    # A non-zero exit status means ffprobe could not read the file; report its own message
    if result.returncode != 0:
        detail = result.stderr.strip() or f'ffprobe exited with code {result.returncode}'
        print(f'Error retrieving sample rate: {detail}')
        return -1

    # Use the first line that parses as an integer, so repeated or non-numeric
    # lines (e.g. 'N/A') in the output do not break parsing
    for line in result.stdout.splitlines():
        try:
            return int(line.strip())
        except ValueError:
            continue

    print('Error retrieving sample rate: ffprobe reported no sample rate')
    return -1

audio_dir = 'audio'
ding_dir = 'harvard.wav'  # NOTE: this is a file name, not a directory
file_dir = os.path.join(audio_dir, ding_dir)

# Load the audio file. No explicit format is forced, so the container is
# detected from the file itself (this input is a WAV, not an MP3).
audio = AudioSegment.from_file(file_dir)

# Get the current sample rate; fall back to the rate pydub reports if ffprobe failed (-1)
current_sample_rate = get_sample_rate(file_dir)
if current_sample_rate <= 0:
    current_sample_rate = audio.frame_rate
print(f'Current sample rate: {current_sample_rate / 1000}kHz')

# Convert only if the sample rate is not 16kHz
if current_sample_rate != 16000:
    audio = audio.set_frame_rate(16000).set_channels(1)  # Ensure mono audio
    converted_path = os.path.join(audio_dir, 'converted_audio_16kHz.mp3')
    audio.export(converted_path, format='mp3')
    print('Audio converted to 16kHz.')
else:
    print('Audio is already at 16kHz. No conversion needed.')

Current sample rate: 44.1kHz
Audio converted to 16kHz.


#### Test ASR Model

In [7]:
import torch
from pydub import AudioSegment

def load_audio(audio_path: str) -> tuple[torch.Tensor, int, str]:
    '''
    Load and preprocess the audio file for Wav2Vec2 model.
    Args:
        audio_path (str): Path to the audio file (any container ffmpeg can decode).
    Returns:
        torch.Tensor: Mono waveform of shape (1, num_samples), scaled to [-1, 1].
        int: Sampling rate of the returned waveform.
        str: Path of the audio actually used: the converted 16kHz file when
             resampling was needed, otherwise the original `audio_path`.
    '''
    target_rate = 16000

    # Do not force a format: the container is detected from the file itself,
    # so WAV and MP3 inputs are both handled.
    audio = AudioSegment.from_file(audio_path)
    sample_rate = audio.frame_rate
    print('Audio Sampling Rate:', sample_rate)

    # Defaults to the input path so the no-conversion branch has a valid value
    used_path = audio_path

    # Convert to 16kHz using pydub if needed
    if sample_rate != target_rate and sample_rate > 0:
        audio = audio.set_frame_rate(target_rate).set_channels(1)  # Ensure mono and 16kHz
        os.makedirs('audio', exist_ok=True)
        converted_path = os.path.join('audio', 'converted_audio_16kHz.mp3')
        audio.export(converted_path, format='mp3')
        print('Audio converted to 16kHz.')
        used_path = converted_path
        # The in-memory segment is already resampled; it is used directly rather
        # than re-reading the lossy MP3 that was just written.

    # Integer PCM samples (interleaved frame by frame when there are several channels)
    samples = audio.get_array_of_samples()

    # Normalize to [-1, 1] using the segment's actual sample width (bytes per sample)
    full_scale = float(2 ** (8 * audio.sample_width - 1))
    waveform = torch.tensor(samples, dtype=torch.float32) / full_scale

    # Downmix to mono: one row per frame, then average across the channels
    if audio.channels > 1:
        waveform = waveform.reshape(-1, audio.channels).mean(dim=1)

    return waveform.unsqueeze(0), audio.frame_rate, used_path

# Cache of (processor, model) pairs keyed by model directory, so repeated
# transcriptions do not reload the weights from disk on every call.
_ASR_COMPONENTS = {}


def _load_asr_components(model_dir: str):
    '''Return the (processor, model) pair for `model_dir`, loading it on first use only.'''
    if model_dir not in _ASR_COMPONENTS:
        _ASR_COMPONENTS[model_dir] = (
            AutoProcessor.from_pretrained(model_dir),
            AutoModelForCTC.from_pretrained(model_dir),
        )
    return _ASR_COMPONENTS[model_dir]


def transcribe_audio(audio_path: str, model_dir: str) -> str:
    '''
    Transcribe the given audio file using a pre-trained Wav2Vec2 model.
    Args:
        audio_path (str): Path to the audio file.
        model_dir (str): Directory containing the saved model and processor.
    Returns:
        str: Transcription of the audio (empty string if the audio has no samples).
    '''
    # Load the processor and model (cached after the first call for this directory)
    processor, model = _load_asr_components(model_dir)

    # Load and preprocess the audio; the path of the audio used is not needed here
    waveform, sample_rate, _ = load_audio(audio_path)

    # Nothing to transcribe for an empty clip
    if waveform.numel() == 0:
        return ''

    # Prepare inputs for the model; squeeze(0) drops only the leading batch axis,
    # so a one-sample clip stays 1-D
    inputs = processor(waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors='pt', padding=True)

    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: take the most likely token id per frame, then decode to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]

# Model directory and the audio clips to run through the ASR model
model_dir = 'model'
audio_file_1 = 'dingSFX-1.mp3'
audio_file_2 = 'harvard.wav'

# Transcribe each clip in turn: dingSFX-1.mp3 first, then harvard.wav
for audio_file in (audio_file_1, audio_file_2):
    audio_path = os.path.join(audio_dir, audio_file)
    transcription = transcribe_audio(audio_path, model_dir)
    print(f'Transcription: {transcription}')

Audio Sampling Rate: 44100
Audio converted to 16kHz.
Transcription: 
Audio Sampling Rate: 44100
Audio converted to 16kHz.
Transcription: THE STALE SMELL OF OLD BEER LINGERS IT TAKES HEAT TO BRING OUT THE ODOUR A COLD DIP RESTORES HEALTH AND ZEST A SALT PICKLE TASTES FINE WITH HAM TACCOS AL PASTORE ARE MY FAVORITE A ZESTFUL FOOD IS THE HOT CROSS BUN
