## Hotword detection

Here, we perform hotword detection on the `cv-valid-dev` split of the Common Voice dataset using our finetuned model. For the transcription step, we follow the run-through provided in the sister notebook `asr_project/asr-train/cv-train-2a.ipynb`.

In [1]:
import os
import re
import numpy as np
import pandas as pd

import torch
import torchaudio

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset, Audio

# Home directory of the current user; the project paths built in the cells
# below are all joined onto this directory.
HOME_DIR = os.path.expanduser('~')


  from .autonotebook import tqdm as notebook_tqdm


## Table of Contents

1. [Transcription](#Transcription)
2. [Hotword Detection](#Hotword-Detection)

### Transcription

First, we load the finetuned model and the associated processor.

In [2]:
# The processor is taken from the base `facebook/wav2vec2-large-960h`
# checkpoint, while the CTC model weights are read from the local
# finetuning output directory.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

tuned_model_path = os.path.join(HOME_DIR, 'asr_project/asr-train/wav2vec2-large-960h-cv')
model = Wav2Vec2ForCTC.from_pretrained(tuned_model_path)

Then, we reproduce the functions developed in `cv-train-2a.ipynb` here.

In [3]:
# Helpers
def map_to_result(batch):
    """Transcribe one example with the global `model` and decode its labels.

    Args:
        batch: A single dataset example. `batch["input_values"]` holds the
            processed waveform values and `batch["labels"]` the label token
            ids.

    Returns:
        The same `batch`, with two entries added/overwritten:
        `"pred_str"` (greedy argmax decoding of the model logits) and
        `"text"` (the labels decoded with `group_tokens=False`).
    """
    # Build the input on whichever device the model currently lives on, so
    # the function also works when the model was left on the CPU.
    input_values = torch.tensor(
        batch["input_values"], device=model.device
    ).unsqueeze(0)  # add the batch dimension expected by the model

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the most likely token at every frame.
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]

    # Labels are decoded without grouping repeated tokens.
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

# Function to convert mp3 to wav
def convert_mp3_to_wav(mp3_file):
    """Return the path of the wav counterpart of `mp3_file`, creating it if needed.

    Only the final file extension is swapped for `.wav`; directory names that
    happen to contain `.mp3` are left untouched, and the extension match does
    not depend on letter case.

    Args:
        mp3_file: Path to the source audio file.

    Returns:
        Path of the wav file located next to the source file.
    """
    # Swap just the trailing extension instead of every '.mp3' substring
    root, _ = os.path.splitext(mp3_file)
    wav_file = root + '.wav'

    # Convert only when the wav file is not already on disk
    if not os.path.exists(wav_file):
        waveform, sample_rate = torchaudio.load(mp3_file)
        torchaudio.save(wav_file, waveform, sample_rate)

    return wav_file

# Special tokens wrapped around / inserted into every transcript
start_token = "<s>"
end_token = "</s>"
word_delimiter_token = "|"


def preprocess_transcript(batch):
    """Turn `batch['text']` into a token-delimited, tagged transcript.

    Runs of whitespace are collapsed to one separator, every separator is
    written as `word_delimiter_token`, and the result is wrapped in
    `start_token` / `end_token`.

    Returns:
        A dict with the single key `"processed_text"`.
    """
    collapsed = re.sub(r'\s+', ' ', batch['text'])

    # Splitting on the single space and re-joining with the delimiter is the
    # same as replacing each space by the delimiter.
    delimited = word_delimiter_token.join(collapsed.split(" "))

    return {"processed_text": start_token + delimited + end_token}

def prepare_dataset(batch):
    """Run the global `processor` over one example's audio and transcript.

    `batch["input_values"]` is read as a mapping whose `"array"` entry holds
    the waveform; it is replaced by the first item of the processor's
    `input_values`. `batch["labels"]` (text) is replaced by the processor's
    `input_ids` for that text.

    Returns:
        The same `batch`, updated in place.
    """
    # Audio: waveform -> model input values (processed at 16 kHz)
    waveform = batch["input_values"]["array"]
    audio_features = processor(waveform, sampling_rate=16000)
    batch["input_values"] = audio_features.input_values[0]

    # Text: transcript -> label token ids
    text_features = processor(text=batch["labels"])
    batch["labels"] = text_features.input_ids

    return batch

def remove_start_end_tags_batch(batch):
    """Strip a leading `<s>` and a trailing `</s>` from the decoded strings.

    Both `batch["pred_str"]` and `batch["text"]` are cleaned in place; tags
    appearing anywhere else in the string are left alone.

    Returns:
        The same `batch` object.
    """
    for column in ("pred_str", "text"):
        batch[column] = re.sub(r"^<s>|</s>$", "", batch[column])
    return batch


Next, we make sure that the audio files and text transcripts are in the formats expected by the model: the mp3 clips are converted to wav, and the transcripts are upper-cased.

In [4]:
# Dev-set locations: source audio and transcripts, plus the csv files
# written by this notebook
audio_or_dir = os.path.join(HOME_DIR, 'asr_project/common_voice/cv-valid-dev/')
audio_dir = os.path.join(HOME_DIR, 'asr_project/common_voice/cv-valid-dev/cv-valid-dev/')
audioloc_transcript_or_dir = os.path.join(HOME_DIR, 'asr_project/common_voice/cv-valid-dev.csv')
audioloc_transcript_dir = os.path.join(HOME_DIR, 'asr_project/asr-train/selected_transcript-dev.csv')
temp_dir = os.path.join(HOME_DIR, 'asr_project/asr-train/temp-dev.csv')

df = pd.read_csv(audioloc_transcript_or_dir)

# Make sure each clip has a wav copy, then keep only the wav file name
wav_paths = [
    convert_mp3_to_wav(os.path.join(audio_or_dir, mp3_name))
    for mp3_name in df['filename']
]
df['filename'] = [os.path.basename(wav_path) for wav_path in wav_paths]

# Upper-case the transcripts to match the pre-finetuned model
df['text'] = df['text'].str.upper()

# Persist just the file name / transcript pairs
df_transcript = df[['filename', 'text']]
df_transcript.to_csv(audioloc_transcript_dir, index=False)

Following that, we load the transcripts and audio files, preprocessing as needed before inference.

In [5]:
# Read the csv of wav file names, expand each name to a full path under
# audio_dir, and write the result to a temporary csv that the dataset
# loader reads below.
df = pd.read_csv(audioloc_transcript_dir)
df['filename'] = df['filename'].map(lambda x: os.path.join(audio_dir,x))
df.to_csv(temp_dir,index=False)

# Only one csv file is loaded here; it is requested as the 'train' split.
dataset = load_dataset('csv', data_files=temp_dir, split='train')
dataset = dataset.cast_column("filename",
                              Audio(sampling_rate=16000))         # Treat the paths as audio, decoded at a 16 kHz sampling rate

# Special tokens, following the style of the facebook/wav2vec2-large-960h model.
# NOTE(review): these repeat the values already set next to
# preprocess_transcript; keep both places in sync.
start_token = "<s>"
end_token = "</s>"
word_delimiter_token = "|"

# Tag and delimit the transcripts (adds 'processed_text', drops 'text'),
# then rename the columns to the names prepare_dataset reads.
dataset = dataset.map(preprocess_transcript, remove_columns=["text"],num_proc=4)
dataset = dataset.rename_column("filename", "input_values")
dataset = dataset.rename_column("processed_text", "labels")

# Run the processor over the audio and the transcripts of the dataset
dataset = dataset.map(prepare_dataset, num_proc=2)

Generating train split: 4076 examples [00:00, 621152.60 examples/s]
Map (num_proc=4): 100%|██████████| 4076/4076 [00:00<00:00, 35817.58 examples/s]
Map (num_proc=2): 100%|██████████| 4076/4076 [00:04<00:00, 901.76 examples/s] 


Finally, we run inference using the finetuned model loaded earlier.

In [6]:
# Run on the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

# Transcribe every example (dropping the input columns), then strip the
# <s> / </s> tags from the predicted and reference strings
results = dataset.map(
    map_to_result,
    remove_columns=dataset.column_names,
).map(remove_start_end_tags_batch)

Map: 100%|██████████| 4076/4076 [01:41<00:00, 40.23 examples/s]
Map: 100%|██████████| 4076/4076 [00:00<00:00, 55859.00 examples/s]


We save a copy of the transcribed texts along with the filenames for later use.

In [None]:
# Pair each predicted transcription with the file name it belongs to
predictions = results.to_pandas()['pred_str']
df_files = pd.read_csv(audioloc_transcript_dir)   # source of the file names
df = pd.concat([df_files['filename'], predictions], axis=1)

# The csv lists the converted .wav names; report the original .mp3 names
df['filename'] = [name.replace('.wav','.mp3') for name in df['filename']]

# Write the file name / transcription table to csv
new_transcription_path = os.path.join(HOME_DIR, 'asr_project/hotword-detection/new_transcription.csv')
df.to_csv(new_transcription_path, index=False)

### Hotword Detection

Here, we check for the presence of the following hotwords: "be careful", "destroy" and "stranger". We first load the transcribed text. We find a broken audio clip which yielded no transcription and drop it from the set.

In [8]:
# Read back the saved transcriptions
new_transcription_path = os.path.join(HOME_DIR, 'asr_project/hotword-detection/new_transcription.csv')
df_raw = pd.read_csv(new_transcription_path)

# Lower-case the predictions for readability
df_raw = df_raw.assign(pred_str=df_raw['pred_str'].str.lower())
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4076 entries, 0 to 4075
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  4076 non-null   object
 1   pred_str  4075 non-null   object
dtypes: object(2)
memory usage: 63.8+ KB


In [9]:
# Discard rows that contain a missing value
df_raw = df_raw.dropna()
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4075 entries, 0 to 4075
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  4075 non-null   object
 1   pred_str  4075 non-null   object
dtypes: object(2)
memory usage: 95.5+ KB


Then, we search through each transcription and keep the entries that contain at least one of the hotwords.

In [10]:
df = df_raw.copy()
hotwords = ['be careful', 'destroy', 'stranger']

# Flag a transcription when it contains at least one hotword. This is plain
# substring matching, so e.g. "destroyed" counts as a hit for "destroy".
# One pass over the column replaces the row-by-row iterrows/.loc updates.
df['hotword_present'] = pd.Series(
    [
        any(hotword in transcription for hotword in hotwords)
        for transcription in df['pred_str']
    ],
    index=df.index,
    dtype=bool,
)

# Keep only the flagged rows; their original index labels are preserved
df = df.loc[df['hotword_present'],:]
df

Unnamed: 0,filename,pred_str,hotword_present
0,sample-000000.mp3,be careful with your prognostications said the...,True
3,sample-000003.mp3,i felt that everything i owned would be destroyed,True
89,sample-000089.mp3,the stranger seemed satisfied ith the answer,True
508,sample-000508.mp3,i had to test your courage the stranger said,True
674,sample-000674.mp3,i had to test your courage the stranger said,True
1093,sample-001093.mp3,be careful with your prognostications said the...,True
1101,sample-001101.mp3,the stranger was speaking of things that very ...,True
1243,sample-001243.mp3,the stranger was speaking of things that very ...,True
1501,sample-001501.mp3,i had to test your courage the stranger said,True
1933,sample-001933.mp3,the stranger seemed satisfied with the answer,True


In [11]:
# Quick check to ensure that stray whitespace from transcription typos did
# not result in missed entries: all whitespace is removed from both the
# transcriptions and the hotwords before matching.
# NOTE: removing whitespace can also join neighbouring words into a hotword,
# so any extra hit compared with the spaced search should be inspected by hand.
df_check = df_raw.copy()
hotwords = ['becareful', 'destroy', 'stranger']
df_check['pred_str'] = df_check['pred_str'].str.replace(r'\s+', '', regex=True)

# Search the whitespace-stripped text of *every* row of df_check (not just
# the rows an earlier search already flagged).
df_check['hotword_present'] = pd.Series(
    [
        any(hotword in transcription for hotword in hotwords)
        for transcription in df_check['pred_str']
    ],
    index=df_check.index,
    dtype=bool,
)

df_check = df_check.loc[df_check['hotword_present'],:]
df_check

Unnamed: 0,filename,pred_str,hotword_present
0,sample-000000.mp3,becarefulwithyourprognosticationssaidthestranger,True
3,sample-000003.mp3,ifeltthateverythingiownedwouldbedestroyed,True
89,sample-000089.mp3,thestrangerseemedsatisfiediththeanswer,True
508,sample-000508.mp3,ihadtotestyourcouragethestrangersaid,True
674,sample-000674.mp3,ihadtotestyourcouragethestrangersaid,True
1093,sample-001093.mp3,becarefulwithyourprognosticationssaidthestranger,True
1101,sample-001101.mp3,thestrangerwasspeakingofthingsthatveryfewpeopl...,True
1243,sample-001243.mp3,thestrangerwasspeakingofthingsthatveryfewpeopl...,True
1501,sample-001501.mp3,ihadtotestyourcouragethestrangersaid,True
1933,sample-001933.mp3,thestrangerseemedsatisfiedwiththeanswer,True


We save the mp3s with hotwords in `detected.txt` (__task 5a__).

In [12]:
# One detected file name per line
detected_path = os.path.join(HOME_DIR, 'asr_project/hotword-detection/detected.txt')
with open(detected_path, 'w') as out_file:
    out_file.writelines(name + '\n' for name in df['filename'].to_list())