## Fetching the Dataset

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"anujanjirwala","key":"d455ae22d205af474a7016f08f806735"}') 

!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the rtatman/speech-accent-archive dataset with the Kaggle CLI.
!kaggle datasets download -d rtatman/speech-accent-archive
# Extract quietly (-q), overwriting existing files without prompting (-o), into /content/data.
!unzip -qo 'speech-accent-archive.zip' -d '/content/data'

## Load the pretrained DeepSpeech Model

In [None]:
# Download acoustic models
!wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.tflite -nv
!wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm -nv

!pip install deepspeech
!pip install pydub

In [None]:
import deepspeech

# Location of the acoustic model file (.pbmm) to load.
acoustic_model_path = '/content/deepspeech-0.9.3-models.pbmm'

# Build the DeepSpeech model object used for every transcription below.
model = deepspeech.Model(acoustic_model_path)


In [None]:
from pydub import AudioSegment

# Decode one sample recording (MP3) into a pydub AudioSegment.
sample_recording_path = '/content/data/recordings/recordings/afrikaans1.mp3'
audio = AudioSegment.from_file(sample_recording_path, format='mp3')


In [None]:
import numpy as np

# Resample to 16 kHz, downmix to mono and force 2-byte samples, i.e. 16-bit PCM.
# Setting the sample width explicitly guarantees the samples fit the int16
# array built below; without it a source decoded at another bit depth would be
# cast to int16 silently. For audio that is already 16-bit this is a no-op.
audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

# Samples as a NumPy int16 array (not raw bytes), the form passed to model.stt().
input_portion = np.array(audio.get_array_of_samples(), dtype=np.int16)

In [None]:
# Run DeepSpeech speech-to-text on the int16 sample array and print the transcript.
text = model.stt(input_portion)
print(text)


## Define the two metrics

In [None]:
!pip install jiwer

In [None]:
import jiwer

In [None]:
# Read the reference reading passage that every speaker recites.
# The context manager closes the file handle, which the previous bare open()
# never did; read() yields the same text as concatenating the lines one by one.
with open("/content/data/reading-passage.txt") as passage_file:
    target = passage_file.read()

# Normalisation pipeline applied to both the reference passage and every
# transcription before scoring. The steps run in the order listed.
normalization_steps = [
    # Case-insensitive comparison.
    jiwer.ToLowerCase(),
    # Turn every whitespace character into a plain space ...
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    # ... and collapse runs of spaces into one.
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemovePunctuation(),
    jiwer.Strip(),
    # Spell out the digits 6, 5 and 3 as words.
    jiwer.SubstituteRegexes({r"6": r"six", r"5": r"five", r"3": r"three"}),
    # Final form expected by the jiwer measures: a list of word lists.
    jiwer.ReduceToListOfListOfWords(),
]
text_transformation = jiwer.Compose(normalization_steps)

def wer(output):
    """Return jiwer's word error rate of `output` against the reference `target`.

    Both the reference passage and `output` are normalised with
    `text_transformation` before they are compared.
    """
    shared_transforms = {
        "truth_transform": text_transformation,
        "hypothesis_transform": text_transformation,
    }
    return jiwer.wer(target, output, **shared_transforms)
  
def wil(output):
    """Return jiwer's word information lost of `output` against the reference `target`.

    Both the reference passage and `output` are normalised with
    `text_transformation` before they are compared.
    """
    shared_transforms = {
        "truth_transform": text_transformation,
        "hypothesis_transform": text_transformation,
    }
    return jiwer.wil(target, output, **shared_transforms)


## Transcribe the recordings and record the metrics

In [None]:
# pandas loads speakers_all.csv; tqdm draws the progress bar
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load the speaker metadata; one row per speaker.
speakers = pd.read_csv('/content/data/speakers_all.csv')

# Every speaker starts at 1.0 for both metrics; rows whose recording is flagged
# as missing are never transcribed and therefore keep this placeholder.
# NOTE(review): a placeholder of 1.0 cannot be told apart from a genuinely
# scored 1.0 in results.csv -- consider NaN if downstream analysis averages
# these columns.
speakers['wer'] = 1.0
speakers['wil'] = 1.0

# The context manager closes the progress bar even when a recording fails to
# decode or transcribe part-way through the loop.
with tqdm(total=len(speakers), dynamic_ncols=True, leave=False, position=0) as batch_bar:
    for index, row in speakers.iterrows():
        if not row['file_missing?']:
            recording_path = '/content/data/recordings/recordings/' + row['filename'] + '.mp3'
            audio = AudioSegment.from_file(recording_path, format='mp3')
            # 16 kHz, mono, 2-byte samples: the explicit sample width guarantees
            # the data really fits the int16 array handed to the model.
            pcm_16k_mono = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
            samples = np.array(pcm_16k_mono.get_array_of_samples(), dtype=np.int16)
            transcription = model.stt(samples)
            speakers.at[index, 'wer'] = wer(transcription)
            speakers.at[index, 'wil'] = wil(transcription)
        # Advance once per speaker, whether or not a recording was scored.
        batch_bar.update()

# Persist the metadata together with the two new metric columns.
speakers.to_csv('results.csv')
