# Playing the audio file

In [57]:
from IPython.display import Audio
from scipy.io import wavfile
import numpy as np

In [53]:
# Path of the audio clip that the following cells play back and read.
file_name = '10 sec/Apple 10 slow.wav'

In [8]:
# Display an IPython audio widget for the clip stored at `file_name`.
Audio(file_name)

In [9]:
# wavfile.read returns a (sample rate, samples) pair for the clip.
data = wavfile.read(file_name)
framerate, sounddata = data
n_samples = len(sounddata)

# Time offset of every sample: sample index divided by the sample rate.
time = np.arange(n_samples) / framerate

print('Sample rate:',framerate,'Hz')
print('Total time:',n_samples/framerate,'s')

Sample rate: 44100 Hz
Total time: 2.8792743764172335 s


# Generating Text from Audio

In [58]:
import soundfile as sf
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [59]:
# Load the tokenizer and the CTC model from the "facebook/wav2vec2-base-960h" checkpoint.
# NOTE(review): the output of this cell warns that the checkpoint's tokenizer class is
# 'Wav2Vec2CTCTokenizer' while it is loaded here through 'Wav2Vec2Tokenizer' — confirm
# that this class is the intended one.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
import pandas
# Table of clips to transcribe; later cells read its `file_name` and `sentence` columns.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, presumably an index
# that was written into the CSV — confirm whether it should be read as the index.
data=pandas.read_csv('apple_audio.csv')

In [80]:
def travis(row):
    """Transcribe the audio file referenced by one row of the DataFrame.

    Parameters
    ----------
    row : pandas.Series
        A row exposing a ``file_name`` attribute with the path of the audio file.

    Returns
    -------
    str
        The first decoded string for the clip, obtained by taking the
        highest-scoring token id at every position of the model output.
    """
    # sr=16000 resamples the clip to 16 kHz while loading
    # (sr=None would keep the file's original sampling rate instead).
    input_audio, _ = librosa.load(row.file_name, sr=16000)
    input_values = tokenizer(input_audio, return_tensors="pt").input_values
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)[0]


# Collect the transcriptions from the return values, one per row and in row order,
# so the list cannot drift out of step with the DataFrame through hidden side effects.
transcript = [travis(row) for _idx, row in data.iterrows()]

data['transcription']=transcript
data

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
dtype: object

In [82]:
def sentence_accuracy(text1, text2):
    """Return the result of the exact-equality comparison ``text1 == text2``."""
    is_exact_match = text1 == text2
    return is_exact_match

# Exact-match result for every (reference sentence, transcription) pair
accuracies = [
    sentence_accuracy(reference, hypothesis)
    for reference, hypothesis in zip(data['sentence'], data['transcription'])
]

# Keep the per-row results next to the texts they were computed from
data['sentence_level_accuracy'] = accuracies

# Mean of the per-row results over the whole DataFrame
average_accuracy = data['sentence_level_accuracy'].mean()

# Report the mean as a percentage
print(f"Average Accuracy: {average_accuracy:.2%}")

Average Accuracy: 35.00%


In [83]:
from difflib import SequenceMatcher

# Similarity of two texts, measured with difflib's SequenceMatcher ratio
def word_accuracy(text1, text2):
    """Return the ``SequenceMatcher`` similarity ratio of ``text1`` and ``text2``.

    The arguments are passed to ``SequenceMatcher`` unchanged, so two strings
    are compared element by element (character by character); no junk filter
    is applied. The ratio is a float between 0 and 1, and is 1.0 when the two
    sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=text1, b=text2)
    return matcher.ratio()

# Similarity ratio for every (reference sentence, transcription) pair
accuracies = [
    word_accuracy(reference, hypothesis)
    for reference, hypothesis in zip(data['sentence'], data['transcription'])
]

# Keep the per-row ratios next to the texts they were computed from
data['word_level_accuracy'] = accuracies

# Mean of the per-row ratios over the whole DataFrame
average_accuracy = data['word_level_accuracy'].mean()

# Report the mean as a percentage
print(f"Average Accuracy: {average_accuracy:.2%}")


Average Accuracy: 88.27%


In [84]:
# Show the DataFrame, now including the transcription and both accuracy columns.
data

Unnamed: 0,file_name,sentence,transcription,sentence_level_accuracy,word_level_accuracy
0,10 sec/Apple 01 slow.wav,AN APPLE A DAY KEEPS THE DOCTOR AWAY,AN APPLE ER DAY GEEPS THE DOCTOR AWAY,False,0.931507
1,10 sec/Apple 02 slow.wav,APPLE SEEDS HAVE A TINY BIT OF POISON,APPLE SEDS HAVE A TINY BEAUTOR POISON,False,0.891892
2,10 sec/Apple 03 slow.wav,APPLE SEEDS SHOULD NOT BE EATEN,APPLE SATES SHOULD NOT BE EATEN,False,0.935484
3,10 sec/Apple 04 slow.wav,APPLE USUALLY GROWS IN HILLY AREAS,APPLE USUALLY GROW IN HILIIREAS,False,0.892308
4,10 sec/Apple 05 slow.wav,NEWTON SHOULD CATCH THAT APPLE,NU DON SOULD CATCH THAT APPLE,False,0.881356
5,10 sec/Apple 06 slow.wav,TODAY IS SUNDAY,TWO DAY IS SUNDAY,False,0.9375
6,10 sec/Apple 07 slow.wav,I AM TOO ABSENT,I AM TOO ABSENT,True,1.0
7,10 sec/Apple 08 slow.wav,DO NOT TRY THIS ANYWHERE,DO NOT TRY THIS ANYWHERE,True,1.0
8,10 sec/Apple 09 slow.wav,THIS IS NICE,THIS IS NICE,True,1.0
9,10 sec/Apple 10 slow.wav,I WOULD LOVE TO CHANGE THE WORLD,I WOULD LOVE TO CHANGE THE WORLD,True,1.0
