# Importing the Relevant Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import librosa
import librosa.display
import soundfile as sf
import speech_recognition as sr

from jiwer import wer, cer
from IPython.display import Audio

import whisper

import csv
import os
import tempfile
import wave

from gtts import gTTS

In [None]:
import warnings

# Keep the notebook output readable by hiding FutureWarning messages.
warnings.simplefilter("ignore", category=FutureWarning)

# Loading and Visualizing an Audio File

In [None]:
# Load the recording as a waveform array plus its sampling rate.
# sr=None keeps the file's native sampling rate instead of resampling it.
audio_signal, sample_rate = librosa.load('speech_01.wav', sr=None)

In [None]:
# Display the sampling rate returned by librosa.load for this file
sample_rate

In [None]:
# Draw the waveform on an explicit Axes object
fig, ax = plt.subplots(figsize=(12, 4))
librosa.display.waveshow(audio_signal, sr=sample_rate, ax=ax)
ax.set_title('Waveform')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
plt.show()

# The trailing expression embeds an audio player in the notebook output
Audio('speech_01.wav')

# Transcribing Audio with Google Web Speech API

In [None]:
# Recognizer instance shared by the transcribe_audio helper defined in this notebook
recognizer = sr.Recognizer()

In [None]:
# Path of the recording that is transcribed and evaluated in the following cells
file_path = 'speech_01.wav'

In [None]:
def transcribe_audio(file_path):
    """Transcribe an audio file with the Google Web Speech API.

    The whole file is read through the module-level ``recognizer`` and passed
    to ``recognize_google``. The recognized text is printed and returned.

    Args:
        file_path: Path of the audio file to open with ``sr.AudioFile``.

    Returns:
        The text returned by ``recognizer.recognize_google``.
    """
    # Only reading the audio needs the file handle; release it before the
    # recognition request is made.
    with sr.AudioFile(file_path) as audio_source:
        recording = recognizer.record(audio_source)

    transcript = recognizer.recognize_google(recording)
    print(transcript)
    return transcript


transcribed_text = transcribe_audio(file_path)

In [None]:
# Reference transcript used to score the recognizers.
# The literal is wrapped over several source lines for readability only, so
# all whitespace runs (including the line breaks) are collapsed to single
# spaces. Otherwise words on either side of a line break, e.g.
# "scientist,\ncurious", are not separated by a space and can be scored as
# one token.
# NOTE(review): jiwer's default word splitting is believed to use " " as the
# delimiter; confirm against the installed jiwer version.
ground_truth = " ".join(
    """My name is Ivan and I am excited to have you as part of our learning community!
Before we get started, I’d like to tell you a little bit about myself. I’m a sound engineer turned data scientist,
curious about machine learning and Artificial Intelligence. My professional background is primarily in media production,
with a focus on audio, IT, and communications""".split()
)

In [None]:
# Score the Google Web Speech transcript against the reference text.
# Both metrics take (reference, hypothesis) in that order.
calculated_wer, calculated_cer = (
    metric(ground_truth, transcribed_text) for metric in (wer, cer)
)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

# Background Noise and Spectrograms

In [None]:
# Draw the waveform on an explicit Axes object
fig, ax = plt.subplots(figsize=(12, 4))
librosa.display.waveshow(audio_signal, sr=sample_rate, ax=ax)
ax.set_title('Waveform')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
plt.show()

# The trailing expression embeds an audio player in the notebook output
Audio('speech_01.wav')

In [None]:
# Compute the short-time Fourier transform (STFT) of the waveform.
# Its magnitude is converted to dB and plotted as the spectrogram below.
S = librosa.stft(audio_signal)

In [None]:
# Convert the STFT magnitudes to decibels, using the largest magnitude
# (ref=np.max) as the reference level.
S_dB = librosa.amplitude_to_db(abs(S), ref=np.max)

In [None]:
# Sanity check: because the dB scale is referenced to the maximum magnitude,
# the peak value is expected to be 0 dB.
np.max(S_dB)

In [None]:
# Render the dB-scaled spectrogram with a logarithmic frequency axis
fig, ax = plt.subplots(figsize=(12, 4))
mesh = librosa.display.specshow(
    data=S_dB, sr=sample_rate, x_axis='time', y_axis='log', ax=ax
)
fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
ax.set_title('Spectrogram')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Apply a pre-emphasis filter (coef=0.97) to the waveform and save the result
# as a separate WAV file at the original sampling rate.
output_file = 'filtered_speech_01.wav'
signal_filtered = librosa.effects.preemphasis(audio_signal, coef=0.97)
sf.write(output_file, signal_filtered, sample_rate)

In [None]:
# Play the original audio file.
# The trailing Audio(...) expression is what the notebook renders as a player.
print("Playing original audio:")
Audio(file_path)

In [None]:
# Play the filtered audio file.
# NOTE: output_file is reassigned to "transcriptions.csv" further down in this
# notebook, so re-running this cell after the CSV export would no longer
# point at the filtered WAV.
print("Playing filtered audio:")
Audio(output_file)

In [None]:
# Compute the short-time Fourier transform (STFT) of the pre-emphasized signal
Sb = librosa.stft(signal_filtered)

In [None]:
# Convert the filtered signal's STFT magnitudes to decibels, referenced to
# their own maximum (ref=np.max).
S_dBb = librosa.amplitude_to_db(abs(Sb), ref=np.max)

In [None]:
# Render the spectrogram of the pre-emphasized signal (log frequency axis)
fig, ax = plt.subplots(figsize=(12, 4))
mesh = librosa.display.specshow(
    data=S_dBb, sr=sample_rate, x_axis='time', y_axis='log', ax=ax
)
fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
ax.set_title('Spectrogram')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Transcribe the pre-emphasized recording with the same Google-based helper
transcribed_text_preemphasis = transcribe_audio('filtered_speech_01.wav')

In [None]:
# Score the transcript of the pre-emphasized audio against the reference text.
# Both metrics take (reference, hypothesis) in that order.
calculated_wer, calculated_cer = (
    metric(ground_truth, transcribed_text_preemphasis) for metric in (wer, cer)
)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

# Transcribing Audio with OpenAI's Whisper

In [None]:
# Load the Whisper model named "base"; it is reused by every Whisper
# transcription call in this notebook.
model = whisper.load_model("base")

In [None]:
# Transcribe the original recording; the returned mapping is read via its
# "text" and "language" keys in the following cells.
result = model.transcribe(file_path)

In [None]:
# Extract the transcript from the Whisper result and display it
transcribed_text_whisper = result["text"]
transcribed_text_whisper

In [None]:
# Display the language entry of the Whisper result
result["language"]

In [None]:
# Score the Whisper transcript against the reference text.
# Both metrics take (reference, hypothesis) in that order.
calculated_wer, calculated_cer = (
    metric(ground_truth, transcribed_text_whisper) for metric in (wer, cer)
)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

# Transcribing Multiple Audio Files from a Directory

In [None]:
# Folder whose .wav files are batch-transcribed below.
# NOTE: this is a machine-specific absolute path; adjust it for your environment.
directory_path = "C:/Users/PC/Downloads/Speech Recognition/Recordings"

In [None]:
def transcribe_directory_whisper(directory_path, whisper_model=None):
    """Transcribe every WAV file directly inside a directory with Whisper.

    Args:
        directory_path: Folder to scan. Sub-folders are not searched.
        whisper_model: Optional object exposing ``transcribe(path)`` that
            returns a mapping with a ``"text"`` key. When omitted, the
            module-level ``model`` is used.

    Returns:
        A list of ``{"file_name": ..., "transcription": ...}`` dicts, one per
        WAV file, ordered by file name.
    """
    active_model = model if whisper_model is None else whisper_model

    transcriptions = []
    # os.listdir returns entries in arbitrary order. Sorting makes the result
    # (and any numbering derived from its position) reproducible across runs.
    for file_name in sorted(os.listdir(directory_path)):
        files_path = os.path.join(directory_path, file_name)
        # Accept ".wav" in any letter case, and skip directories whose name
        # merely ends in ".wav".
        if not file_name.lower().endswith(".wav"):
            continue
        if not os.path.isfile(files_path):
            continue
        # Transcribe the audio file
        result = active_model.transcribe(files_path)
        transcriptions.append(
            {"file_name": file_name, "transcription": result["text"]}
        )
    return transcriptions

In [None]:
# Run the batch transcription over the recordings folder
transcriptions = transcribe_directory_whisper(directory_path)

In [None]:
# Display the list of {"file_name", "transcription"} records
transcriptions

# Saving Audio Transcriptions to CSV for Easy Analysis

In [None]:
output_file = "transcriptions.csv"

# Write one row per transcribed file, numbered from 1.
# encoding is set explicitly: without it open() uses the locale's default
# encoding, which on some systems cannot represent every character a
# transcript may contain and would raise UnicodeEncodeError.
# newline='' lets the csv module control line endings itself.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Track Number", "File Name", "Transcription"])  # Write the header
    writer.writerows(
        [number, transcription['file_name'], transcription['transcription']]
        for number, transcription in enumerate(transcriptions, start=1)
    )

# Text-to-Speech

In [None]:
text = """Thank you for taking the time to watch our course on speech recognition!
This concludes the final lesson of this section. See you soon!"""

tts_output_path = "output.mp3"

# Synthesize the text as English speech and save it as an MP3 file
tts = gTTS(text=text, lang='en')
tts.save(tts_output_path)

# Play the result. The `start` shell command is Windows-only, so open the file
# with os.startfile where it exists (same effect as `start`, without spawning
# a shell) and otherwise embed a player in the notebook output.
if hasattr(os, "startfile"):
    os.startfile(tts_output_path)
else:
    from IPython.display import display

    display(Audio(tts_output_path))