In [None]:
import sys
sys.path.insert(0, "..")

from dotenv import load_dotenv
from stt.utils import (
    convert_to_wav,
    initialise_azure_openai_speech, 
    initialise_azure_openai_chat,
    read_text_file,
    write_to_file_with_line_breaks,
    convert_any_to_wav,
    calculate_word_error_rate
)
from stt.transcribe import (
    transcribe_audio,
)
from stt.chat import(
    response_review_transcription,
    response_repair_report
)
import os
from pydub import AudioSegment
import pandas as pd
import matplotlib.pyplot as plt
from openai import AzureOpenAI

load_dotenv()

In [None]:
# Directory holding the original recordings.
path_to_raw_files = "../data/samples/raw_m4a_files/"
# Keep the listing in a variable: the conversion loop iterates over `raw_file_names`.
raw_file_names = os.listdir(path_to_raw_files)
raw_file_names

In [None]:
# Directory that receives the converted .wav files.
path_to_processed_files = "../data/samples/converted_wav_files/"
# Create it up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(path_to_processed_files, exist_ok=True)

In [None]:
# List the recordings here so the loop does not rely on a variable that may never have been assigned.
raw_file_names = os.listdir(path_to_raw_files)
for file in raw_file_names:
    if file.startswith("."):
        # Hidden entries such as .DS_Store are not audio files.
        continue
    # splitext drops only the final extension, so dots inside the name are preserved
    # and two recordings that differ after the first dot no longer share one output file.
    file_stem = os.path.splitext(file)[0]
    convert_any_to_wav(f"{path_to_raw_files}{file}", f"{path_to_processed_files}{file_stem}.wav")

In [None]:
# Only the converted .wav files are transcribed; stray directory entries are ignored.
# Sorting makes processed_file_names[0] refer to the same file on every run.
processed_file_names = sorted(
    name for name in os.listdir(path_to_processed_files) if name.lower().endswith(".wav")
)

In [None]:
# Directory that receives the raw (uncorrected) transcripts.
path_raw_transcripts = "../data/transcripts/raw/"
# Create it up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(path_raw_transcripts, exist_ok=True)

In [None]:
# Quick check: pass the first converted file to transcribe_audio and print what it returns.
transcript_ex = transcribe_audio(f"{path_to_processed_files}{processed_file_names[0]}")
print(transcript_ex)

In [None]:
# Transcribe every converted file and store each result as "<stem>_raw_transcript.txt".
for audio_file in processed_file_names:
    audio_file_path = f"{path_to_processed_files}{audio_file}"
    transcript_ex = transcribe_audio(audio_file_path)
    # splitext removes only the final extension, so a dot inside the file name
    # no longer truncates the stem used for the transcript name.
    audio_file_stem = os.path.splitext(audio_file)[0]
    raw_transcript_path_name = f"{path_raw_transcripts}{audio_file_stem}_raw_transcript.txt"
    write_to_file_with_line_breaks(transcript_ex, raw_transcript_path_name)

In [None]:
# Path of the first entry returned by os.listdir for the raw-transcript directory.
# NOTE(review): os.listdir gives no ordering guarantee, so which file this picks may vary.
ex_raw_transcript_path = f"{path_raw_transcripts}{os.listdir(path_raw_transcripts)[0]}"


In [None]:
os.listdir(path_raw_transcripts)[0].replace('raw', 'corrected')

In [None]:
ex_raw_transcript_path

In [None]:
# Load the example raw transcript from disk.
ex_raw_transcript = read_text_file(ex_raw_transcript_path)

In [None]:
ex_raw_transcript

In [None]:
# Directory that receives the corrected transcripts.
path_corrected_transcripts = "../data/transcripts/corrected/"
# Create it up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(path_corrected_transcripts, exist_ok=True)

In [None]:
# Run the review step on the example transcript and save the result.
corrected_transcript = response_review_transcription(ex_raw_transcript)
# Derive the output name from the file that was actually read, instead of
# listing the directory a second time and assuming the first entry is unchanged.
ex_corrected_file_name = os.path.basename(ex_raw_transcript_path).replace('raw', 'corrected')
write_to_file_with_line_breaks(
    corrected_transcript,
    f"{path_corrected_transcripts}{ex_corrected_file_name}")

In [None]:
# Names of every entry in the raw-transcript directory (order as returned by os.listdir).
raw_transcript_file_names = os.listdir(path_raw_transcripts)
raw_transcript_file_names

In [None]:
# Review each raw transcript and store the result under the same file name
# with every 'raw' replaced by 'corrected'.
for raw_file_name in raw_transcript_file_names:
    raw_transcript_path = path_raw_transcripts + raw_file_name
    raw_transcript = read_text_file(raw_transcript_path)
    corrected_transcript = response_review_transcription(raw_transcript)
    corrected_file_name = raw_file_name.replace('raw', 'corrected')
    write_to_file_with_line_breaks(corrected_transcript, path_corrected_transcripts + corrected_file_name)


In [None]:
# System prompt template; `{vocabulary}` is filled in by generate_corrected_transcript.
proofreading_prompt = "You are a helpful assistant for the public transit authority HTM. Your task is to correct any spelling discrepancies in the transcribed text. Make sure that the names of the following systems and streets are spelled correctly: {vocabulary}. Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided. Add ** between every word that is corrected."

def generate_corrected_transcript(temperature, system_prompt, vocabulary, transcript, client=None):
    """Ask the chat deployment to proofread a transcript.

    Args:
        temperature: Sampling temperature forwarded to the chat completion call.
        system_prompt: Template containing a ``{vocabulary}`` placeholder.
        vocabulary: Terms to insert into the template. A list, tuple or set is
            joined into a comma-separated string so the prompt does not contain
            Python brackets and quotes; any other value is inserted as-is.
        transcript: Text to be corrected; sent as the user message.
        client: Optional chat client to reuse across calls. When omitted, a new
            one is created with ``initialise_azure_openai_chat()``.

    Returns:
        The ``content`` of the first choice in the response.
    """
    if client is None:
        client = initialise_azure_openai_chat()

    if isinstance(vocabulary, (list, tuple, set, frozenset)):
        vocabulary_text = ", ".join(str(term) for term in vocabulary)
    else:
        vocabulary_text = vocabulary

    response = client.chat.completions.create(
        # Deployment name is read from the environment at call time.
        model=os.getenv("AZURE_GPT_DEPLOYMENT_NAME"),
        temperature=temperature,
        messages=[
            {
                "role": "system",
                "content": system_prompt.format(vocabulary=vocabulary_text)
            },
            {
                "role": "user",
                "content": f"Here is the text to be corrected: {transcript}"
            },
        ],
    )
    return response.choices[0].message.content

In [None]:
# Upper bound on the number of vocabulary terms handed to the proofreading prompt.
MAX_VOCABULARY_TERMS = 10000
vocabulary = (
    pd.read_csv("../data/context/woordenlijst_werk_termen_20240304.csv", sep=";")['words']
    .dropna()       # empty cells are read as NaN and would otherwise end up in the prompt
    .astype(str)
    .tolist()[:MAX_VOCABULARY_TERMS]
)
# Show a short preview instead of dumping the whole list into the cell output.
vocabulary[:20]

In [None]:
# Use path_raw_transcripts, which an earlier cell defines; `path_to_raw_transcripts_files`
# is only assigned further down, so referencing it here fails on a fresh kernel.
transcript_ex = read_text_file(f"{path_raw_transcripts}{raw_transcript_file_names[0]}")
# Collapse every run of whitespace (including line breaks) into a single space so
# the words on either side of a line break are not glued together.
transcript_ex = " ".join(transcript_ex.split())
corrected_text = generate_corrected_transcript(0, proofreading_prompt, vocabulary, transcript_ex)
print(corrected_text)

In [None]:
# Directories compared in the evaluation below: raw transcripts and their gold counterparts.
# NOTE(review): the raw directory is the same literal as path_raw_transcripts used earlier.
path_to_raw_transcripts_files = "../data/transcripts/raw/"
path_to_gold_transcripts_files = "../data/transcripts/gold/"

In [None]:
os.listdir(path_to_raw_transcripts_files)

In [None]:
os.listdir(path_to_gold_transcripts_files)

In [None]:
# Raw and gold transcripts are paired by list position further down, so both
# listings skip hidden entries (e.g. .DS_Store) and are sorted when created.
raw_transcript_file_names = sorted(
    name for name in os.listdir(path_to_raw_transcripts_files) if not name.startswith(".")
)
gold_transcript_file_names = sorted(
    name for name in os.listdir(path_to_gold_transcripts_files) if not name.startswith(".")
)

In [None]:
# Sort both lists in place.
# NOTE(review): presumably so that index i in both lists refers to the same recording — confirm the names line up.
raw_transcript_file_names.sort()
gold_transcript_file_names.sort()


In [None]:
raw_transcript_file_names

In [None]:
gold_transcript_file_names

In [None]:
transcript_directory = "/"
# Column-wise data for the scoring table: the name prefix before '_raw' / '_gold'
# and the text of every raw and gold transcript, in list order.
transcript_scores = {
    'raw_name': [name.split('_raw')[0] for name in raw_transcript_file_names],
    'gold_name': [name.split('_gold')[0] for name in gold_transcript_file_names],
    'raw_text': [
        read_text_file(path_to_raw_transcripts_files + name)
        for name in raw_transcript_file_names
    ],
    'gold_text': [
        read_text_file(path_to_gold_transcripts_files + name)
        for name in gold_transcript_file_names
    ],
}



In [None]:
# One row per list position, with columns raw_name, gold_name, raw_text, gold_text.
transcript_scores_df = pd.DataFrame(transcript_scores)

In [None]:
# Replace line breaks with spaces in both text columns before scoring.
for col in ['raw_text', 'gold_text']:
     transcript_scores_df[col] = transcript_scores_df[col].str.replace('\n', ' ')

In [None]:
# Word error rate per row; gold_text is passed first and raw_text second.
# NOTE(review): presumably (reference, hypothesis) — confirm against calculate_word_error_rate.
transcript_scores_df['wer'] = transcript_scores_df.apply(lambda x: calculate_word_error_rate(x.gold_text, x.raw_text), axis = 1)

In [None]:
# Accuracy is defined here as the complement of the word error rate.
# NOTE(review): WER is not generally bounded by 1, so this may go negative — confirm what calculate_word_error_rate returns.
transcript_scores_df['accuracy'] = 1 - transcript_scores_df['wer']

In [None]:
# Export the scores to the current working directory, using ';' as separator and ',' as decimal mark.
transcript_scores_df.to_csv("transcript_scores_data.csv", sep=';', decimal=',')

In [None]:
# Default DataFrame plot of the scores table.
# NOTE(review): presumably only the numeric columns (wer, accuracy) are drawn, against the row index — confirm.
transcript_scores_df.plot()

In [None]:
transcript_scores_df.describe()

In [None]:
# Print the report template for inspection; the path is relative to the notebook's working directory.
print(read_text_file("../src/stt/prompts/storingstemplate.txt"))

In [None]:
# Prompt file locations; these two paths are reused by the report-generation loops below.
system_prompt_path = "../src/stt/prompts/system_prompt.txt"
human_prompt_path = "../src/stt/prompts/storingstemplate.txt"

# Load the prompt texts and one raw transcript.
# NOTE: `transcript` is reassigned by the report loop further down.
system_prompt = read_text_file(system_prompt_path)
human_prompt = read_text_file(human_prompt_path)
transcript = read_text_file("../data/transcripts/raw/1_766414_raw_transcript.txt")

In [None]:
# Generate a report for a single transcript; response_repair_report receives the
# system-prompt path, the template path and the transcript path (file paths, not file contents).
werklograppot = response_repair_report("../src/stt/prompts/system_prompt.txt",
                       "../src/stt/prompts/storingstemplate.txt",
                       "../data/transcripts/raw/1_766414_raw_transcript.txt")

In [None]:
print(werklograppot)

In [None]:
# Make sure the reports directory exists before writing; exist_ok=True keeps the cell re-runnable.
os.makedirs("../data/reports/", exist_ok=True)
write_to_file_with_line_breaks(werklograppot, "../data/reports/1_766414.txt")

In [None]:
raw_transcript_file_names

In [None]:
def create_write_report(system_prompt_path, human_prompt_path, transcript_path, report_path):
    """Generate a repair report for one transcript and write it to ``report_path``.

    The three input paths are passed unchanged to ``response_repair_report`` and
    its result is written with ``write_to_file_with_line_breaks``. The directory
    part of ``report_path`` is created first if it does not exist yet.

    Returns:
        None.
    """
    report_dir = os.path.dirname(report_path)
    if report_dir:  # a bare file name has no directory part to create
        os.makedirs(report_dir, exist_ok=True)
    report = response_repair_report(system_prompt_path, human_prompt_path, transcript_path)
    write_to_file_with_line_breaks(report, report_path)

In [None]:
# Generate and store a report for every raw transcript; the report file is named
# after the part of the transcript file name that precedes '_raw'.
for transcript in raw_transcript_file_names:
    transcript_path = f"../data/transcripts/raw/{transcript}"
    report_path = f"../data/reports/{transcript.split('_raw')[0]}.txt"
    create_write_report(system_prompt_path, human_prompt_path, transcript_path, report_path)

In [None]:
raw_transcript_file_names

In [None]:
# Same batch as the earlier report loop; the `for` keyword was missing, which made this cell a syntax error.
for transcript in raw_transcript_file_names:
    transcript_path = f"../data/transcripts/raw/{transcript}"
    report_path = f"../data/reports/{transcript.split('_raw')[0]}.txt"
    create_write_report(system_prompt_path, human_prompt_path, transcript_path, report_path)

In [None]:
# Generate the report for one specific transcript file.
transcript = '2_300099_raw_transcript.txt'
transcript_path = f"../data/transcripts/raw/{transcript}"
# The report name is the part of the transcript file name before '_raw'.
report_path = f"../data/reports/{transcript.split('_raw')[0]}.txt"
create_write_report(system_prompt_path, human_prompt_path, transcript_path, report_path)