In [2]:
import sentencepiece as spm
import librosa
import numpy as np
from pydub import AudioSegment
import speech_recognition as sr
import os
from pydub import AudioSegment

In [None]:
## Transcribe every MP3 in the training folder and save one text file per recording.
recognizer = sr.Recognizer()

# Input and output locations.
audio_folder = "D:/coding/ai_assistant/training_data"
transcription_folder = "D:/coding/ai_assistant/transcriptions"

# Match the extension case-insensitively (".MP3" is common on Windows) and sort
# so the processing order does not depend on the filesystem.
audio_files = sorted(
    f for f in os.listdir(audio_folder) if f.lower().endswith(".mp3")
)

# open() below does not create missing directories, so make sure the target exists.
os.makedirs(transcription_folder, exist_ok=True)

for audio_file in audio_files:
    audio_path = os.path.join(audio_folder, audio_file).replace("\\", "/")

    # Convert MP3 to WAV for the speech_recognition library.
    # splitext swaps only the trailing extension; str.replace would also rewrite
    # any earlier ".mp3" that happens to occur in the path.
    wav_path = os.path.splitext(audio_path)[0] + ".wav"
    audio_segment = AudioSegment.from_mp3(audio_path)
    # export() hands back the open output file; close it so the handle is not leaked.
    audio_segment.export(wav_path, format="wav").close()

    # Read the whole WAV file into memory as recognizer input.
    with sr.AudioFile(wav_path) as source:
        audio_data = recognizer.record(source)

    # Transcribe offline with CMU Sphinx. One unintelligible recording should not
    # abort the rest of the batch, so report it and move on.
    try:
        transcription = recognizer.recognize_sphinx(audio_data)
    except sr.UnknownValueError:
        print(f"Could not transcribe {audio_file}: speech was unintelligible, skipping")
        continue

    # Save the transcription to a text file named after the source recording.
    transcription_file_name = f"transcription_{audio_file}.txt"
    transcription_file_path = os.path.join(transcription_folder, transcription_file_name)

    # Explicit encoding keeps the files identical regardless of the OS default code page.
    with open(transcription_file_path, "w", encoding="utf-8") as file:
        file.write(transcription)

    print(f"Transcription {audio_file} saved to {transcription_file_path}")

Transcription twentyyearsonhorseback_00_weekley_64kb.mp3 saved to D:/coding/ai_assistant/transcriptions\transcription_twentyyearsonhorseback_00_weekley_64kb.mp3.txt
Transcription twentyyearsonhorseback_01_weekley_64kb.mp3 saved to D:/coding/ai_assistant/transcriptions\transcription_twentyyearsonhorseback_01_weekley_64kb.mp3.txt
Transcription twentyyearsonhorseback_02_weekley_64kb.mp3 saved to D:/coding/ai_assistant/transcriptions\transcription_twentyyearsonhorseback_02_weekley_64kb.mp3.txt


In [None]:
## Take the audio and its transcription: slice the audio by the alignments and
## tokenize the matching transcription lines.

# Step 2: Text-Audio Alignment (using Aeneas)
# `alignments` is NOT computed in this cell. It must already exist as a list of
# tuples holding the start and end time (in milliseconds) of each audio chunk:
# alignments = [(start1, end1), (start2, end2), ...]
# Check for it up front so a fresh kernel fails immediately with a clear message
# instead of after the SentencePiece training below has already run.
if "alignments" not in globals():
    raise NameError(
        "name 'alignments' is not defined: create the list of (start_ms, end_ms) "
        "tuples (e.g. from the Aeneas alignment step) before running this cell"
    )

audio = AudioSegment.from_file("audiobook.mp3")

# transcription.txt contains the corresponding text for each audio chunk:
# each line corresponds to one audio chunk.
transcription_path = "transcription.txt"

# Step 3: Train SentencePiece model
spm.SentencePieceTrainer.train(
    f"--input={transcription_path} --model_prefix=m --vocab_size=2000"
)
sp = spm.SentencePieceProcessor()
sp.load("m.model")

# Step 4: Preprocess Data
# Slicing the segment with [start:end] uses the millisecond offsets from `alignments`.
audio_chunks = [audio[start:end] for start, end in alignments]

# Read the text data with an explicit UTF-8 encoding so the lines decoded here
# match what SentencePiece read from the same file, whatever the OS default is.
with open(transcription_path, "r", encoding="utf-8") as file:
    texts = [line.strip() for line in file]

# Tokenize text into subword pieces (strings rather than ids).
tokenized_texts = [sp.encode(text, out_type=str) for text in texts]

# Step 5: Feature Engineering
def extract_features(audio_chunk):
    """Compute a mel spectrogram for one audio chunk.

    Args:
        audio_chunk: Object exposing ``get_array_of_samples()``, ``channels``,
            ``sample_width`` (bytes per sample) and ``frame_rate``, as a pydub
            ``AudioSegment`` does.

    Returns:
        The array returned by ``librosa.feature.melspectrogram`` for the
        chunk's mono, float32 waveform at the chunk's own frame rate.
    """
    # pydub yields integer PCM samples; librosa only accepts floating-point audio.
    samples = np.asarray(audio_chunk.get_array_of_samples(), dtype=np.float32)

    # Multi-channel samples are interleaved (L, R, L, R, ...): regroup them per
    # frame and average the channels to get a single mono signal.
    if audio_chunk.channels > 1:
        samples = samples.reshape(-1, audio_chunk.channels).mean(axis=1)

    # Scale by the full-scale value of the sample width so the signal lies in [-1, 1).
    full_scale = float(1 << (8 * audio_chunk.sample_width - 1))
    samples = samples / full_scale

    spectrogram = librosa.feature.melspectrogram(y=samples, sr=audio_chunk.frame_rate)
    return spectrogram

# One mel spectrogram per audio chunk, in chunk order.
spectrograms = [extract_features(audio_chunk) for audio_chunk in audio_chunks]

# Step 6: Create Dataset
# zip() stops at the shortest input, so a mismatch between the number of audio
# chunks and the number of transcription lines would silently drop samples.
# Each line of text must correspond to exactly one chunk, so fail loudly instead.
if not (len(spectrograms) == len(texts) == len(tokenized_texts)):
    raise ValueError(
        f"Cannot pair audio with text: {len(spectrograms)} spectrograms, "
        f"{len(texts)} text lines, {len(tokenized_texts)} tokenized lines"
    )

# Each sample is a (spectrogram, raw text, subword tokens) triple.
dataset = list(zip(spectrograms, texts, tokenized_texts))

# Step 7: Model Selection and Training
# Choose a TTS model architecture (e.g., Tacotron 2)
# Set up the training environment and train the model on the prepared dataset
# The exact steps will depend on the TTS framework and model you choose

# Step 8: Evaluation
# Evaluate the performance of your TTS model
# The evaluation steps will also depend on the TTS framework and model you choose

