In [3]:
import sentencepiece as spm
import librosa
import numpy as np
from pydub import AudioSegment
import speech_recognition as sr
import os
import requests
import json
import csv
from pydub import AudioSegment
import pandas as pd
import subprocess
from sklearn.model_selection import train_test_split
import yaml

In [None]:
## take audio and transcribe audio to text file
recognizer = sr.Recognizer()


audio_folder = "D:/coding/ai_assistant/training_data"
audio_files = [f for f in os.listdir(audio_folder) if f.endswith(".mp3")]
transcription_folder = "D:/coding/ai_assistant/transcriptions" 

for audio_file in audio_files:
    audio_path = os.path.join(audio_folder, audio_file)
    audio_path = audio_path.replace("\\", "/")

    #convert mp3 to wav for sr libary
    audio = AudioSegment.from_mp3(audio_path)
    wav_path = audio_path.replace(".mp3", ".wav")
    audio.export(wav_path, format="wav")
    
    # transcribe audio to text
    with sr.AudioFile(wav_path) as source:
        audio = recognizer.record(source)  # record the  audio file
        transcription = recognizer.recognize_sphinx(audio)  
    
    # save the transcription to a text file
    transcription_file_name = f"transcription_{audio_file}.txt"
    transcription_file_path = os.path.join(transcription_folder, transcription_file_name)

    with open(transcription_file_path, "w") as file:
        file.write(transcription)

    print(f"Transcription {audio_file} saved to {transcription_file_path}")

In [6]:
url = 'http://localhost:32768/transcriptions?async=false'
audio_folder = "D:/coding/ai_assistant/audio_data"
transcript_folder = "D:/coding/ai_assistant/transcriptions"
output_folder = "D:/coding/ai_assistant/audio_alignment"

audio_files = [f for f in os.listdir(audio_folder) if f.endswith(".wav")]

for audio_file in audio_files:
    # get the transcript filename from the audio filename
    transcript_file = "transcription_" + audio_file.replace(".wav", ".mp3") + ".txt"
    
    # full paths to the audio and transcript files
    audio_path = os.path.join(audio_folder, audio_file)
    transcript_path = os.path.join(transcript_folder, transcript_file)
 
    if os.path.exists(transcript_path):
        # request to Gentle api
        with open(audio_path, 'rb') as audio, open(transcript_path, 'r') as transcript:
            response = requests.post(url, files={'audio': audio, 'transcript': transcript})
     
        if response.status_code == 200:
            alignment_data = response.json()
            
            # output file path
            output_file = f"alignment_{audio_file.replace('.wav', '.json')}"
            output_path = os.path.join(output_folder, output_file)
        
    
            with open(output_path, 'w') as outfile:
                json.dump(alignment_data, outfile, indent=4)
            
            print(f"Alignment data saved to {output_path}")
        else:
            print(f'Failed to process alignment: {response.status_code}') 

Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_00_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_01_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_02_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_03_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_04_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_05_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_06_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\alignment_twentyyearsonhorseback_07_weekley_64kb.json
Alignment data saved to D:/coding/ai_assistant/audio_alignment\a

In [14]:
# create dataset
txt_folder = "D:/coding/ai_assistant/dataset/transcriptions"
wavs_folder = "D:/coding/ai_assistant/dataset/wav"
output_csv_file = "D:/coding/ai_assistant/dataset/metadata.csv"  # Fixed the path

def get_transcription(txt_file_path):
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        return f.read().strip()

# Create metadata.csv file
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['wav_file', 'transcription']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='|')

    # Loop through each wav file in the wavs folder
    for wav_file_name in os.listdir(wavs_folder):
        wav_file_path = os.path.join(wavs_folder, wav_file_name)
        txt_file_name = "transcription_" + wav_file_name.replace(".wav", ".txt")  # Fixed the extension to .txt
        txt_file_path = os.path.join(txt_folder, txt_file_name)

        # Check if the corresponding txt file exists
        if os.path.exists(txt_file_path):
            transcription = get_transcription(txt_file_path)
            writer.writerow({'wav_file': wav_file_path, 'transcription': transcription})  # Use wav_file_path to include the full path
        else:
            print(f"No matching txt file found for {wav_file_name}")

print(f'csv file has been created at {output_csv_file}')

Metadata.csv file has been created at D:/coding/ai_assistant/dataset/metadata.csv


In [8]:
#### convert and split data 
csv_path = "D:/coding/ai_assistant/dataset/metadata.csv"
data = pd.read_csv(csv_path, delimiter='|', header=None, names=['wav_file', 'transcription'])


train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

meta_txt = "D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/preprocessed_data/MyDataset/metadata.txt"
train_txt = "D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/preprocessed_data/MyDataset/train.txt"
val_txt = "D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/preprocessed_data/MyDataset/val.txt"


train_data.to_csv(train_txt, sep='|', index=False, header=False)
val_data.to_csv(val_txt, sep='|', index=False, header=False)


In [None]:
######## update preprocess yaml file
def train_yaml(file_path, new_values):

    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)

    for key, value in new_values.items():
        if key in config:
            config[key].update(value)
        else:
            config[key] = value

    with open(file_path, 'w') as file:
        yaml.safe_dump(config, file)

# new values
new_values_train = {
    'path': {
        'ckpt_path': "D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/output/ckpt/MyDataset",
        'log_path': "D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/output/log/MyDataset",
        'result_path': "D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/output/result/MyDataset"
    }
}

# Update the train.yaml file
train_yaml("D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/config/LJSpeech/train.yaml", new_values_train)

In [4]:
######## update preprocess yaml file
def pre_yaml(file_path, new_values):

    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)

    for key, value in new_values.items():
        if key in config:
            config[key].update(value)
        else:
            config[key] = value

    with open(file_path, 'w') as file:
        yaml.safe_dump(config, file)

# new values 
new_values_preprocess = {
    'path': {
        'corpus_path': "D:/coding/ai_assistant/dataset",
        'raw_path': "./raw_data/MyDataset",
        'preprocessed_path': "./preprocessed_data/MyDataset"
    }
}

pre_yaml("D:/coding/ai_assistant/TensorFlowTTS/FastSpeech2/config/LJSpeech/preprocess.yaml", new_values_preprocess)

In [7]:
import subprocess


command = [
    'python',
    'preprocess.py',
    'config/LJSpeech/preprocess.yaml'
]

# Execute the command
subprocess.run(command, check=True)


CalledProcessError: Command '['python', 'preprocess.py', 'config/LJSpeech/preprocess.yaml']' returned non-zero exit status 2.