In [None]:
"""
In-place conversion of .wav files to .opus format, updating the corresponding JSON files.

This script performs the following tasks:
1. Recursively searches for .wav files in the specified input directory and its subdirectories.
2. Converts each .wav file to .opus format using the ffmpeg command-line tool.
3. Updates the corresponding JSON file by replacing the .wav extension with .opus in the 'audio_path' field.
4. Removes the original .wav file after successful conversion.
5. Utilizes multithreading to process the files concurrently for improved performance.

The conversion process uses the following ffmpeg settings:
- Audio codec: libopus
- Bitrate: 32K
- Application: voip
- Frame duration: 40 ms
- Compression level: Randomly selected between 1 and 9 for each file

Functions:
- compression_level(): Returns a random integer between 1 and 9 for the compression level.
- convert_to_opus(input_path): Converts a .wav file to .opus format using ffmpeg.
- update_json(json_path): Updates the 'audio_path' field in the corresponding JSON file.
- process_file(wav_path): Processes a single .wav file by converting it to .opus and updating the JSON file.
- process_files(input_dir): Processes all .wav files in the specified input directory and its subdirectories.

Usage:
1. Set the INPUT_DIR variable to the desired input directory path.
2. Run the script.

Note:
- The script requires the ffmpeg command-line tool to be installed and accessible from the system's PATH.
- The script assumes that each .wav file has a corresponding JSON file with the same name (except for the extension).
- The script modifies the JSON files in-place and removes the original .wav files after successful conversion.
- The script utilizes multithreading with a maximum of 6 worker threads to process the files concurrently.
- The estimated time left is calculated based on the average processing time per file and the number of files left.

Example:
INPUT_DIR = "R:/dataset/"
process_files(INPUT_DIR)
"""

import os
import subprocess
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import random

def compression_level():
    """Pick the encoder compression level for a single conversion.

    Returns:
        int: A pseudo-random integer from 1 through 9, both ends included.
    """
    # randrange excludes its upper bound, so 10 keeps 9 reachable.
    return random.randrange(1, 10)

def convert_to_opus(input_path):
    """Encode one audio file to Opus beside the source, then delete the source.

    The output path is the input path with its final extension swapped for
    ``.opus``; only the trailing extension is touched, so a ``.wav`` substring
    in a directory name is left alone. If the output already exists, nothing
    is converted or deleted.

    Args:
        input_path: Path of the audio file to convert (normally a ``.wav``).

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be started (e.g. it is not on PATH), or a
            file could not be removed.
    """
    stem, _extension = os.path.splitext(input_path)
    output_path = stem + '.opus'

    if os.path.exists(output_path):  # Never re-encode over an existing result
        print(f"{output_path} already exists. Skipping conversion.")
        return

    print(f"Converting {input_path} to {output_path}")
    # An argument list (no shell) keeps paths containing quotes, spaces or
    # shell metacharacters from breaking or altering the command.
    cmd = [
        'ffmpeg', '-y',
        '-i', input_path,
        '-c:a', 'libopus',
        '-b:a', '32K',
        '-application', 'voip',
        '-frame_duration', '40',
        '-compression_level', str(compression_level()),
        output_path,
    ]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # A partial output file would make the next run skip this input, so
        # discard it before reporting the failure.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    # Reached only when ffmpeg reported success, so the source is safe to drop.
    os.remove(input_path)

def update_json(json_path):
    """Point the ``audio_path`` field of a JSON file at the ``.opus`` file.

    Only a trailing ``.wav`` extension is rewritten; any other ``.wav``
    substring in the value (for example in a directory name) is preserved.
    The file is rewritten in place as UTF-8 with 4-space indentation and
    ``\\n`` line endings. A missing JSON file is silently ignored.

    Args:
        json_path: Path of the JSON file to update.

    Returns:
        None.

    Raises:
        KeyError: The JSON object has no ``audio_path`` key.
        json.JSONDecodeError: The file does not contain valid JSON.
    """
    if not os.path.exists(json_path):
        return

    with open(json_path, 'r', encoding='utf-8') as json_file:  # Read explicitly as UTF-8
        data = json.load(json_file)

    audio_path = data['audio_path']
    if audio_path.endswith('.wav'):
        # Swap the extension only, leaving the rest of the path untouched.
        data['audio_path'] = audio_path[:-len('.wav')] + '.opus'

    with open(json_path, 'w', encoding='utf-8', newline='\n') as json_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, json_file, ensure_ascii=False, indent=4)

def process_file(wav_path):
    """Convert one .wav file to .opus and update its sibling JSON file.

    The JSON file is expected next to the audio file, with the same name and
    a ``.json`` extension. Only the trailing extension of ``wav_path`` is
    swapped, so a ``.wav`` substring elsewhere in the path does not produce a
    wrong JSON path.

    Args:
        wav_path: Path of the .wav file to process.

    Returns:
        None.
    """
    convert_to_opus(wav_path)

    json_path = os.path.splitext(wav_path)[0] + '.json'
    update_json(json_path)

def process_files(input_dir, max_workers=6):
    """Convert every .wav file under ``input_dir`` using a thread pool.

    Walks ``input_dir`` recursively, submits each file whose name ends in
    ``.wav`` to ``process_file``, and prints progress with an estimated time
    remaining after each successful file. A failure in one file is printed and
    does not stop the others; the number of failures is reported at the end.

    Args:
        input_dir: Root directory to search for .wav files.
        max_workers: Number of worker threads to run concurrently.

    Returns:
        None.
    """
    wav_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(input_dir)
        for name in names
        if name.endswith('.wav')
    ]
    total_files = len(wav_files)
    print(f"Total .wav files to convert: {total_files}")

    start_time = time.time()
    completed_files = 0
    failed_files = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_wav = {executor.submit(process_file, wav_path): wav_path for wav_path in wav_files}

        for future in as_completed(future_to_wav):
            try:
                future.result()
            except Exception as exc:
                # Per-file boundary: report the failure and keep the batch going.
                failed_files += 1
                print(f"{future_to_wav[future]} generated an exception: {exc}")
                continue

            completed_files += 1
            # Failed files are finished too, so count them when estimating
            # how much work remains.
            finished_files = completed_files + failed_files
            files_left = total_files - finished_files
            avg_time_per_file = (time.time() - start_time) / finished_files
            est_time_left = avg_time_per_file * files_left / 60  # minutes

            print(f"Completed {completed_files}/{total_files}. Estimated time left: {est_time_left:.1f} minutes.\n")

    if failed_files:
        print(f"{failed_files} file(s) failed to convert.")
    print("Inplace conversion process completed.")

# Root directory that is searched recursively for .wav files. Change this to
# the dataset location before running (for example "R:/dataset/").
INPUT_DIR = "X:/vits"

# The conversion deletes the source .wav files, so it must only start when the
# file is executed directly, never as a side effect of importing it.
if __name__ == "__main__":
    process_files(INPUT_DIR)
