In [11]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import os
import json

# --------------------------
# 1. Configuration
# --------------------------
# Name of the model whose inference results are converted. It selects both
# the input workbook and the name of the CSV written at the end.
model = 'whisper-turbo'

# Excel workbook holding that model's inferences (update to your own file).
EXCEL_FILE = f"../Data/inferences/{model}.xlsx"

# Folder that receives the converted dataset (CSV/JSON). It is created up
# front so the later write cannot fail on a missing directory.
OUTPUT_DATASET_PATH = os.path.join(
    "..",
    "Data",
    "inferences",
)
os.makedirs(OUTPUT_DATASET_PATH, exist_ok=True)

# Translation table from the type folder name found in a file path to the
# short code stored in the converted dataset. Codes are three letters long,
# except "id", which is kept as-is because it is already short.
type_mapping = {
    "bank-cards": "bcd",
    "car-number": "car",
    "date": "dat",
    "id": "id",
    "money": "mon",
    "phone": "phn",
    "time": "tim",
}

# --------------------------
# 2. Helper Functions
# --------------------------
def extract_type(file_path, anchor="train"):
    """
    Return the name of the folder that directly follows `anchor` in a path.

    Given a file path like "../Data/train/bank-cards/voices/filename.ogg"
    and the default anchor "train", the result is "bank-cards", i.e. the
    folder that indicates the data type.

    Args:
        file_path: Path to inspect. It is normalized with os.path.normpath
            and split on the platform separator (os.sep) before the lookup.
        anchor: Name of the path component that precedes the type folder.
            Defaults to "train"; pass e.g. "audio" for datasets laid out as
            "../Data/audio/<type>/...". If the component occurs more than
            once, the first occurrence is used.

    Returns:
        The component right after `anchor`, or None when `anchor` is not
        part of the path or is its last component.
    """
    parts = os.path.normpath(file_path).split(os.sep)
    try:
        # list.index raises ValueError when the anchor is absent; the +1
        # lookup raises IndexError when the anchor is the final component.
        return parts[parts.index(anchor) + 1]
    except (ValueError, IndexError):
        return None

def extract_audio_name(file_path):
    """
    Return the final component of `file_path` with its last extension removed.

    Only the last suffix is stripped, so "clip.tar.gz" yields "clip.tar".
    """
    return os.path.splitext(os.path.basename(file_path))[0]

# --------------------------
# 3. Read and Convert Excel Data
# --------------------------
# Read the Excel file into a pandas DataFrame.
# The sheet must contain at least the columns "file_path" and "transcription".
df = pd.read_excel(EXCEL_FILE)

# Fail early, naming every missing column, instead of hitting a bare KeyError
# part-way through the conversion.
missing_columns = [
    column for column in ("file_path", "transcription") if column not in df.columns
]
if missing_columns:
    raise KeyError(
        f"{EXCEL_FILE} is missing required column(s): {missing_columns}"
    )

# Extract the old type from the file path and map it to the new short code.
# Paths without a recognisable type folder, and types absent from
# type_mapping, end up as missing values in 'data_type'.
df['old_type'] = df['file_path'].apply(extract_type)
df['data_type'] = df['old_type'].map(type_mapping)

# Surface rows whose type could not be resolved; they are still written out,
# but with an empty data_type.
unmapped = df['data_type'].isna()
if unmapped.any():
    print(
        f"Warning: {int(unmapped.sum())} of {len(df)} row(s) have no "
        "recognised data type; their data_type is left empty."
    )

# Extract the audio name (without extension) from the file path.
df['audio'] = df['file_path'].apply(extract_audio_name)

# Ensure the audio file path is normalized.
df['audio_filepath'] = df['file_path'].apply(os.path.normpath)

# Keep only the output columns, in their final order.
cols_order = ['audio', 'audio_filepath', 'data_type', 'transcription']
df = df[cols_order]

# --------------------------
# 4. Save Converted Dataset
# --------------------------
# Write the converted table as a UTF-8 CSV named after the model, inside the
# output folder. The DataFrame index is not written.
csv_output = os.path.join(OUTPUT_DATASET_PATH, f"{model}.csv")
df.to_csv(csv_output, encoding="utf-8", index=False)

# Optional records-oriented JSON export (currently disabled).
#json_output = os.path.join(OUTPUT_DATASET_PATH, "converted_dataset.json")
#df.to_json(json_output, orient="records", force_ascii=False, indent=2)

# Report where the result was written.
print("Conversion complete.")
print("CSV saved to:", csv_output)


Conversion complete.
CSV saved to: ../Data/inferences/whisper-turbo.csv
