<a href="https://colab.research.google.com/github/wolfram-laube/mlpc-project_team-park/blob/wl/pre-trained-v2/fastlane.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# All-in-one Pre-trained Word Tokenizer v2

In [1]:
# Install necessary libraries if not already installed
!pip install transformers librosa torch datasets noisereduce evaluate jiwer pandas accelerate python-Levenshtein



Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting noisereduce
  Downloading noisereduce-3.0.2-py3-none-any.whl (22 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 

In [2]:
data_dir = '/content/dataset'
#data_dir = '../dataset'

## Preproccess

### Load fresh data

In [3]:
import os
import sys
import shutil

# Check if the environment is Google Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # If in Google Colab
    from google.colab import drive
    import gdown

    # Option 1: Download the file by its public link and expand it to the Colab runtime
    import urllib.request
    import zipfile

    scnwavzip_file_id = '1oI1EsH1krrEPbH9MSZRzLHu-_4p6-njR' # https://drive.google.com/file/d/1oI1EsH1krrEPbH9MSZRzLHu-_4p6-njR/view?usp=sharing
    scnnpyzip_file_id = '1oKgurvIgT93RGkxvxq8AA423VKlEVT7O' # https://drive.google.com/file/d/1oKgurvIgT93RGkxvxq8AA423VKlEVT7O/view?usp=sharing
    wrdwavzip_file_id = '1o1yBqdtqH3tjOHN4GKISJHlY2Qyu_ouX' # https://drive.google.com/file/d/1o1yBqdtqH3tjOHN4GKISJHlY2Qyu_ouX/view?usp=sharing
    wrdnpyzip_file_id = '1o2fj6QAM00zg8YMxsHwcNa2lkIXLXDYs' # https://drive.google.com/file/d/1o2fj6QAM00zg8YMxsHwcNa2lkIXLXDYs/view?usp=sharing
    annotation_file_id = '1xLxget7c5nCkwYt9Ru2RpYi5rMkk_pl0'  # https://drive.google.com/file/d/1xLxget7c5nCkwYt9Ru2RpYi5rMkk_pl0/view?usp=sharing
    scenes_file_id = '1xLgB7-cCz6nReyQbFJJcJGOUKCCbNhCG'  # https://drive.google.com/file/d/1xLgB7-cCz6nReyQbFJJcJGOUKCCbNhCG/view?usp=sharing

    scnwavzip_url = f'https://drive.google.com/uc?id={scnwavzip_file_id}'
    scnnpyzip_url = f'https://drive.google.com/uc?id={scnnpyzip_file_id}'
    wrdwavzip_url = f'https://drive.google.com/uc?id={wrdwavzip_file_id}'
    wrdnpyzip_url = f'https://drive.google.com/uc?id={wrdnpyzip_file_id}'
    annotation_url = f'https://drive.google.com/uc?id={annotation_file_id}'
    scenes_url = f'https://drive.google.com/uc?id={scenes_file_id}'

    scnwavzip_path = '/content/scenes_data.zip'
    scnnpyzip_path = '/content/scenes_feat.zip'
    wrdwavzip_path = '/content/words_data.zip'
    wrdnpyzip_path = '/content/words_feat.zip'
    data_dir = '/content/dataset'
    scenes_dir = f'{data_dir}/scenes'
    words_dir = f'{data_dir}/words'
    scenes_wav_dir = f'{scenes_dir}/wav'
    scenes_npy_dir = f'{scenes_dir}/npy'
    words_wav_dir = f'{data_dir}/words'
    words_npy_dir = f'{data_dir}/words'

    # Download the WAVZIP file
    #urllib.request.urlretrieve(wavzip_url, wavzip_path)
    gdown.download(scnwavzip_url, scnwavzip_path, quiet=False)

    # Unzip the file
    with zipfile.ZipFile(scnwavzip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

    print(f"Scenes training data extracted to {data_dir}")

     # Create the 'scenes/wav' folder structure
    os.makedirs(scenes_wav_dir, exist_ok=True)

    # Copy .wav files to 'scenes/wav'
    extracted_scenes_dir = os.path.join(data_dir, 'mlpc24_speech_commands', 'scenes')
    for root, dirs, files in os.walk(extracted_scenes_dir):
        for file in files:
            if file.endswith('.wav'):
                src_path = os.path.join(root, file)
                dst_path = os.path.join(scenes_wav_dir, file)
                shutil.copy(src_path, dst_path)

    print(f"Scenes training .wav files moved to {scenes_wav_dir}")

    # Download the SCNNPYZIP file
    gdown.download(scnnpyzip_url, scnnpyzip_path, quiet=False)

    # Unzip the file
    with zipfile.ZipFile(scnnpyzip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

    print(f"Scenes training features extracted to {data_dir}")

     # Create the 'scenes/npy' folder structure
    os.makedirs(scenes_npy_dir, exist_ok=True)

    # Copy .npy files to 'scenes/npy'
    extracted_scenes_dir = os.path.join(data_dir, 'development_scenes')
    for root, dirs, files in os.walk(extracted_scenes_dir):
        for file in files:
            if file.endswith('.npy'):
                src_path = os.path.join(root, file)
                dst_path = os.path.join(scenes_npy_dir, file)
                shutil.copy(src_path, dst_path)

    print(f"Scenes training .npy files moved to {scenes_npy_dir}")

    # Download the WRDWAVZIP file
    #urllib.request.urlretrieve(wavzip_url, wavzip_path)
    gdown.download(wrdwavzip_url, wrdwavzip_path, quiet=False)

    # Unzip the file
    with zipfile.ZipFile(wrdwavzip_path, 'r') as zip_ref:
        zip_ref.extractall(words_wav_dir)

    print(f"Words training data extracted to {words_wav_dir}")

    # Download the WRDNPYZIP file
    gdown.download(wrdnpyzip_url, wrdnpyzip_path, quiet=False)

    # Unzip the file
    with zipfile.ZipFile(wrdnpyzip_path, 'r') as zip_ref:
        zip_ref.extractall(words_npy_dir)

    print(f"Words training ,npy files s extracted to {words_npy_dir}")


    # Download the CSV files into the data_dir
    annotation_orig_path = os.path.join(data_dir, 'development_scene_annotations.csv.orig') # Keep a backup copy because it needs fixing
    annotation_path = os.path.join(data_dir, 'development_scene_annotations.csv')
    scenes_path = os.path.join(data_dir, 'development_scenes.csv')

    gdown.download(annotation_url, annotation_orig_path, quiet=False)
    gdown.download(annotation_url, annotation_path, quiet=False)
    gdown.download(scenes_url, scenes_path, quiet=False)

    print(f"CSV files downloaded to {scenes_dir}")

    # Option 2: Mount Google Drive and use the training data
    # Note this really takes some time for preprocessing file by file
    #drive.mount('/content/drive')
    #data_dir = '/content/drive/My Drive/Dropbox/public/mlpc/dataset'

    # Use this option to read from Google Drive instead
    #print(f"Using training data from {data_dir}")
else:
    # If on local machine
    data_dir = '../dataset'
    print(f"Using local training data from {data_dir}")

# Use the data_dir variable as the path to your training data

Downloading...
From (original): https://drive.google.com/uc?id=1oI1EsH1krrEPbH9MSZRzLHu-_4p6-njR
From (redirected): https://drive.google.com/uc?id=1oI1EsH1krrEPbH9MSZRzLHu-_4p6-njR&confirm=t&uuid=e7104295-c34a-4c78-a81d-4cf840e37bb6
To: /content/scenes_data.zip
100%|██████████| 305M/305M [00:11<00:00, 27.7MB/s]


Scenes training data extracted to /content/dataset
Scenes training .wav files moved to /content/dataset/scenes/wav


Downloading...
From (original): https://drive.google.com/uc?id=1oKgurvIgT93RGkxvxq8AA423VKlEVT7O
From (redirected): https://drive.google.com/uc?id=1oKgurvIgT93RGkxvxq8AA423VKlEVT7O&confirm=t&uuid=fba589a4-7a5d-482c-9560-d0b057b7a842
To: /content/scenes_feat.zip
100%|██████████| 422M/422M [00:11<00:00, 37.1MB/s]


Scenes training features extracted to /content/dataset
Scenes training .npy files moved to /content/dataset/scenes/npy


Downloading...
From (original): https://drive.google.com/uc?id=1o1yBqdtqH3tjOHN4GKISJHlY2Qyu_ouX
From (redirected): https://drive.google.com/uc?id=1o1yBqdtqH3tjOHN4GKISJHlY2Qyu_ouX&confirm=t&uuid=ec849287-f0af-4f29-8ada-2a3b1bcff489
To: /content/words_data.zip
100%|██████████| 1.17G/1.17G [00:34<00:00, 34.2MB/s]


Words training data extracted to /content/dataset/words


Downloading...
From (original): https://drive.google.com/uc?id=1o2fj6QAM00zg8YMxsHwcNa2lkIXLXDYs
From (redirected): https://drive.google.com/uc?id=1o2fj6QAM00zg8YMxsHwcNa2lkIXLXDYs&confirm=t&uuid=91262491-0164-4f83-82b2-a5f6ee9b7261
To: /content/words_feat.zip
100%|██████████| 1.51G/1.51G [00:51<00:00, 29.3MB/s]


Words training ,npy files s extracted to /content/dataset/words


Downloading...
From: https://drive.google.com/uc?id=1xLxget7c5nCkwYt9Ru2RpYi5rMkk_pl0
To: /content/dataset/development_scene_annotations.csv.orig
100%|██████████| 70.4k/70.4k [00:00<00:00, 31.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xLxget7c5nCkwYt9Ru2RpYi5rMkk_pl0
To: /content/dataset/development_scene_annotations.csv
100%|██████████| 70.4k/70.4k [00:00<00:00, 54.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xLgB7-cCz6nReyQbFJJcJGOUKCCbNhCG
To: /content/dataset/development_scenes.csv
100%|██████████| 29.5k/29.5k [00:00<00:00, 24.3MB/s]

CSV files downloaded to /content/dataset/scenes





### Determine CPU/GPU

In [4]:
# Function to check if GPU is available
#def is_gpu_available():
#    try:
#        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#        return result.returncode == 0
#    except FileNotFoundError:
#        return False

def is_gpu_available():
    try:
        import torch
        is_gpu = torch.cuda.is_available()
        print(f'GPU available: {is_gpu}')
        return is_gpu
    except ImportError as ie:
        print("No GPU support", ie)
        pass

    try:
        import tensorflow as tf
        is_gpu =  tf.config.list_physical_devices('GPU') != []
        print(f'GPU available: {is_gpu}')
        return is_gpu
    except ImportError as ie:
        print("No GPU support", ie)
        pass

    print("No GPU support found")
    return False

is_gpu_available()

GPU available: True


True

### Fix erreneous metadata

#### Before

In [5]:
import pandas as pd

# Load the CSV files
scene_annotations_df = pd.read_csv(f'{data_dir}/development_scene_annotations.csv')
scenes_df = pd.read_csv(f'{data_dir}/development_scenes.csv')

# Check the head of the dataframes to understand their structure
print(scene_annotations_df.head())
print(scenes_df.head())

# Check the distribution of labels in the annotations CSV
label_distribution_annotations = scene_annotations_df['command'].value_counts()
print("Label Distribution in development_scene_annotations.csv:")
print(label_distribution_annotations)

# Check the distribution of speaker IDs in the scenes CSV
label_distribution_scenes = scenes_df['speaker_id'].value_counts()
print("Label Distribution in development_scenes.csv:")
print(label_distribution_scenes)


                        filename         command     start       end
0         2_speech_true_Ofen_aus        Ofen aus  11.25230  12.07747
1         3_speech_true_Radio_an  Staubsauger an  21.48040  23.18083
2         4_speech_true_Alarm_an        Alarm an  14.45720  16.08301
3        9_speech_true_Radio_aus  Staubsauger an   3.67909   5.63126
4  11_speech_false_Fernseher_aus  Staubsauger an  10.57850  11.67886
                        filename  speaker_id
0         2_speech_true_Ofen_aus         132
1         3_speech_true_Radio_an         132
2         4_speech_true_Alarm_an         132
3        9_speech_true_Radio_aus         132
4  11_speech_false_Fernseher_aus         132
Label Distribution in development_scene_annotations.csv:
command
Staubsauger an     288
Licht aus           77
Licht an            64
Fernseher an        56
Alarm an            56
Heizung an          55
Heizung aus         54
Radio aus           53
Radio an            52
Ofen aus            49
Alarm aus           4

#### Fix

In [6]:
import os
import re
import shutil
import pandas as pd

# Paths to the original and working copy files
original_file_path = f'{data_dir}/development_scene_annotations.csv.orig'
working_copy_path = f'{data_dir}/development_scene_annotations.csv.0'
corrected_file_path = f'{data_dir}/development_scene_annotations.csv'

# Step 1: Create a working copy of the original file
shutil.copy(original_file_path, working_copy_path)

# Step 2: Load the working copy into a DataFrame
df = pd.read_csv(working_copy_path)

# Define the pattern to parse the filename
filename_pattern = re.compile(r'(\d+)_speech_(true|false)_((?:[a-zA-ZäöüÄÖÜß]+_(?:an|aus)_?)+)', re.UNICODE)

# Function to parse filename and extract commands
def parse_filename(filename):
    match = filename_pattern.match(filename)
    if not match:
        return []

    commands_str = match.group(3)
    commands = commands_str.split('_')

    command_list = []
    for i in range(0, len(commands), 2):
        command_list.append(f"{commands[i]} {commands[i+1]}")

    return command_list

# Parse the commands from filenames and add to the DataFrame
df['parsed_commands'] = df['filename'].apply(parse_filename)

# Step 3: Group by filename and sort by start time
grouped = df.groupby('filename').apply(lambda x: x.sort_values(by='start')).reset_index(drop=True)

# Step 4: Assign the correct labels based on the order of commands in the filename
def assign_labels(group):
    commands = group['parsed_commands'].iloc[0]  # get the parsed commands from the first row
    group = group.reset_index(drop=True)
    for i in range(len(group)):
        if i < len(commands):
            group.at[i, 'command'] = commands[i]
        else:
            print(f"Warning: More segments than commands in {group['filename'].iloc[0]}")
    return group

# Apply the label assignment function
corrected_df = grouped.groupby('filename').apply(assign_labels).reset_index(drop=True)

# Drop the temporary column
corrected_df = corrected_df.drop(columns=['parsed_commands'])

# Step 5: Save the corrected DataFrame to a new CSV file
corrected_df.to_csv(corrected_file_path, index=False)

# Verify the saved corrections
print("Label corrections applied and saved successfully.")
print(corrected_df.head())


Label corrections applied and saved successfully.
                        filename       command     start       end
0    1003_speech_false_Licht_aus     Licht aus  12.20090  13.57599
1       1008_speech_true_Ofen_an       Ofen an   6.90112   8.52638
2      1010_speech_true_Radio_an      Radio an  13.03100  14.03146
3  1011_speech_true_Fernseher_an  Fernseher an  14.11030  15.36121
4   1012_speech_true_Heizung_aus   Heizung aus  11.20520  12.70590


#### After

In [7]:
import pandas as pd

# Load the CSV files
scene_annotations_df = pd.read_csv(f'{data_dir}/development_scene_annotations.csv')
scenes_df = pd.read_csv(f'{data_dir}/development_scenes.csv')

# Check the head of the dataframes to understand their structure
print(scene_annotations_df.head())
print(scenes_df.head())

# Check the distribution of labels in the annotations CSV
label_distribution_annotations = scene_annotations_df['command'].value_counts()
print("Label Distribution in development_scene_annotations.csv:")
print(label_distribution_annotations)

# Check the distribution of speaker IDs in the scenes CSV
label_distribution_scenes = scenes_df['speaker_id'].value_counts()
print("Label Distribution in development_scenes.csv:")
print(label_distribution_scenes)


                        filename       command     start       end
0    1003_speech_false_Licht_aus     Licht aus  12.20090  13.57599
1       1008_speech_true_Ofen_an       Ofen an   6.90112   8.52638
2      1010_speech_true_Radio_an      Radio an  13.03100  14.03146
3  1011_speech_true_Fernseher_an  Fernseher an  14.11030  15.36121
4   1012_speech_true_Heizung_aus   Heizung aus  11.20520  12.70590
                        filename  speaker_id
0         2_speech_true_Ofen_aus         132
1         3_speech_true_Radio_an         132
2         4_speech_true_Alarm_an         132
3        9_speech_true_Radio_aus         132
4  11_speech_false_Fernseher_aus         132
Label Distribution in development_scene_annotations.csv:
command
Licht aus          86
Licht an           78
Heizung an         76
Fernseher an       74
Radio aus          69
Heizung aus        67
Alarm an           66
Radio an           65
Lüftung aus        64
Ofen aus           64
Lüftung an         63
Ofen an            63

### Preprocess audio data

In [8]:
import os
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
import random
from IPython.display import Audio
from sklearn.decomposition import FastICA

# Function to apply ICA on audio segments
def apply_ica(segment, sr):
    ica = FastICA(n_components=1, whiten='arbitrary-variance')  # Explicitly set whiten parameter
    segment_reshaped = segment.reshape(-1, 1)
    segment_ica = ica.fit_transform(segment_reshaped).flatten()
    return segment_ica

# Function to preprocess segments and optionally save to the filesystem
def preprocess_and_save_segments(scenes_dir, annotations_path, save_dir=None, save_to_filesystem=False, apply_ica_flag=False):
    # Load the annotations
    annotations_df = pd.read_csv(annotations_path)

    # Ensure the save directory exists if saving to filesystem
    if save_to_filesystem and save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

    preprocessed_segments = []

    for index, row in annotations_df.iterrows():
        filename = row['filename']
        command = row['command']
        start = row['start']
        end = row['end']

        # Load the audio file
        file_path = os.path.join(scenes_dir, f"{filename}.wav")
        y, sr = librosa.load(file_path, sr=None)

        # Extract the segment
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        segment = y[start_sample:end_sample]

        # Normalize the segment
        segment = librosa.util.normalize(segment)

        # Apply ICA if the flag is set
        if apply_ica_flag:
            segment = apply_ica(segment, sr)

        # Add the segment to the list
        preprocessed_segments.append((filename, command, segment, sr))

        # Save the segment to the filesystem if required
        if save_to_filesystem and save_dir is not None:
            save_path = os.path.join(save_dir, f"{filename}_{start}_{end}.wav")
            sf.write(save_path, segment, sr)

    return preprocessed_segments

# Function to play a random segment from preprocessed segments
def play_random_segment(preprocessed_segments):
    # Select a random segment
    random_segment = random.choice(preprocessed_segments)

    filename, command, audio_data, sample_rate = random_segment

    # Print the command and play the audio segment
    print(f"Filename: {filename}")
    print(f"Command: {command}")

    return Audio(audio_data, rate=sample_rate)

# Function to play a random segment from the filesystem
def play_random_segment_from_filesystem(save_dir, annotations_path):
    # List all the preprocessed segment files
    segment_files = [f for f in os.listdir(save_dir) if f.endswith('.wav')]

    # Select a random segment file
    random_segment_file = random.choice(segment_files)
    random_segment_path = os.path.join(save_dir, random_segment_file)

    # Extract start and end times from the file name
    filename_parts = random_segment_file.split('_')
    filename = '_'.join(filename_parts[:-2])
    start_time = float(filename_parts[-2])
    end_time = float(filename_parts[-1].replace('.wav', ''))

    # Find the command in the annotations
    annotations_df = pd.read_csv(annotations_path)
    command_row = annotations_df[
        (annotations_df['filename'] == filename) &
        (annotations_df['start'] == start_time) &
        (annotations_df['end'] == end_time)
    ]

    if command_row.empty:
        print(f"No matching annotation found for {random_segment_file}")
        return

    command = command_row.iloc[0]['command']

    # Load the audio segment
    y, sr = librosa.load(random_segment_path, sr=None)

    # Print the command and play the audio segment
    print(f"Filename: {filename}")
    print(f"Command: {command}")

    return Audio(y, rate=sr)

# Example usage
scenes_dir = f'{data_dir}/scenes/wav'
annotations_path = f'{data_dir}/development_scene_annotations.csv'
save_dir = f'{data_dir}/clipped_commands'

# Preprocess segments and save to filesystem with optional ICA
preprocessed_segments = preprocess_and_save_segments(scenes_dir, annotations_path, save_dir, save_to_filesystem=True, apply_ica_flag=True)

# Play a random segment from memory
audio_memory = play_random_segment(preprocessed_segments)
display(audio_memory)

# Play a random segment from filesystem
audio_filesystem = play_random_segment_from_filesystem(save_dir, annotations_path)
display(audio_filesystem)


Filename: 287_speech_false_Fernseher_an_Heizung_aus_Licht_an_Lüftung_aus
Command: Lüftung aus


Filename: 366_speech_true_Heizung_an_Licht_aus
Command: Licht aus


## Main

### Libraries

### File Parsing

In [28]:
# audio_parsing_utils.py

import re
import unicodedata
import logging
from collections import defaultdict

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the pattern to parse clipped command filenames
clipped_command_pattern = re.compile(
    r'(\d+)_speech_(true|false)_((?:[a-zA-ZäöüÄÖÜß]+_(?:an|aus)_?)+)_(\d+\.\d+)_(\d+\.\d+)\.wav', re.UNICODE
)

# Define the pattern to parse full scene filenames
full_scene_pattern = re.compile(
    r'(\d+)_speech_(true|false)_((?:[a-zA-ZäöüÄÖÜß]+_(?:an|aus)_?)+)\.wav', re.UNICODE
)

def normalize_unicode(text):
    return unicodedata.normalize('NFC', text)

# Function to parse clipped command filenames to extract commands, start time, and end time
def parse_clipped_command_filename(filename):
    logger.debug(f"Attempting to parse filename: {filename}")
    filename = normalize_unicode(filename)
    match = clipped_command_pattern.match(filename)
    if not match:
        logger.error(f"Filename {filename} does not match the expected pattern.")
        raise ValueError(f"Filename {filename} does not match the expected pattern.")

    # Extract command string and timestamps
    commands_str = match.group(3)
    start_time = float(match.group(4))
    end_time = float(match.group(5))

    # Split and format commands
    commands = commands_str.split('_')
    command_list = []
    for i in range(0, len(commands), 2):
        command_list.append(f"{commands[i]} {commands[i+1]}")

    logger.debug(f"Parsed filename {filename}: file_id={match.group(1)}, speech_flag={match.group(2)}, command_list={command_list}, start_time={start_time}, end_time={end_time}")
    return match.group(1), match.group(2), command_list, start_time, end_time

# Function to parse full scene filenames to extract commands
def parse_full_scene_filename(filename):
    logger.debug(f"Attempting to parse filename: {filename}")
    filename = normalize_unicode(filename)
    match = full_scene_pattern.match(filename)
    if not match:
        logger.error(f"Filename {filename} does not match the expected pattern.")
        raise ValueError(f"Filename {filename} does not match the expected pattern.")

    # Extract command string
    file_id = match.group(1)
    speech_flag = match.group(2)
    commands_str = match.group(3)

    # Split and format commands
    commands = commands_str.split('_')
    command_list = []
    for i in range(0, len(commands), 2):
        command_list.append(f"{commands[i]} {commands[i+1]}")

    logger.debug(f"Parsed filename {filename}: file_id={file_id}, speech_flag={speech_flag}, command_list={command_list}")
    return file_id, speech_flag, command_list


### Training

In [9]:
import os
import re
import torch
import librosa
import logging
from torch.utils.data import DataLoader, Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from tqdm import tqdm
import numpy as np

# Configure logging to output to console
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
for handler in logger.handlers[:]:
    logger.removeHandler(handler)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Load pre-trained tokenizer and model
def load_model_and_tokenizer(model_name="facebook/wav2vec2-large-xlsr-53-german"):
    logger.info("Loading model and tokenizer...")
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
    logger.info("Model and tokenizer loaded successfully.")
    return processor, model

# Extract labels from filenames
def extract_labels_from_filename(filename):
    match = re.search(r'speech_true_(.*)\.wav', filename)
    if match:
        words = match.group(1).split('_')
        return ' '.join(words)
    return ''

# Dataset class with data augmentation
class AudioDataset(Dataset):
    def __init__(self, audio_files, processor, augment=False):
        self.audio_files = audio_files
        self.processor = processor
        self.augment = augment

    def __len__(self):
        return len(self.audio_files)

    def augment_audio(self, audio, sr):
        if np.random.rand() > 0.5:
            audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=np.random.uniform(-2, 2), bins_per_octave=24)
        if np.random.rand() > 0.5:
            audio = librosa.effects.time_stretch(audio, rate=np.random.uniform(0.8, 1.2))
        if np.random.rand() > 0.5:
            audio = audio + 0.005 * np.random.randn(len(audio))
        return audio

    def __getitem__(self, idx):
        file_path, audio, sr = self.audio_files[idx]
        if self.augment:
            audio = self.augment_audio(audio, sr)
        inputs = self.processor(audio, return_tensors="pt", padding="longest", sampling_rate=sr)
        label = extract_labels_from_filename(os.path.basename(file_path))
        label_ids = self.processor.tokenizer(label, return_tensors="pt").input_ids
        return inputs.input_values.squeeze(), label_ids.squeeze()

# Collate function to handle padding in DataLoader
def collate_fn(batch):
    input_values = [item[0] for item in batch]
    label_ids = [item[1] for item in batch]

    input_values = torch.nn.utils.rnn.pad_sequence(input_values, batch_first=True, padding_value=0)
    label_ids = torch.nn.utils.rnn.pad_sequence(label_ids, batch_first=True, padding_value=-100)

    return input_values, label_ids

# Load audio files
def load_audio_files(directory):
    audio_data = []
    logger.info(f"Loading audio files from {directory}...")
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                y, sr = librosa.load(file_path, sr=16000)  # Ensuring consistent sampling rate
                audio_data.append((file_path, y, sr))
    logger.info(f"Loaded {len(audio_data)} audio files from {directory}.")
    return audio_data

# Fine-tuning function
def fine_tune_model(model, processor, words_loader, num_epochs=10, lr=1e-5):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        epoch_loss = 0
        logger.info(f"Starting fine-tuning epoch {epoch + 1}/{num_epochs}...")
        with tqdm(total=len(words_loader), desc=f"Fine-tuning Epoch {epoch + 1}") as pbar:
            for input_values, label_ids in words_loader:
                input_values, label_ids = input_values.to(device), label_ids.to(device)

                optimizer.zero_grad()
                outputs = model(input_values)
                logits = outputs.logits

                # Compute lengths for CTC loss
                input_lengths = torch.full((logits.shape[0],), logits.shape[1], dtype=torch.long).to(device)
                label_lengths = torch.sum(label_ids != -100, dim=1).to(device)

                loss = torch.nn.CTCLoss()(logits.transpose(0, 1), label_ids, input_lengths, label_lengths)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                pbar.set_postfix({'loss': loss.item()})
                pbar.update(1)

        logger.info(f"Fine-tuning epoch {epoch + 1} completed. Loss: {epoch_loss / len(words_loader):.4f}")

# Training function with validation and model checkpointing
def train_model(model, processor, train_loader, val_loader, num_epochs=15, lr=1e-5, accumulation_steps=4):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        epoch_loss = 0
        logger.info(f"Starting epoch {epoch + 1}/{num_epochs}...")
        with tqdm(total=len(train_loader), desc=f"Training Epoch {epoch + 1}") as pbar:
            optimizer.zero_grad()
            for i, (input_values, label_ids) in enumerate(train_loader):
                input_values, label_ids = input_values.to(device), label_ids.to(device)

                outputs = model(input_values)
                logits = outputs.logits

                # Compute lengths for CTC loss
                input_lengths = torch.full((logits.shape[0],), logits.shape[1], dtype=torch.long).to(device)
                label_lengths = torch.sum(label_ids != -100, dim=1).to(device)

                # Debugging output
                logger.debug(f"input_lengths: {input_lengths}")
                logger.debug(f"label_lengths: {label_lengths}")
                logger.debug(f"logits: {logits.shape}")
                logger.debug(f"label_ids: {label_ids}")

                loss = torch.nn.CTCLoss()(logits.transpose(0, 1), label_ids, input_lengths, label_lengths)
                loss.backward()

                if (i + 1) % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

                epoch_loss += loss.item()
                pbar.set_postfix({'loss': loss.item()})
                pbar.update(1)

        val_loss = validate_model(model, val_loader)
        logger.info(f"Epoch {epoch + 1} completed. Training Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            logger.info("Saving the new best model...")
            model.save_pretrained("fine_tuned_wav2vec2")
            processor.save_pretrained("fine_tuned_wav2vec2")

# Validation function
def validate_model(model, val_loader):
    model.eval()
    val_loss = 0
    logger.info("Starting validation...")
    with tqdm(total=len(val_loader), desc="Validation") as pbar:
        with torch.no_grad():
            for input_values, label_ids in val_loader:
                input_values, label_ids = input_values.to(device), label_ids.to(device)

                outputs = model(input_values)
                logits = outputs.logits

                # Compute lengths for CTC loss
                input_lengths = torch.full((logits.shape[0],), logits.shape[1], dtype=torch.long).to(device)
                label_lengths = torch.sum(label_ids != -100, dim=1).to(device)

                # Debugging output
                logger.debug(f"Validation input_lengths: {input_lengths}")
                logger.debug(f"Validation label_lengths: {label_lengths}")
                logger.debug(f"Validation logits: {logits.shape}")
                logger.debug(f"Validation label_ids: {label_ids}")

                loss = torch.nn.CTCLoss()(logits.transpose(0, 1), label_ids, input_lengths, label_lengths)
                val_loss += loss.item()
                pbar.set_postfix({'loss': loss.item()})
                pbar.update(1)
    model.train()
    return val_loss / len(val_loader)

# Inference function with timestamps
def infer_with_timestamps(model, processor, audio_file):
    y, sr = librosa.load(audio_file, sr=16000)  # Ensuring consistent sampling rate
    inputs = processor(y, return_tensors="pt", padding="longest", sampling_rate=sr).to(device)

    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids.cpu())[0]

    # Get the frame timestamps
    frame_duration = model.config.inputs_to_logits_ratio / sr
    frame_timestamps = [i * frame_duration for i in range(logits.shape[1])]

    # Decode token ids to words with timestamps
    word_timestamps = []
    current_word = ""
    current_word_start = None

    for i, token_id in enumerate(predicted_ids[0].cpu()):
        token = processor.decode([token_id])
        if token.strip() != "":
            if current_word == "":
                current_word_start = frame_timestamps[i]
            current_word += token
        else:
            if current_word != "":
                word_timestamps.append((current_word, current_word_start, frame_timestamps[i]))
                current_word = ""
                current_word_start = None

    # Handle last word if any
    if current_word != "":
        word_timestamps.append((current_word, current_word_start, frame_timestamps[-1]))

    return transcription, word_timestamps

# Main execution
if __name__ == "__main__":
    #data_dir = '/content/dataset'
    scenes_path = f'{data_dir}/scenes/wav'
    words_path = f'{data_dir}/words'

    scenes_audio = load_audio_files(scenes_path)[:50]  # Increase subset size for more data
    words_audio = load_audio_files(words_path)[:50]  # Increase subset size for more data

    processor, model = load_model_and_tokenizer()

    words_dataset = AudioDataset(words_audio, processor, augment=True)
    words_loader = DataLoader(words_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)  # Larger batch size

    # Fine-tune on words dataset
    #fine_tune_model(model, processor, words_loader, num_epochs=3, lr=1e-5)

    val_split = int(len(scenes_audio) * 0.2)
    train_dataset = AudioDataset(scenes_audio[val_split:], processor, augment=True)
    val_dataset = AudioDataset(scenes_audio[:val_split], processor)

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)  # Larger batch size
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

    # Train on scenes dataset
    train_model(model, processor, train_loader, val_loader, num_epochs=15, lr=1e-5)

    unseen_audio_file = f'{scenes_path}/98_speech_true_Alarm_aus_Lüftung_aus_Heizung_aus.wav'
    transcription, word_timestamps = infer_with_timestamps(model, processor, unseen_audio_file)

    logger.info("Transcription: " + transcription)
    logger.info("Word Timestamps:")
    for word, start, end in word_timestamps:
        logger.info(f"Word: {word}, Start: {start:.2f}s, End: {end:.2f}s")


2024-06-17 03:39:38,324 - INFO - Using device: cuda
INFO:__main__:Using device: cuda
2024-06-17 03:39:38,329 - INFO - Loading audio files from /content/dataset/scenes/wav...
INFO:__main__:Loading audio files from /content/dataset/scenes/wav...
2024-06-17 03:39:39,900 - INFO - Loaded 814 audio files from /content/dataset/scenes/wav.
INFO:__main__:Loaded 814 audio files from /content/dataset/scenes/wav.
2024-06-17 03:39:39,953 - INFO - Loading audio files from /content/dataset/words...
INFO:__main__:Loading audio files from /content/dataset/words...
2024-06-17 03:39:54,827 - INFO - Loaded 45296 audio files from /content/dataset/words.
INFO:__main__:Loaded 45296 audio files from /content/dataset/words.
2024-06-17 03:39:54,850 - INFO - Loading model and tokenizer...
INFO:__main__:Loading model and tokenizer...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), s

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/378 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53-german were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53-german and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should prob

In [10]:
!pip install python-Levenshtein

import re
import Levenshtein

# Preprocess the transcription
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Calculate Levenshtein distance
def calculate_distance(transcription, command):
    return Levenshtein.distance(transcription, command)

# Determine matches at character level
def find_matches_char_level(transcription, commands, threshold=5):
    transcription = preprocess(transcription)
    matches = []

    for command in commands:
        command = preprocess(command)
        len_command = len(command)

        for i in range(len(transcription) - len_command + 1):
            segment = transcription[i:i+len_command]
            distance = calculate_distance(segment, command)
            if distance <= threshold:
                matches.append((segment, command, distance))

    return matches

# Evaluate results
def evaluate_matches(matches, commands):
    true_positives = len(matches)
    false_negatives = len(commands) - true_positives
    false_positives = len(matches) - true_positives

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

# Main execution
transcription = "j alatms de nergi ein kfstoff kasis as  ja i gäsige ih schttig nsisce es as de simös ne allaam auslüftun s j neig   asu tzung aus"
commands = ["alarm aus", "lüftung aus", "heizung aus"]

matches = find_matches_char_level(transcription, commands)
precision, recall, f1_score = evaluate_matches(matches, commands)

print("Matches:", matches)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


Matches: [('alatms de', 'alarm aus', 5), (' allaam a', 'alarm aus', 5), ('allaam au', 'alarm aus', 3), ('llaam aus', 'alarm aus', 2), ('laam ausl', 'alarm aus', 3), ('aam auslü', 'alarm aus', 4), ('tzung aus', 'alarm aus', 5), ('slüftun s j', 'lüftung aus', 5), ('lüftun s j ', 'lüftung aus', 5), ('su tzung au', 'lüftung aus', 5), ('u tzung aus', 'lüftung aus', 4), ('su tzung au', 'heizung aus', 5), ('u tzung aus', 'heizung aus', 3)]
Precision: 1.0
Recall: 4.333333333333333
F1 Score: 1.625


#### Single file prediction, wrong timestamps

In [11]:
import csv
import os
import librosa
import numpy as np
import soundfile as sf
from Levenshtein import distance as levenshtein_distance
import torch

# Assuming you have the following functions from previous code
# - infer_with_timestamps(model, processor, audio_file)
# - calculate_f1_score(predictions, ground_truths) and other related functions for matching

def get_best_matches(transcription, word_timestamps, target_commands, threshold=4):
    matches = []
    transcription_words = transcription.split()

    for command in target_commands:
        command_words = command.split()
        command_length = len(command_words)
        min_distance = float('inf')
        best_match = None
        best_start_time = None
        best_end_time = None

        print(f"Checking command: {command}")

        for i in range(len(transcription_words) - command_length + 1):
            segment = transcription_words[i:i + command_length]
            segment_str = " ".join(segment)
            dist = levenshtein_distance(segment_str, command)
            if dist < min_distance:
                min_distance = dist
                best_match = command
                best_start_time = word_timestamps[i][1]
                best_end_time = word_timestamps[i + command_length - 1][2]

            print(f"Segment: {segment_str}, Distance: {dist}, Start: {word_timestamps[i][1]:.2f}, End: {word_timestamps[i + command_length - 1][2]:.2f}")

        if min_distance < threshold:
            matches.append((best_match, best_start_time, best_end_time, min_distance))
    return matches

def save_predictions_to_csv(matches, output_file):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "command", "timestamp"])
        for match in matches:
            command, start_time, end_time, _ = match
            timestamp = (start_time + end_time) / 2
            writer.writerow(["98_speech_true_Alarm_aus_Lüftung_aus_Heizung_aus", command, timestamp])

def print_matches(matches):
    print(f"{'Command':<20} {'Start Time':<10} {'End Time':<10} {'Distance':<10}")
    print("-" * 50)
    for match in matches:
        command, start_time, end_time, distance = match
        print(f"{command:<20} {start_time:<10.2f} {end_time:<10.2f} {distance:<10}")

# List of possible commands
target_commands = [
    "alarm aus", "lüftung aus", "heizung aus", "licht an", "licht aus", "radio an", "radio aus",
    "tür auf", "tür zu", "fenster auf", "fenster zu", "rollladen hoch", "rollladen runter",
    "musik an", "musik aus", "tv an", "tv aus", "fernseher an", "fernseher aus", "computer an",
    "computer aus", "klima an", "klima aus", "kaffeemaschine an", "kaffeemaschine aus",
    "ventilator an", "ventilator aus", "staubsauger an", "staubsauger aus", "ofen an", "ofen aus"
]

# Inference function with timestamps
def infer_with_timestamps(model, processor, audio_file, sr):
    y, sr = librosa.load(audio_file, sr=sr)  # Ensuring consistent sampling rate
    inputs = processor(y, return_tensors="pt", padding="longest", sampling_rate=sr).to(device)

    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids.cpu())[0]

    # Get the frame timestamps
    frame_duration = model.config.inputs_to_logits_ratio / sr
    frame_timestamps = [i * frame_duration for i in range(logits.shape[1])]

    # Verify if the total length of frame_timestamps matches the length of the audio
    audio_duration = len(y) / sr
    predicted_duration = frame_timestamps[-1] if frame_timestamps else 0

    print(f"Audio Duration: {audio_duration:.2f} seconds")
    print(f"Predicted Duration: {predicted_duration:.2f} seconds")

    # Decode token ids to words with timestamps
    word_timestamps = []
    current_word = ""
    current_word_start = None

    for i, token_id in enumerate(predicted_ids[0].cpu()):
        token = processor.decode([token_id])
        if token.strip() != "":
            if current_word == "":
                current_word_start = frame_timestamps[i]
            current_word += token
        else:
            if current_word != "":
                word_timestamps.append((current_word, current_word_start, frame_timestamps[i]))
                current_word = ""
                current_word_start = None

    # Handle last word if any
    if current_word != "":
        word_timestamps.append((current_word, current_word_start, frame_timestamps[-1]))

    # Log word timestamps for debugging
    for word, start, end in word_timestamps:
        print(f"Word: {word}, Start: {start:.2f}s, End: {end:.2f}s")
    print (f"Transcription: {transcription}")
    return transcription, word_timestamps

# Example usage
if __name__ == "__main__":
    data_dir = '/content/dataset'
    scenes_path = f'{data_dir}/scenes/wav'
    unseen_audio_file = f'{scenes_path}/98_speech_true_Alarm_aus_Lüftung_aus_Heizung_aus.wav'

    # Check the actual sampling rate
    file_info = sf.info(unseen_audio_file)
    actual_sr = file_info.samplerate
    print(f"Sample Rate: {actual_sr} Hz")

    # Load the audio file to get its duration
    y, sr = librosa.load(unseen_audio_file, sr=actual_sr)
    audio_duration = librosa.get_duration(y=y, sr=sr)
    print(f"Audio Duration: {audio_duration:.2f} seconds")

    transcription, word_ttimestamps = infer_with_timestamps(model, processor, unseen_audio_file, sr=actual_sr)
    matches = get_best_matches(transcription, word_ttimestamps, target_commands, threshold=4)

    print_matches(matches)
    save_predictions_to_csv(matches, "predictions.csv")


Sample Rate: 16000 Hz
Audio Duration: 23.21 seconds
Audio Duration: 23.21 seconds
Predicted Duration: 23.18 seconds
Word: j, Start: 1.02s, End: 1.04s
Word: a, Start: 1.74s, End: 1.76s
Word: t, Start: 1.92s, End: 1.94s
Word: m, Start: 1.98s, End: 2.00s
Word: s, Start: 2.08s, End: 2.10s
Word: o, Start: 2.14s, End: 2.16s
Word: e, Start: 2.24s, End: 2.26s
Word: d, Start: 2.32s, End: 2.34s
Word: s, Start: 2.52s, End: 2.54s
Word: e, Start: 2.68s, End: 2.70s
Word: ne, Start: 2.78s, End: 2.82s
Word: r, Start: 2.84s, End: 2.86s
Word: d, Start: 2.90s, End: 2.92s
Word: i, Start: 3.04s, End: 3.06s
Word: eii, Start: 3.58s, End: 3.64s
Word: ee, Start: 3.70s, End: 3.74s
Word: kk, Start: 3.84s, End: 3.88s
Word: r, Start: 3.90s, End: 3.92s
Word: a, Start: 3.94s, End: 3.96s
Word: f, Start: 4.04s, End: 4.06s
Word: ss, Start: 4.08s, End: 4.12s
Word: t, Start: 4.18s, End: 4.20s
Word: o, Start: 4.22s, End: 4.24s
Word: ff, Start: 4.28s, End: 4.32s
Word: f, Start: 4.38s, End: 4.40s
Word: k, Start: 4.68s, End:

#### Single file prediction, corrected

In [22]:
import csv
import os
import librosa
import numpy as np
import soundfile as sf
from Levenshtein import distance as levenshtein_distance
import torch
import re

def preprocess_phonemes(phonemes):
    preprocessed = []
    for phoneme, start, end in phonemes:
        processed_phoneme = re.sub(r'(.)\1+', r'\1', phoneme)  # Remove double letters
        preprocessed.append((processed_phoneme, start, end))
    return preprocessed

def create_word_to_phoneme_mapping(transcription, words_timestamps):
    transcription_words = transcription.split()
    word_to_phoneme_mapping = []
    phoneme_index = 0

    for word in transcription_words:
        phonemes = []
        while phoneme_index < len(words_timestamps):
            phoneme, start, end = words_timestamps[phoneme_index]
            phonemes.append((phoneme, start, end))
            phoneme_index += 1
            if ''.join(p[0] for p in phonemes) == word:
                break
        word_to_phoneme_mapping.append((word, phonemes))
    return word_to_phoneme_mapping

def find_segment_timestamps(segment, word_start_end):
    segment_words = segment.split()
    segment_length = len(segment_words)
    print(f"Finding segment: {segment}")

    for i in range(len(word_start_end) - segment_length + 1):
        match = True
        for j in range(segment_length):
            if word_start_end[i + j][0] != segment_words[j]:
                match = False
                break
        if match:
            start_time = word_start_end[i][1][0][1]
            end_time_index = i + segment_length - 1
            if end_time_index >= len(word_start_end):
                print(f"Index out of range: end_time_index={end_time_index}, len(word_start_end)={len(word_start_end)}")
                continue
            end_time = word_start_end[end_time_index][1][-1][2]
            print(f"Segment '{segment}' found: Start={start_time}, End={end_time}")
            return start_time, end_time

    print(f"Segment '{segment}' not found.")
    return None, None

def get_best_matches(transcription, word_timestamps, target_commands, threshold=4):
    matches = []
    transcription_words = transcription.split()
    preprocessed_phonemes = preprocess_phonemes(word_timestamps)
    word_start_end = create_word_to_phoneme_mapping(transcription, preprocessed_phonemes)

    for command in target_commands:
        min_distance = float('inf')
        best_match = None
        best_start_time = None
        best_end_time = None

        for i in range(len(transcription_words) - len(command.split()) + 1):
            segment = transcription_words[i:i + len(command.split())]
            segment_str = " ".join(segment)
            dist = levenshtein_distance(segment_str, command)
            if dist < min_distance:
                min_distance = dist
                best_start_time, best_end_time = find_segment_timestamps(segment_str, word_start_end)
                best_match = command

        if min_distance < threshold and best_start_time is not None and best_end_time is not None:
            matches.append((best_match, best_start_time, best_end_time, min_distance))
    return matches

def save_predictions_to_csv(matches, output_file):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "command", "timestamp"])
        for match in matches:
            command, start_time, end_time, _ = match
            timestamp = (start_time + end_time) / 2
            writer.writerow(["98_speech_true_Alarm_aus_Lüftung_aus_Heizung_aus", command, timestamp])

def print_matches(matches):
    print(f"{'Command':<20} {'Start Time':<10} {'End Time':<10} {'Distance':<10}")
    print("-" * 50)
    for match in matches:
        command, start_time, end_time, distance = match
        print(f"{command:<20} {start_time:<10.2f} {end_time:<10.2f} {distance:<10}")

def infer_with_timestamps(model, processor, audio_file, sr):
    y, sr = librosa.load(audio_file, sr=sr)  # Ensuring consistent sampling rate
    inputs = processor(y, return_tensors="pt", padding="longest", sampling_rate=sr).to(device)

    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids.cpu())[0]

    frame_duration = model.config.inputs_to_logits_ratio / sr
    frame_timestamps = [i * frame_duration for i in range(logits.shape[1])]

    audio_duration = len(y) / sr
    predicted_duration = frame_timestamps[-1] if frame_timestamps else 0

    print(f"Audio Duration: {audio_duration:.2f} seconds")
    print(f"Predicted Duration: {predicted_duration:.2f} seconds")

    word_timestamps = []
    current_word = ""
    current_word_start = None

    for i, token_id in enumerate(predicted_ids[0].cpu()):
        token = processor.decode([token_id])
        if token.strip() != "":
            if current_word == "":
                current_word_start = frame_timestamps[i]
            current_word += token
        else:
            if current_word != "":
                word_timestamps.append((current_word, current_word_start, frame_timestamps[i]))
                current_word = ""
                current_word_start = None

    if current_word != "":
        word_ttimestamps.append((current_word, current_word_start, frame_timestamps[-1]))

    for word, start, end in word_timestamps:
        print(f"Word: {word}, Start: {start:.2f}s, End: {end:.2f}s")
    print(f"Transcription: {transcription}")
    return transcription, word_timestamps

if __name__ == "__main__":
    data_dir = '/content/dataset'
    scenes_path = f'{data_dir}/scenes/wav'
    unseen_audio_file = f'{scenes_path}/98_speech_true_Alarm_aus_Lüftung_aus_Heizung_aus.wav'

    file_info = sf.info(unseen_audio_file)
    actual_sr = file_info.samplerate
    print(f"Sample Rate: {actual_sr} Hz")

    y, sr = librosa.load(unseen_audio_file, sr=actual_sr)
    audio_duration = librosa.get_duration(y=y, sr=sr)
    print(f"Audio Duration: {audio_duration:.2f} seconds")

    transcription, word_timestamps = infer_with_timestamps(model, processor, unseen_audio_file, sr=actual_sr)
    matches = get_best_matches(transcription, word_timestamps, target_commands, threshold=4)

    print_matches(matches)
    save_predictions_to_csv(matches, "predictions.csv")


Sample Rate: 16000 Hz
Audio Duration: 23.21 seconds
Audio Duration: 23.21 seconds
Predicted Duration: 23.18 seconds
Word: j, Start: 1.04s, End: 1.06s
Word: a, Start: 1.08s, End: 1.10s
Word: a, Start: 1.74s, End: 1.76s
Word: o, Start: 2.02s, End: 2.04s
Word: s, Start: 2.10s, End: 2.12s
Word: d, Start: 2.32s, End: 2.34s
Word: e, Start: 2.36s, End: 2.38s
Word: gg, Start: 2.92s, End: 2.96s
Word: eei, Start: 3.58s, End: 3.64s
Word: nn, Start: 3.66s, End: 3.70s
Word: aaf, Start: 3.94s, End: 4.00s
Word: t, Start: 4.18s, End: 4.20s
Word: o, Start: 4.22s, End: 4.24s
Word: f, Start: 4.40s, End: 4.42s
Word: a, Start: 4.76s, End: 4.78s
Word: ss, Start: 4.88s, End: 4.92s
Word: ii, Start: 4.94s, End: 4.98s
Word: s, Start: 5.06s, End: 5.08s
Word: waa, Start: 6.40s, End: 6.46s
Word: s, Start: 6.54s, End: 6.56s
Word: jj, Start: 7.06s, End: 7.10s
Word: a, Start: 7.16s, End: 7.18s
Word: w, Start: 7.62s, End: 7.64s
Word: i, Start: 7.66s, End: 7.68s
Word: g, Start: 7.78s, End: 7.80s
Word: e, Start: 7.86s, 

#### Whole directory prediction w/o QA

In [23]:
import csv
import os
import librosa
import numpy as np
import soundfile as sf
from Levenshtein import distance as levenshtein_distance
import torch
import re

# Assuming you have the following functions from previous code
# - preprocess_phonemes
# - create_word_to_phoneme_mapping
# - find_segment_timestamps
# - infer_with_timestamps
# - print_matches

def get_best_matches(transcription, word_timestamps, target_commands, threshold=4):
    matches = []
    transcription_words = transcription.split()
    preprocessed_phonemes = preprocess_phonemes(word_timestamps)
    word_start_end = create_word_to_phoneme_mapping(transcription, preprocessed_phonemes)

    for command in target_commands:
        min_distance = float('inf')
        best_match = None
        best_start_time = None
        best_end_time = None

        for i in range(len(transcription_words) - len(command.split()) + 1):
            segment = transcription_words[i:i + len(command.split())]
            segment_str = " ".join(segment)
            dist = levenshtein_distance(segment_str, command)
            if dist < min_distance:
                min_distance = dist
                best_start_time, best_end_time = find_segment_timestamps(segment_str, word_start_end)
                best_match = command

        if min_distance < threshold and best_start_time is not None and best_end_time is not None:
            matches.append((best_match, best_start_time, best_end_time, min_distance))
    return matches

def save_predictions_to_csv(matches, output_file, filename):
    with open(output_file, mode='a', newline='') as file:
        writer = csv.writer(file)
        for match in matches:
            command, start_time, end_time, _ = match
            timestamp = (start_time + end_time) / 2
            writer.writerow([filename, command, timestamp])

def process_audio_files(directory, model, processor, target_commands, threshold=4, output_file="predictions.csv"):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "command", "timestamp"])

    for filename in os.listdir(directory):
        if filename.endswith(".wav"):
            audio_file = os.path.join(directory, filename)

            # Check the actual sampling rate
            file_info = sf.info(audio_file)
            actual_sr = file_info.samplerate
            print(f"Processing {filename}, Sample Rate: {actual_sr} Hz")

            # Load the audio file to get its duration
            y, sr = librosa.load(audio_file, sr=actual_sr)
            audio_duration = librosa.get_duration(y=y, sr=sr)
            print(f"Audio Duration: {audio_duration:.2f} seconds")

            transcription, word_timestamps = infer_with_timestamps(model, processor, audio_file, sr=actual_sr)
            matches = get_best_matches(transcription, word_timestamps, target_commands, threshold)

            save_predictions_to_csv(matches, output_file, filename)
            print_matches(matches)

if __name__ == "__main__":
    data_dir = '/content/dataset/scenes/wav'
    target_commands = [
        "alarm aus", "lüftung aus", "heizung aus", "licht an", "licht aus", "radio an", "radio aus",
        "tür auf", "tür zu", "fenster auf", "fenster zu", "rollladen hoch", "rollladen runter",
        "musik an", "musik aus", "tv an", "tv aus", "fernseher an", "fernseher aus", "computer an",
        "computer aus", "klima an", "klima aus", "kaffeemaschine an", "kaffeemaschine aus",
        "ventilator an", "ventilator aus", "staubsauger an", "staubsauger aus", "ofen an", "ofen aus"
    ]

    process_audio_files(data_dir, model, processor, target_commands, threshold=4, output_file="predictions.csv")


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Word: oo, Start: 10.62s, End: 10.66s
Word: a, Start: 10.88s, End: 10.90s
Word: l, Start: 10.94s, End: 10.96s
Word: t, Start: 11.04s, End: 11.06s
Word: h, Start: 11.16s, End: 11.18s
Word: i, Start: 11.20s, End: 11.22s
Word: e, Start: 11.24s, End: 11.26s
Word: r, Start: 11.30s, End: 11.32s
Word: l, Start: 15.04s, End: 15.06s
Word: a, Start: 15.12s, End: 15.14s
Word: f, Start: 15.26s, End: 15.28s
Word: t, Start: 15.36s, End: 15.38s
Word: u, Start: 15.40s, End: 15.42s
Word: nngg, Start: 15.44s, End: 15.52s
Word: a, Start: 15.94s, End: 15.96s
Word: u, Start: 15.98s, End: 16.00s
Word: s, Start: 16.22s, End: 16.24s
Word: ssch, Start: 19.94s, End: 20.02s
Word: o, Start: 20.04s, End: 20.06s
Word: n, Start: 20.10s, End: 20.12s
Word: vv, Start: 20.24s, End: 20.28s
Word: ie, Start: 20.30s, End: 20.34s
Word: ll, Start: 20.38s, End: 20.42s
Word: bbee, Start: 20.50s, End: 20.58s
Word: s, Start: 20.60s, End: 20.62s
Word: 

#### Whole directory preddiction, w/ QA

In [36]:
import csv
import os
import re
import librosa
import numpy as np
import soundfile as sf
from Levenshtein import distance as levenshtein_distance
import torch
#from audio_parsing_utils import parse_full_scene_filename

# Define the list of allowed devices
allowed_devices = ["ofen", "alarm", "lüftung", "heizung", "licht", "fernseher", "staubsauger", "radio"]

def get_best_matches(transcription, word_timestamps, target_commands, threshold=4):
    matches = []
    transcription_words = transcription.split()
    preprocessed_phonemes = preprocess_phonemes(word_timestamps)
    word_start_end = create_word_to_phoneme_mapping(transcription, preprocessed_phonemes)

    for command in target_commands:
        device = command.split()[0].lower()
        if device not in allowed_devices:
            continue  # Skip commands with devices not in the allowed list

        min_distance = float('inf')
        best_match = None
        best_start_time = None
        best_end_time = None

        for i in range(len(transcription_words) - len(command.split()) + 1):
            segment = transcription_words[i:i + len(command.split())]
            segment_str = " ".join(segment)
            dist = levenshtein_distance(segment_str.lower(), command.lower())  # Case insensitive comparison
            if dist < min_distance:
                min_distance = dist
                best_start_time, best_end_time = find_segment_timestamps(segment_str, word_start_end)
                best_match = command

        if min_distance < threshold and best_start_time is not None and best_end_time is not None:
            matches.append((best_match, best_start_time, best_end_time, min_distance))
    return matches

def save_predictions_to_csv(matches, output_file, filename):
    with open(output_file, mode='a', newline='') as file:
        writer = csv.writer(file)
        for match in matches:
            command, start_time, end_time, _ = match
            timestamp = (start_time + end_time) / 2
            writer.writerow([filename, command, timestamp])

def calculate_metrics(predicted, expected):
    predicted_set = set(map(str.lower, predicted))  # Case insensitive comparison
    expected_set = set(map(str.lower, expected))    # Case insensitive comparison

    true_positives = len(predicted_set & expected_set)
    false_positives = len(predicted_set - expected_set)
    false_negatives = len(expected_set - predicted_set)
    correctly_transcribed = true_positives

    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1, true_positives, false_positives, false_negatives, correctly_transcribed

def process_audio_files(directory, model, processor, target_commands, threshold=4, output_file="predictions.csv"):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "command", "timestamp"])

    total_precision, total_recall, total_f1 = 0, 0, 0
    total_tp, total_fp, total_fn, total_ct = 0, 0, 0, 0
    num_files = 0

    for filename in os.listdir(directory):
        if filename.endswith(".wav"):
            audio_file = os.path.join(directory, filename)

            # Check the actual sampling rate
            file_info = sf.info(audio_file)
            actual_sr = file_info.samplerate
            print(f"Processing {filename}, Sample Rate: {actual_sr} Hz")

            # Load the audio file to get its duration
            y, sr = librosa.load(audio_file, sr=actual_sr)
            audio_duration = librosa.get_duration(y=y, sr=sr)
            print(f"Audio Duration: {audio_duration:.2f} seconds")

            transcription, word_timestamps = infer_with_timestamps(model, processor, audio_file, sr=actual_sr)
            matches = get_best_matches(transcription, word_timestamps, target_commands, threshold)

            save_predictions_to_csv(matches, output_file, filename)
            print_matches(matches)

            # Extract expected commands from filename
            _, _, expected_commands = parse_full_scene_filename(filename)
            predicted_commands = [match[0] for match in matches]

            precision, recall, f1, tp, fp, fn, ct = calculate_metrics(predicted_commands, expected_commands)
            total_precision += precision
            total_recall += recall
            total_f1 += f1
            total_tp += tp
            total_fp += fp
            total_fn += fn
            total_ct += ct
            num_files += 1

            print(f"Expected: {expected_commands}")
            print(f"Predicted: {predicted_commands}")
            print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
            print(f"TP: {tp}, FP: {fp}, FN: {fn}, CT: {ct}\n")

    # Calculate average metrics
    avg_precision = total_precision / num_files if num_files > 0 else 0
    avg_recall = total_recall / num_files if num_files > 0 else 0
    avg_f1 = total_f1 / num_files if num_files > 0 else 0

    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    print(f"Total TP: {total_tp}, Total FP: {total_fp}, Total FN: {total_fn}, Total CT: {total_ct}")

if __name__ == "__main__":
    data_dir = '/content/dataset/scenes/wav'
    target_commands = [
        "ofen an", "ofen aus", "alarm an", "alarm aus", "lüftung an", "lüftung aus",
        "heizung an", "heizung aus", "licht an", "licht aus", "fernseher an", "fernseher aus",
        "staubsauger an", "staubsauger aus", "radio an", "radio aus"
    ]

    process_audio_files(data_dir, model, processor, target_commands, threshold=4, output_file="predictions.csv")


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Finding segment: licht au
Segment 'licht au' found: Start=8.42, End=9.58
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: licht au
Segment 'licht au' found: Start=8.42, End=9.58
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment: licht au
Segment 'licht au' found: Start=8.42, End=9.58
Finding segment: heizug aus
Segment 'heizug aus' found: Start=5.98, End=7.3
Finding segment:

### Testing stuff, not relevant

But I didn't dare deleting it...

In [13]:
import re

# Provided word timestamps (phonemes)
words_timestamps = [
    ("j", 1.02, 1.04), ("a", 1.72, 1.74), ("e", 2.36, 2.38), ("k", 2.42, 2.44),
    ("j", 2.58, 2.60), ("e", 2.70, 2.72), ("n", 2.78, 2.80), ("e", 2.82, 2.84),
    ("g", 2.94, 2.96), ("n", 3.66, 3.68), ("r", 3.90, 3.92), ("f", 4.04, 4.06),
    ("s", 4.10, 4.12), ("t", 4.18, 4.20), ("f", 4.40, 4.42), ("k", 4.70, 4.72),
    ("aa", 4.76, 4.80), ("s", 4.90, 4.92), ("i", 4.94, 4.96), ("s", 5.04, 5.06),
    ("a", 6.42, 6.44), ("s", 6.56, 6.58), ("a", 7.14, 7.16), ("i", 7.64, 7.66),
    ("g", 7.80, 7.82), ("s", 7.98, 8.00), ("e", 8.04, 8.06), ("g", 8.12, 8.14),
    ("e", 8.16, 8.18), ("i", 8.74, 8.76), ("ch", 8.78, 8.82), ("cha", 8.86, 8.92),
    ("t", 9.02, 9.04), ("e", 9.06, 9.08), ("a", 9.12, 9.14), ("n", 9.18, 9.20),
    ("cc", 9.34, 9.38), ("ä", 9.42, 9.44), ("s", 9.56, 9.58), ("aau", 9.68, 9.74),
    ("en", 9.86, 9.90), ("d", 9.94, 9.96), ("s", 10.08, 10.10), ("li", 10.34, 10.38),
    ("s", 10.42, 10.44), ("ne", 10.48, 10.52), ("t", 10.60, 10.62), ("a", 11.52, 11.54),
    ("l", 11.56, 11.58), ("l", 11.62, 11.64), ("a", 11.76, 11.78), ("n", 11.90, 11.92),
    ("aa", 12.54, 12.58), ("u", 12.60, 12.62), ("s", 12.84, 12.86), ("l", 14.32, 14.34),
    ("ü", 14.36, 14.38), ("f", 14.48, 14.50), ("t", 14.62, 14.64), ("nnng", 14.74, 14.82),
    ("a", 15.08, 15.10), ("u", 15.14, 15.16), ("u", 15.18, 15.20), ("s", 15.36, 15.38),
    ("n", 17.56, 17.58), ("eeii", 17.60, 17.68), ("g", 17.74, 17.76), ("n", 17.86, 17.88),
    ("e", 17.90, 17.92), ("a", 18.92, 18.94), ("a", 18.96, 18.98), ("s", 19.12, 19.14),
    ("o", 19.16, 19.18), ("e", 19.94, 19.96), ("t", 20.02, 20.04), ("zz", 20.10, 20.14),
    ("u", 20.22, 20.24), ("n", 20.28, 20.30), ("g", 20.34, 20.36), ("aaauu", 20.76, 20.86),
    ("s", 21.04, 21.06)
]

# Provided transcription and command
transcription = "ja ekjenegnrfstfkasis as a i gse ge ich chateancäs auen d slisnet allan aus lüftng auus neigne aaso etzung aus"
command = "allan aus"
threshold = 4

# Levenshtein distance function
def levenshtein_distance(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

# Preprocess the phoneme list to remove double letters
def preprocess_phonemes(phonemes):
    preprocessed = []
    for phoneme, start, end in phonemes:
        processed_phoneme = re.sub(r'(.)\1+', r'\1', phoneme)  # Remove double letters
        preprocessed.append((processed_phoneme, start, end))
    return preprocessed

preprocessed_phonemes = preprocess_phonemes(words_timestamps)

# Create a sequential mapping of words to phonemes
def map_transcription_to_timestamps(words_timestamps, transcription_words):
    word_start_end = []
    current_word_index = 0
    current_word_phonemes = ""
    current_word_start_time = None

    for phoneme, start, end in words_timestamps:
        if current_word_start_time is None:
            current_word_start_time = start

        current_word_phonemes += phoneme

        # Check if the accumulated phonemes form the current word
        if current_word_phonemes == transcription_words[current_word_index]:
            word_start_end.append((transcription_words[current_word_index], current_word_start_time, end))
            current_word_index += 1
            current_word_phonemes = ""
            current_word_start_time = None

            if current_word_index >= len(transcription_words):
                break

    return word_start_end

word_start_end = map_transcription_to_timestamps(preprocessed_phonemes, transcription.split())

# Function to get best matches using the mapping
def get_best_matches(transcription_words, word_start_end, command, threshold):
    command_length = len(command.split())
    min_distance = float('inf')
    best_match = None
    best_start_time = 0
    best_end_time = 0

    for i in range(len(transcription_words) - command_length + 1):
        segment_words = transcription_words[i:i + command_length]
        segment_str = " ".join(segment_words)
        dist = levenshtein_distance(segment_str, command)

        segment_start_time = word_start_end[i][1]
        segment_end_time = word_start_end[i + command_length - 1][2]

        if dist <= threshold:
            if dist < min_distance:
                min_distance = dist
                best_match = segment_str
                best_start_time = segment_start_time
                best_end_time = segment_end_time
            print(f"Accepted Segment: {segment_str}, Distance: {dist}, Start: {segment_start_time:.2f}, End: {segment_end_time:.2f}")
        else:
            print(f"Rejected Segment: {segment_str}, Distance: {dist}, Start: {segment_start_time:.2f}, End: {segment_end_time:.2f}")

    return best_match, best_start_time, best_end_time

# Get best matches
best_match, best_start_time, best_end_time = get_best_matches(transcription.split(), word_start_end, command, threshold)

print(f"Best Match: {best_match}, Start Time: {best_start_time}, End Time: {best_end_time}")


Rejected Segment: ja ekjenegnrfstfkasis, Distance: 17, Start: 1.02, End: 5.06
Rejected Segment: ekjenegnrfstfkasis as, Distance: 18, Start: 2.36, End: 6.58
Rejected Segment: as a, Distance: 6, Start: 6.42, End: 7.16
Rejected Segment: a i, Distance: 7, Start: 7.14, End: 7.66
Rejected Segment: i gse, Distance: 8, Start: 7.64, End: 8.06
Rejected Segment: gse ge, Distance: 8, Start: 7.80, End: 8.18
Rejected Segment: ge ich, Distance: 8, Start: 8.12, End: 8.82
Rejected Segment: ich chateancäs, Distance: 11, Start: 8.74, End: 9.58
Rejected Segment: chateancäs auen, Distance: 9, Start: 8.86, End: 9.90
Rejected Segment: auen d, Distance: 6, Start: 9.68, End: 9.96
Rejected Segment: d slisnet, Distance: 9, Start: 9.94, End: 10.62
Rejected Segment: slisnet allan, Distance: 9, Start: 10.08, End: 11.92
Accepted Segment: allan aus, Distance: 0, Start: 11.52, End: 12.86
Rejected Segment: aus lüftng, Distance: 9, Start: 12.54, End: 14.82
Rejected Segment: lüftng auus, Distance: 6, Start: 14.32, End: 1

In [14]:
import re

words_timestamps = [
    ("j", 1.02, 1.04), ("a", 1.72, 1.74), ("e", 2.36, 2.38), ("k", 2.42, 2.44),
    ("j", 2.58, 2.60), ("e", 2.70, 2.72), ("n", 2.78, 2.80), ("e", 2.82, 2.84),
    ("g", 2.94, 2.96), ("n", 3.66, 3.68), ("r", 3.90, 3.92), ("f", 4.04, 4.06),
    ("s", 4.10, 4.12), ("t", 4.18, 4.20), ("f", 4.40, 4.42), ("k", 4.70, 4.72),
    ("aa", 4.76, 4.80), ("s", 4.90, 4.92), ("i", 4.94, 4.96), ("s", 5.04, 5.06),
    ("a", 6.42, 6.44), ("s", 6.56, 6.58), ("a", 7.14, 7.16), ("i", 7.64, 7.66),
    ("g", 7.80, 7.82), ("s", 7.98, 8.00), ("e", 8.04, 8.06), ("g", 8.12, 8.14),
    ("e", 8.16, 8.18), ("i", 8.74, 8.76), ("ch", 8.78, 8.82), ("cha", 8.86, 8.92),
    ("t", 9.02, 9.04), ("e", 9.06, 9.08), ("a", 9.12, 9.14), ("n", 9.18, 9.20),
    ("cc", 9.34, 9.38), ("ä", 9.42, 9.44), ("s", 9.56, 9.58), ("aau", 9.68, 9.74),
    ("en", 9.86, 9.90), ("d", 9.94, 9.96), ("s", 10.08, 10.10), ("li", 10.34, 10.38),
    ("s", 10.42, 10.44), ("ne", 10.48, 10.52), ("t", 10.60, 10.62), ("a", 11.52, 11.54),
    ("l", 11.56, 11.58), ("l", 11.62, 11.64), ("a", 11.76, 11.78), ("n", 11.90, 11.92),
    ("aa", 12.54, 12.58), ("u", 12.60, 12.62), ("s", 12.84, 12.86), ("l", 14.32, 14.34),
    ("ü", 14.36, 14.38), ("f", 14.48, 14.50), ("t", 14.62, 14.64), ("nnng", 14.74, 14.82),
    ("a", 15.08, 15.10), ("u", 15.14, 15.16), ("u", 15.18, 15.20), ("s", 15.36, 15.38),
    ("n", 17.56, 17.58), ("eeii", 17.60, 17.68), ("g", 17.74, 17.76), ("n", 17.86, 17.88),
    ("e", 17.90, 17.92), ("a", 18.92, 18.94), ("a", 18.96, 18.98), ("s", 19.12, 19.14),
    ("o", 19.16, 19.18), ("e", 19.94, 19.96), ("t", 20.02, 20.04), ("zz", 20.10, 20.14),
    ("u", 20.22, 20.24), ("n", 20.28, 20.30), ("g", 20.34, 20.36), ("aaauu", 20.76, 20.86),
    ("s", 21.04, 21.06)
]

transcription = "ja ekjenegnrfstfkasis as a i gse ge ich chateancäs auen d slisnet allan aus lüftng auus neigne aaso etzung aus"
transcription_words = transcription.split()

# Preprocess the phoneme list to remove double letters
def preprocess_phonemes(phonemes):
    preprocessed = []
    print("Preprocessing phonemes:")
    for phoneme, start, end in phonemes:
        processed_phoneme = re.sub(r'(.)\1+', r'\1', phoneme)  # Remove double letters
        print(f"Original phoneme: '{phoneme}' -> Processed phoneme: '{processed_phoneme}'")
        preprocessed.append((processed_phoneme, start, end))
    return preprocessed

preprocessed_phonemes = preprocess_phonemes(words_timestamps)

def map_transcription_to_timestamps(words_timestamps, transcription_words):
    word_start_end = []
    current_word_index = 0
    current_word_phonemes = ""
    current_word_start_time = None

    print("Mapping phonemes to transcription words:")
    for phoneme, start, end in words_timestamps:
        if current_word_start_time is None:
            current_word_start_time = start

        current_word_phonemes += phoneme

        # Print the debug information
        print(f"Current accumulated phonemes: '{current_word_phonemes}', Matching against: '{transcription_words[current_word_index]}'")

        # Check if the accumulated phonemes form the current word
        if current_word_phonemes == transcription_words[current_word_index]:
            print(f"Match found for word '{transcription_words[current_word_index]}' with start time {current_word_start_time} and end time {end}")
            word_start_end.append((transcription_words[current_word_index], current_word_start_time, end))
            current_word_index += 1
            current_word_phonemes = ""
            current_word_start_time = None

            if current_word_index >= len(transcription_words):
                break

    print("\nFinal word_start_end mapping:")
    for word, start, end in word_start_end:
        print(f"Word: {word}, Start: {start}, End: {end}")

    return word_start_end

word_start_end = map_transcription_to_timestamps(preprocessed_phonemes, transcription_words)

def find_segment_timestamps(segment, word_start_end):
    segment_words = segment.split()
    segment_length = len(segment_words)

    print(f"\nFinding segment '{segment}' in word_start_end:")
    for i in range(len(word_start_end) - segment_length + 1):
        match = True
        for j in range(segment_length):
            if word_start_end[i + j][0] != segment_words[j]:
                match = False
                break
        if match:
            start_time = word_start_end[i][1]
            end_time = word_start_end[i + segment_length - 1][2]
            return start_time, end_time

    return None, None

segment = "allan aus"
start_time, end_time = find_segment_timestamps(segment, word_start_end)

if start_time is not None and end_time is not None:
    print(f"\nSegment '{segment}' starts at {start_time} seconds and ends at {end_time} seconds.")
else:
    print(f"\nSegment '{segment}' not found in the transcription.")


Preprocessing phonemes:
Original phoneme: 'j' -> Processed phoneme: 'j'
Original phoneme: 'a' -> Processed phoneme: 'a'
Original phoneme: 'e' -> Processed phoneme: 'e'
Original phoneme: 'k' -> Processed phoneme: 'k'
Original phoneme: 'j' -> Processed phoneme: 'j'
Original phoneme: 'e' -> Processed phoneme: 'e'
Original phoneme: 'n' -> Processed phoneme: 'n'
Original phoneme: 'e' -> Processed phoneme: 'e'
Original phoneme: 'g' -> Processed phoneme: 'g'
Original phoneme: 'n' -> Processed phoneme: 'n'
Original phoneme: 'r' -> Processed phoneme: 'r'
Original phoneme: 'f' -> Processed phoneme: 'f'
Original phoneme: 's' -> Processed phoneme: 's'
Original phoneme: 't' -> Processed phoneme: 't'
Original phoneme: 'f' -> Processed phoneme: 'f'
Original phoneme: 'k' -> Processed phoneme: 'k'
Original phoneme: 'aa' -> Processed phoneme: 'a'
Original phoneme: 's' -> Processed phoneme: 's'
Original phoneme: 'i' -> Processed phoneme: 'i'
Original phoneme: 's' -> Processed phoneme: 's'
Original phonem