In [7]:
import os
import numpy as np
import pandas as pd
import librosa
from scipy import signal
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from abc import ABC, abstractmethod

import torch
import torchaudio
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer
from pystoi import stoi
import nltk
from nltk.corpus import cmudict
from scipy import signal
import librosa
from sklearn.decomposition import PCA


In [8]:
import os
import json
import torch
import numpy as np




class LogisticRegressionClassifier(torch.nn.Module):
    def __init__(self, feature_dim=129):
        super(LogisticRegressionClassifier, self).__init__()
        self.linear1 = torch.nn.Linear(feature_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, xx):
        return self.sigmoid(self.linear1(torch.nn.functional.normalize(xx, p=1.0, dim=1)))


def load_kirigami_model():
    # Phoneme Model!
    my_phoneme_filter_model = LogisticRegressionClassifier(feature_dim=129)
    my_phoneme_filter_model.load_state_dict(
    torch.load(lr_phoneme_checkpoint_path, map_location=torch.device('cpu')))
    my_phoneme_filter_model.eval()

    return my_phoneme_filter_model


def load_background_filter_model():
    # Background Model!
    my_background_filter_model = LogisticRegressionClassifier(feature_dim=129)
    my_background_filter_model.load_state_dict(torch.load(bg_lr_checkpoint_path, map_location=torch.device('cpu')))
    my_background_filter_model.eval()

    return my_background_filter_model


def kirigami_filter_torch(s_full, threshold=0.5):
    lr_phoneme_filter_model = LogisticRegressionClassifier(feature_dim=129)
    # load the model if in kirigami_filters directory
    if os.path.exists("./kirigami_filters/scipy_phoneme_filter.ckpt"):
        lr_phoneme_filter_model.load_state_dict(
            torch.load("./kirigami_filters/scipy_phoneme_filter.ckpt", map_location=device))
    else:
        raise FileNotFoundError("Phoneme filter model not found")

    lr_phoneme_filter_model.eval()
    pred = (lr_phoneme_filter_model.forward(torch.Tensor(s_full)) >= threshold).long().numpy()
    masked = (1 - pred) * s_full
    return masked


def kirigami_filter(stft):
    output_sp = np.zeros_like(stft)
    for i, fft in enumerate(stft):

        sum = np.sum(fft)

        product = 0
        for iw, (vv, ww) in enumerate(zip(fft, weight)):
            product = product + vv * weight[iw]
        product = product / sum
        product = product + bias

        z = 1 / (1 + np.exp(-product))
        # print("LR filter probability", i, z)
        if z < LR_THRESHOLD:
            # add the value
            output_sp[i] = stft[i]
    return output_sp


def kirigami_filter_reverse_fft(stft, stft_original):
    output_sp = np.zeros_like(stft)
    for i, fft in enumerate(stft):
        sum = np.sum(fft)
        product = 0
        for iw, (vv, ww) in enumerate(zip(fft, weight)):
            product = product + vv * weight[iw]
        product = product / sum
        product = product + bias
        z = 1 / (1 + np.exp(-product))
        # print("LR filter probability", i, z)
        if z < LR_THRESHOLD:
            # add the value
            # output_sp[i] = stft[i]
            output_sp[i] = stft_original[i]
    return output_sp


def background_detection_filter(stft):
    output_sp = np.zeros_like(stft)
    for i, fft in enumerate(stft):
        sum = np.sum(fft)
        product = 0
        for iw, (vv, ww) in enumerate(zip(fft, weight_background)):
            product = product + vv * weight_background[iw]
        product = product // sum
        product = product + bias_background
        z = 1 / (1 + np.exp(-product))
        # print("Background probability: ", i, z)
        if z < BACKGROUND_LR_THRESHOLD:  # lower than threshold not background.
            # add the value
            output_sp[i] = stft[i]
    return output_sp


import os
import librosa
import numpy as np
import torch
import soundfile as sf
import matplotlib.pyplot as plt

# Load Kirigami models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_kirigami_model():
    """Load the phoneme filter model."""
    model = LogisticRegressionClassifier(feature_dim=129)
    model.load_state_dict(torch.load("kirigami_filters/model_checkpoints/scipy_phoneme_filter.ckpt", map_location=device))
    model.eval()
    return model

def load_background_filter_model():
    """Load the background noise filter model."""
    model = LogisticRegressionClassifier(feature_dim=129)
    model.load_state_dict(torch.load("kirigami_filters/model_checkpoints/noisy_background_scipy_detector.ckpt", map_location=device))
    model.eval()
    return model

# Load models
phoneme_filter_model = load_kirigami_model()
background_filter_model = load_background_filter_model()

# Extract model weights
weight_phoneme = phoneme_filter_model.linear1.weight.data[0].numpy()
bias_phoneme = phoneme_filter_model.linear1.bias.data[0].numpy()
weight_background = background_filter_model.linear1.weight.data[0].numpy()
bias_background = background_filter_model.linear1.bias.data[0].numpy()



def apply_kirigami_filter(stft, weight, bias, threshold=0.5):
    """Apply Kirigami logistic regression filter on STFT features (ensuring 129 features per frame)."""
    output_stft = np.zeros_like(stft)

    for i, frame in enumerate(stft):
        frame = frame[:129]  # Ensure exactly 129 dimensions

        sum_val = np.sum(frame) + 1e-6  # Avoid division by zero
        product = np.dot(frame, weight) / sum_val + bias
        prob = 1 / (1 + np.exp(-product))  # Sigmoid activation

        if prob < threshold:  # If probability is low, keep frame
            output_stft[i, :129] = frame  # Apply only to the valid region

    return output_stft

# Process Audio File
def Kirigami_process_audio( audio, sr=16000, threshold=0.5):
    """Process an audio file through Kirigami models and save the filtered output."""
    # Load audio
    # audio, sr = librosa.load(input_audio_path, sr=16000)

    # Compute STFT (Ensure output has 129 feature bins)
    stft = np.abs(librosa.stft(audio, n_fft=256, hop_length=128))[:129, :].T  # Transpose for correct shape

    # Apply Kirigami phoneme & background filters
    filtered_stft_phoneme = apply_kirigami_filter(stft, weight_phoneme, bias_phoneme, threshold)
    # filtered_stft_background = self.apply_kirigami_filter(filtered_stft_phoneme, weight_background, bias_background, threshold)

    # Convert back to audio using inverse STFT
    filtered_audio = librosa.istft(filtered_stft_phoneme.T, hop_length=128)

    return filtered_audio





  model.load_state_dict(torch.load("kirigami_filters/model_checkpoints/scipy_phoneme_filter.ckpt", map_location=device))
  model.load_state_dict(torch.load("kirigami_filters/model_checkpoints/noisy_background_scipy_detector.ckpt", map_location=device))


In [9]:
import torch
import torchaudio



class TIMITEvaluator:
    def __init__(self):
        self.device = torch.device("cuda")
        print(f"Using device: {self.device}")
        
        try:
            self.cmudict = cmudict.dict()
        except LookupError:
            nltk.download('cmudict')
            self.cmudict = cmudict.dict()
        
        model_name = "facebook/wav2vec2-base-960h"
        print(f"Loading {model_name}...")
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(self.device)
        self.model.eval()
        self.sample_rate = 16000

        self.phoneme_map = {
            'aa': 'aa', 'ae': 'ae', 'ah': 'ah', 'ao': 'aa', 'aw': 'aw',
            'ax': 'ah', 'ax-h': 'ah', 'axr': 'er', 'ay': 'ay', 'b': 'b',
            'bcl': 'b', 'ch': 'ch', 'd': 'd', 'dcl': 'd', 'dh': 'dh',
            'dx': 't', 'eh': 'eh', 'el': 'l', 'em': 'm', 'en': 'n',
            'eng': 'ng', 'epi': '', 'er': 'er', 'ey': 'ey', 'f': 'f',
            'g': 'g', 'gcl': 'g', 'h#': '', 'hh': 'hh', 'hv': 'hh',
            'ih': 'ih', 'ix': 'ih', 'iy': 'iy', 'jh': 'jh', 'k': 'k',
            'kcl': 'k', 'l': 'l', 'm': 'm', 'n': 'n', 'ng': 'ng',
            'nx': 'n', 'ow': 'ow', 'oy': 'oy', 'p': 'p', 'pau': '',
            'pcl': 'p', 'q': '', 'r': 'r', 's': 's', 'sh': 'sh',
            't': 't', 'tcl': 't', 'th': 'th', 'uh': 'uh', 'uw': 'uw',
            'ux': 'uw', 'v': 'v', 'w': 'w', 'y': 'y', 'z': 'z', 'zh': 'zh'
        }

        self.cmu_to_timit = {
            'AA': 'aa', 'AE': 'ae', 'AH': 'ah', 'AO': 'aa', 'AW': 'aw',
            'AY': 'ay', 'B': 'b', 'CH': 'ch', 'D': 'd', 'DH': 'dh',
            'EH': 'eh', 'ER': 'er', 'EY': 'ey', 'F': 'f', 'G': 'g',
            'HH': 'hh', 'IH': 'ih', 'IY': 'iy', 'JH': 'jh', 'K': 'k',
            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ng', 'OW': 'ow',
            'OY': 'oy', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'sh',
            'T': 't', 'TH': 'th', 'UH': 'uh', 'UW': 'uw', 'V': 'v',
            'W': 'w', 'Y': 'y', 'Z': 'z', 'ZH': 'zh'
        }

    def get_phonemes(self, phn_file):
        phonemes = []
        with open(phn_file, 'r') as f:
            for line in f:
                _, _, phone = line.strip().split()
                phone = phone.lower()
                mapped_phone = self.phoneme_map.get(phone, phone)
                if mapped_phone:
                    phonemes.append(mapped_phone)
        return self.normalize_phoneme_sequence(phonemes)

    def normalize_phoneme_sequence(self, phonemes):
        phonemes = [p for p in phonemes if p]
        normalized = []
        for i, phone in enumerate(phonemes):
            if i == 0 or phone != phonemes[i-1]:
                normalized.append(phone)
        return normalized

    def convert_to_phonemes(self, text):
        phones = []
        for word in text.lower().split():
            if word in self.cmudict:
                word_phones = self.cmudict[word][0]
                timit_phones = [self.cmu_to_timit[p.rstrip('0123456789')] 
                              for p in word_phones]
                phones.extend(timit_phones)
            else:
                char_phones = self.convert_word_to_phonemes(word)
                phones.extend(char_phones)
        return self.normalize_phoneme_sequence(phones)

    def convert_word_to_phonemes(self, word):
        phones = []
        i = 0
        while i < len(word):
            if i < len(word) - 1:
                digraph = word[i:i+2]
                if digraph in ['th', 'ch', 'sh', 'ph', 'wh', 'gh']:
                    phones.append('th' if digraph == 'th' else
                                'ch' if digraph == 'ch' else
                                'sh' if digraph == 'sh' else
                                'f' if digraph == 'ph' else
                                'w' if digraph == 'wh' else 'g')
                    i += 2
                    continue
            
            c = word[i]
            if c in 'aeiou':
                phones.append('ae' if c == 'a' else
                            'eh' if c == 'e' else
                            'ih' if c == 'i' else
                            'ow' if c == 'o' else 'uh')
            elif c in self.phoneme_map:
                phones.append(self.phoneme_map[c])
            elif c in 'bcdfghjklmnpqrstvwxyz':
                phones.append(c)
            i += 1
        return phones

    def calculate_per(self, ref_phones, pred_phones):
        if not ref_phones:
            return 0.0

        ref_phones = self.normalize_phoneme_sequence(ref_phones)
        pred_phones = self.normalize_phoneme_sequence(pred_phones)

        R = len(ref_phones)
        H = len(pred_phones)
        D = np.zeros((R + 1, H + 1))
        
        for i in range(R + 1):
            D[i, 0] = i
        for j in range(H + 1):
            D[0, j] = j
        
        for i in range(1, R + 1):
            for j in range(1, H + 1):
                if ref_phones[i-1] == pred_phones[j-1]:
                    D[i, j] = D[i-1, j-1]
                else:
                    sub_cost = 1.0
                    if self.are_similar_phones(ref_phones[i-1], pred_phones[j-1]):
                        sub_cost = 0.5
                    D[i, j] = min(D[i-1, j] + 1.0,
                                D[i, j-1] + 1.0,
                                D[i-1, j-1] + sub_cost)
        
        return (D[R, H] / R) * 100 if R else 0.0

    def are_similar_phones(self, phone1, phone2):
        groups = [
            {'aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'eh', 'er', 'ey', 'ih', 'iy', 'ow', 'oy', 'uh', 'uw'},
            {'b', 'p', 'd', 't', 'g', 'k'},
            {'f', 'v', 'th', 'dh', 's', 'z', 'sh', 'zh', 'hh'},
            {'m', 'n', 'ng'},
            {'l', 'r'},
            {'w', 'y'}
        ]
        return any(phone1 in group and phone2 in group for group in groups)

    def apply_samosa(self, audio, sr=16000):
        subsampled_audio = signal.resample(audio, int(len(audio) * 1000 / sr))
        upsampled_audio = signal.resample(subsampled_audio, len(audio))
        return upsampled_audio

    def apply_synthetic_sensor(self, audio, sr=16000):
        window_length = 256
        hop_length = 128
        samples_per_window = 10

        frames = [
            audio[i:i + window_length]
            for i in range(0, len(audio) - window_length + 1, hop_length)
        ]
        
        processed_audio = np.zeros(len(audio))
        
        for i, frame in enumerate(frames):
            fft = np.fft.fft(frame, n=window_length)
            bins = np.array_split(fft, samples_per_window)
            reduced_fft = np.array([np.mean(bin) for bin in bins])
            reconstructed_fft = np.zeros_like(fft, dtype=np.complex_)
            step = len(fft) // len(reduced_fft)
            for j, val in enumerate(reduced_fft):
                reconstructed_fft[j * step:(j + 1) * step] = val
            processed_frame = np.fft.ifft(reconstructed_fft).real
            start = i * hop_length
            end = min(start + window_length, len(processed_audio))
            processed_audio[start:end] += processed_frame[:end - start]
        
        if np.max(np.abs(processed_audio)) > 0:
            processed_audio = processed_audio / np.max(np.abs(processed_audio))
            
        return processed_audio

    def apply_privacymic(self, audio, sr=16000):
        S = librosa.stft(audio, n_fft=256, hop_length=128)
        frequencies = librosa.fft_frequencies(sr=sr, n_fft=256)
        S_filtered = np.where(frequencies[:, None] <= 300, S, 0)
        return librosa.istft(S_filtered, hop_length=128)
    
    def apply_kirigami_filter(self, stft, weight, bias, threshold=0.5):
        """Apply Kirigami logistic regression filter on STFT features (ensuring 129 features per frame)."""
        output_stft = np.zeros_like(stft)

        for i, frame in enumerate(stft):
            frame = frame[:129]  # Ensure exactly 129 dimensions

            sum_val = np.sum(frame) + 1e-6  # Avoid division by zero
            product = np.dot(frame, weight) / sum_val + bias
            prob = 1 / (1 + np.exp(-product))  # Sigmoid activation

            if prob < threshold:  # If probability is low, keep frame
                output_stft[i, :129] = frame  # Apply only to the valid region

        return output_stft

    # Process Audio File
    def Kirigami_process_audio(self,  audio, sr=16000, threshold=0.5):
        """Process an audio file through Kirigami models and save the filtered output."""
        # Load audio
        # audio, sr = librosa.load(input_audio_path, sr=16000)

        # Compute STFT (Ensure output has 129 feature bins)
        stft = np.abs(librosa.stft(audio, n_fft=256, hop_length=128))[:129, :].T  # Transpose for correct shape

        # Apply Kirigami phoneme & background filters
        filtered_stft_phoneme = self.apply_kirigami_filter(stft, weight_phoneme, bias_phoneme, threshold)
        # filtered_stft_background = self.apply_kirigami_filter(filtered_stft_phoneme, weight_background, bias_background, threshold)

        # Convert back to audio using inverse STFT
        filtered_audio = librosa.istft(filtered_stft_phoneme.T, hop_length=128)

        return filtered_audio

    def apply_coughsense_withphase(self, audio, sr=16000):
        """CoughSense filtering with phase preservation"""
        # Calculate window size for 150ms window
        window_size = int(0.150 * sr)  # 150ms window in samples
        hop_length = window_size//2  # stride size
        
        # Compute STFT
        S = librosa.stft(audio, n_fft=window_size, hop_length=hop_length, win_length=window_size)
        
        # Get magnitude spectrogram
        mag_spec = np.abs(S)
        
        # Reshape for PCA
        features = mag_spec.T
        
        # Apply PCA
        pca = PCA(n_components=10)
        reduced_features = pca.fit_transform(features)
        
        # Reconstruct
        reconstructed_features = pca.inverse_transform(reduced_features)
        
        # Reshape back to STFT format
        reconstructed_stft = reconstructed_features.T
        
        # Preserve phase information
        phase = np.angle(S)
        reconstructed_complex = reconstructed_stft * np.exp(1j * phase)
        
        # Inverse STFT
        filtered_audio = librosa.istft(reconstructed_complex, 
                                     hop_length=hop_length, 
                                     win_length=window_size)
        
        # Ensure output length matches input
        if len(filtered_audio) > len(audio):
            filtered_audio = filtered_audio[:len(audio)]
        elif len(filtered_audio) < len(audio):
            filtered_audio = np.pad(filtered_audio, (0, len(audio) - len(filtered_audio)))
            
        return filtered_audio

    def apply_coughsense_withoutphase(self, audio, sr=16000):
        """CoughSense filtering without phase preservation"""
        # Use a 150 ms window and 50% overlap
        n_fft = int(0.15 * sr)  # 150 ms window
        hop_length = n_fft // 2  # 50% overlap
        
        # Compute spectrogram
        S = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length, window='hamming'))
        
        # Flatten and apply PCA
        S_flattened = S.T  # Transpose for PCA
        
        # Apply PCA
        pca = PCA(n_components=10)  # Retain 10 components
        S_reduced = pca.fit_transform(S_flattened)
        S_reconstructed = pca.inverse_transform(S_reduced).T
        
        # Inverse STFT
        filtered_audio = librosa.istft(S_reconstructed, 
                                     hop_length=hop_length, 
                                     win_length=n_fft)
                                     
        # Ensure output length matches input
        if len(filtered_audio) > len(audio):
            filtered_audio = filtered_audio[:len(audio)]
        elif len(filtered_audio) < len(audio):
            filtered_audio = np.pad(filtered_audio, (0, len(audio) - len(filtered_audio)))
            
        return filtered_audio

    def calculate_estoi(self, clean_audio, processed_audio, sr=16000):
        """Calculate extended Short-Time Objective Intelligibility (eSTOI)."""
        min_length = min(len(clean_audio), len(processed_audio))
        clean_audio = clean_audio[:min_length]
        processed_audio = processed_audio[:min_length]
        return stoi(clean_audio, processed_audio, sr, extended=True)


    def process_file(self, wav_path, method=None):
        try:
            # Load audio
            audio, sr = torchaudio.load(wav_path)
            audio = torch.mean(audio, dim=0) if len(audio.shape) > 1 else audio
            if sr != self.sample_rate:
                audio = torchaudio.functional.resample(audio, sr, self.sample_rate)
            audio = audio.numpy()

            # Store original audio for ESTOI calculation
            original_audio = audio.copy()

            # Apply processing method
            if method == "samosa":
                processed_audio = self.apply_samosa(audio, self.sample_rate)
            elif method == "synthetic_sensor":
                processed_audio = self.apply_synthetic_sensor(audio, self.sample_rate)
            elif method == "privacymic":
                processed_audio = self.apply_privacymic(audio, self.sample_rate)
            elif method == "coughsense_withphase":
                processed_audio = self.apply_coughsense_withphase(audio, self.sample_rate)
            elif method == "coughsense_withoutphase":
                processed_audio = self.apply_coughsense_withoutphase(audio, self.sample_rate)
            elif method == "Kirigami_process_file":
                processed_audio = self.Kirigami_process_audio(audio )
            else:
                processed_audio = audio

            # Calculate ESTOI
            sample_estoi = self.calculate_estoi(original_audio, processed_audio, self.sample_rate) * 100

            # Get ASR prediction using processed audio
            inputs = self.processor(processed_audio, sampling_rate=self.sample_rate, 
                                  return_tensors="pt", padding=True).to(self.device)
            with torch.no_grad():
                outputs = self.model(inputs.input_values)
                pred_ids = torch.argmax(outputs.logits, dim=-1)
                pred_text = self.processor.batch_decode(pred_ids)[0].lower()

            # Get reference transcriptions
            phn_path = wav_path.replace('.WAV', '.PHN')
            wrd_path = wav_path.replace('.WAV', '.WRD')
            
            # Get reference phonemes and text
            ref_phones = self.get_phonemes(phn_path)
            with open(wrd_path, 'r') as f:
                true_text = ' '.join([line.strip().split()[-1] for line in f]).lower()

            # Convert predicted text to phonemes
            pred_phones = self.convert_to_phonemes(pred_text)

            # Calculate metrics
            sample_wer = wer(true_text, pred_text) * 100
            sample_per = self.calculate_per(ref_phones, pred_phones)

            return {
                "file": os.path.basename(wav_path),
                "true_text": true_text,
                "pred_text": pred_text,
                "ref_phones": ref_phones,
                "pred_phones": pred_phones,
                "wer": sample_wer,
                "per": sample_per,
                "estoi": sample_estoi
            }

        except Exception as e:
            print(f"Error processing {wav_path}: {e}")
            return None
        
    def process_dataset(self, dataset_path, method=None):
        results = []

        for root, _, files in os.walk(dataset_path):
            for file in sorted(files):
                if file.endswith('.WAV'):
                    if file_count>=max_files:
                        return results
                    wav_path = os.path.join(root, file)
                    result = self.process_file(wav_path, method)
                    if result:
                        results.append(result)
                    
        return results
    
def main():
    data_path = '/work/pi_shenoy_umass_edu/sgomasta_umass_edu/WER-PER/TIMIT/data/'
    evaluator = TIMITEvaluator()

    methods = [None, "samosa", "synthetic_sensor", "privacymic", 
              "coughsense_withphase", "coughsense_withoutphase", "Kirigami_process_Audio"]
    results_summary = {}

    for method in methods:
        print(f"\nProcessing method: {method or 'raw'}")
        results = evaluator.process_dataset(data_path, method)
        df = pd.DataFrame(results)

        if not df.empty:
            method_name = method if method else "Raw Audio"
            if method in {
                "Kirigami_process_Audio": "Kirigami",
                "samosa": "SAMOSA",
                "synthetic_sensor": "Synthetic Sensor",
                "privacymic": "PrivacyMic",
                "coughsense_withphase": "CoughSense (with phase)",
                "coughsense_withoutphase": "CoughSense (without phase)"
            }:
                method_name = {
                    "Kirigami_process_Audio": "Kirigami",
                    "samosa": "SAMOSA",
                    "synthetic_sensor": "Synthetic Sensor",
                    "privacymic": "PrivacyMic",
                    "coughsense_withphase": "CoughSense (with phase)",
                    "coughsense_withoutphase": "CoughSense (without phase)"
                }[method]
            
            avg_wer = df['wer'].mean()
            avg_per = df['per'].mean()
            avg_estoi = df['estoi'].mean()
            
            print(f"\nMethod: {method_name}")
            print(f"Average WER: {avg_wer:.2f}%")
            print(f"Average PER: {avg_per:.2f}%")
            print(f"Average ESTOI: {avg_estoi:.2f}%")
            
            results_summary[method_name] = {
                'WER': avg_wer,
                'PER': avg_per,
                'ESTOI': avg_estoi
            }

    print("\nFinal Summary of Results:")
    for method_name, metrics in results_summary.items():
        print(f"\nMethod: {method_name}")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.2f}%")

if __name__ == "__main__":
    main()
       

Using device: cuda
Loading facebook/wav2vec2-base-960h...


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho


Processing method: raw

Method: Raw Audio
Average WER: 12.84%
Average PER: 17.38%
Average ESTOI: 100.00%

Processing method: samosa

Method: SAMOSA
Average WER: 98.28%
Average PER: 79.87%
Average ESTOI: 33.26%

Processing method: synthetic_sensor

Method: Synthetic Sensor
Average WER: 100.82%
Average PER: 83.08%
Average ESTOI: 24.62%

Processing method: privacymic

Method: PrivacyMic
Average WER: 100.00%
Average PER: 100.00%
Average ESTOI: 15.58%

Processing method: coughsense_withphase

Method: CoughSense (with phase)
Average WER: 26.05%
Average PER: 23.95%
Average ESTOI: 80.23%

Processing method: coughsense_withoutphase

Method: CoughSense (without phase)
Average WER: 98.75%
Average PER: 79.80%
Average ESTOI: 17.73%

Processing method: Kirigami_process_Audio

Method: Kirigami
Average WER: 12.84%
Average PER: 17.38%
Average ESTOI: 100.00%

Final Summary of Results:

Method: Raw Audio
WER: 12.84%
PER: 17.38%
ESTOI: 100.00%

Method: SAMOSA
WER: 98.28%
PER: 79.87%
ESTOI: 33.26%

Method