In [4]:
import os
from types import MethodType

import numpy as np
from pyannote.audio.models.segmentation import PyanNet
from pyannote.audio.tasks import SpeakerEmbedding
from pyannote.database import FileFinder
from pytorch_lightning import Trainer
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR

# Dataset split folders (train / test / val).
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this exact directory layout exists.
train_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/train'
test_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/test'
val_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/val'

# Function to extract speaker ID from filename
def get_speaker_id(filename):
    """Return the third underscore-separated field of ``filename``.

    Raises:
        IndexError: if ``filename`` has fewer than three underscore-separated
            fields.
    """
    fields = filename.split('_')
    third_field = fields[2]
    return third_field

# Collect (audio_path, speaker_id) pairs for every .wav entry in the train folder
train_data = []
for filename in os.listdir(train_folder):
    if not filename.endswith('.wav'):
        continue  # ignore anything that is not a .wav file
    speaker_id = get_speaker_id(filename)
    audio_path = os.path.join(train_folder, filename)
    train_data.append((audio_path, speaker_id))

# Similar loading for test and val folders
# ...

# Define the task as speaker embedding
# NOTE(review): SpeakerEmbedding is called with no arguments; this is presumably
# the call behind the "missing 1 required positional argument: 'protocol'"
# TypeError recorded in this cell's output.
task = SpeakerEmbedding()

# Build a PyanNet model for the task
# NOTE(review): no checkpoint is referenced here, so this does not obviously
# load pretrained weights. PyanNet is also imported from the segmentation
# models package — confirm it is the intended architecture.
model = PyanNet(task=task)

# Define optimizer and scheduler
def configure_optimizers(self):
    """Build the SGD optimizer and its exponential learning-rate schedule.

    Intended to be bound to a model instance (``self``) so the training loop
    can query it for optimisation settings.

    Returns:
        dict: ``{"optimizer": SGD, "lr_scheduler": ExponentialLR}`` where the
        scheduler wraps the returned optimizer with a decay factor of 0.9.
    """
    # The optimizer must be created first and kept in a local: the scheduler
    # needs a reference to the very same object that is returned.
    # lr is passed explicitly because older torch releases have no default.
    optimizer = SGD(self.parameters(), lr=1e-3)
    lr_scheduler = ExponentialLR(optimizer, 0.9)
    return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

# Bind the function above to this model instance as its
# `configure_optimizers` method (instance-level override, no subclass needed).
model.configure_optimizers = MethodType(configure_optimizers, model)

# Define a Trainer (all default settings)
trainer = Trainer()

# Fine-tune the model
# NOTE(review): no dataloaders are passed here; presumably the model obtains
# its data from the task it was built with — confirm.
trainer.fit(model)

# Now, let's assume you have obtained embeddings for test and validation sets
# embeddings_test and embeddings_val should be a list of arrays,
# where each array contains embeddings for 5 .wav files of a speaker
# For example, embeddings_test[i] contains embeddings for the i-th speaker in the test set
# and embeddings_val[j] contains embeddings for the j-th speaker in the validation set

# Function to calculate mean embedding for each speaker
def calculate_mean_embedding(embeddings):
    """Return the element-wise mean of ``embeddings`` along axis 0.

    Args:
        embeddings: Array-like accepted by ``np.mean``; presumably one row per
            utterance of a single speaker (confirm with the caller).

    Returns:
        The result of ``np.mean(embeddings, axis=0)``.
    """
    return np.mean(embeddings, axis=0)


def _percent_consistent_speakers(per_speaker_embeddings, threshold):
    """Return the percentage of speakers whose embeddings all match their mean.

    A speaker counts as consistent when the cosine similarity between each of
    its embeddings and that speaker's mean embedding is ``>= threshold``.
    """
    total_speakers = len(per_speaker_embeddings)
    if total_speakers == 0:
        # No speakers at all: report 0% instead of dividing by zero.
        return 0.0

    consistent = 0
    for embeddings in per_speaker_embeddings:
        # A single 1-D embedding is treated as a one-row matrix.
        vectors = np.atleast_2d(np.asarray(embeddings, dtype=float))
        if vectors.size == 0:
            # A speaker without embeddings cannot satisfy the criterion.
            continue
        centroid = calculate_mean_embedding(vectors)
        dots = vectors @ centroid
        norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(centroid)
        # Cosine similarity is undefined for zero-norm vectors; score those 0.
        similarities = np.divide(
            dots, norms, out=np.zeros_like(dots), where=norms > 0
        )
        if np.all(similarities >= threshold):
            consistent += 1

    return (consistent / total_speakers) * 100


def evaluate_performance(embeddings_test, embeddings_val, threshold=0.9):
    """Score how tightly each speaker's embeddings cluster around their mean.

    For each set, the score is the percentage of speakers for which every
    embedding has cosine similarity ``>= threshold`` with that speaker's mean
    embedding. Each score therefore lies in ``[0, 100]``.

    Args:
        embeddings_test: Sequence with one entry per test speaker; each entry
            is an array-like of that speaker's embeddings (one per row).
        embeddings_val: Same structure for the validation speakers.
        threshold: Minimum cosine similarity for an embedding to count as
            matching its speaker's mean.

    Returns:
        tuple: ``(accuracy_test, accuracy_val)`` as percentages. An empty set
        scores ``0.0``.
    """
    # Comparing the speaker-mean matrix with itself would always count the
    # diagonal (similarity 1.0) and could exceed 100%, so each speaker is
    # scored against its own embeddings instead.
    accuracy_test = _percent_consistent_speakers(embeddings_test, threshold)
    accuracy_val = _percent_consistent_speakers(embeddings_val, threshold)
    return accuracy_test, accuracy_val

# Example usage:
# Evaluate performance using embeddings_test and embeddings_val
# NOTE(review): `embeddings_test` and `embeddings_val` are not assigned anywhere
# in the visible cells; they must be computed before this line can run.
accuracy_test, accuracy_val = evaluate_performance(embeddings_test, embeddings_val)

# Print evaluation results
print(f"Test Set - Percentage of speakers with all embeddings above threshold: {accuracy_test}%")
print(f"Validation Set - Percentage of speakers with all embeddings above threshold: {accuracy_val}%")


TypeError: __init__() missing 1 required positional argument: 'protocol'

In [4]:
import os
from pyannote.database import FileFinder
from pyannote.audio.tasks import SpeakerEmbedding
from pyannote.audio.models.segmentation import PyanNet
from pytorch_lightning import Trainer
from types import MethodType
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR

# Dataset split folders (train / test / val).
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this exact directory layout exists.
train_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/train'
test_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/test'
val_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/val'

# Function to extract speaker ID from filename
def get_speaker_id(filename):
    """Return the third underscore-separated field of ``filename``.

    Raises:
        IndexError: if ``filename`` has fewer than three underscore-separated
            fields.
    """
    fields = filename.split('_')
    third_field = fields[2]
    return third_field

# Collect (audio_path, speaker_id) pairs for every .wav entry in the train folder
train_data = []
for filename in os.listdir(train_folder):
    if not filename.endswith('.wav'):
        continue  # ignore anything that is not a .wav file
    speaker_id = get_speaker_id(filename)
    audio_path = os.path.join(train_folder, filename)
    train_data.append((audio_path, speaker_id))

# Similar loading for test and val folders
# ...

# NOTE(review): this import is what fails in this cell — the recorded output is
# "ImportError: cannot import name 'get_database' from 'pyannote.database'".
# `get_protocol` is imported but not used in this cell.
from pyannote.database import get_protocol, get_database

# Define file finders for train, test, and val folders
# NOTE(review): the data folders themselves are passed as FileFinder's first
# argument; presumably it expects a database configuration instead — confirm
# against the pyannote.database documentation.
train_finder = FileFinder(train_folder)
test_finder = FileFinder(test_folder)
val_finder = FileFinder(val_folder)

# Initialize registry with the required database
# The same plain dict is attached to all three finders as `.registry`.
database_name = 'dummy_database'  # Replace 'dummy_database' with the actual database name
registry = {'dummy_database': get_database(database_name)}  # Fetch database from PyAnnote
train_finder.registry = registry
test_finder.registry = registry
val_finder.registry = registry

# Create protocols for train, test, and val using the dummy file
# NOTE(review): `dummy_file` is not defined anywhere in the visible cells, and
# it is unclear that calling a FileFinder returns a protocol object — confirm.
train_protocol = train_finder(dummy_file)
test_protocol = test_finder(dummy_file)
val_protocol = val_finder(dummy_file)



# Define the task as speaker embedding, driven by the train protocol
task = SpeakerEmbedding(train_protocol)

# Build a PyanNet model for the task
# NOTE(review): no checkpoint is referenced here, so this does not obviously
# load pretrained weights. PyanNet is also imported from the segmentation
# models package — confirm it is the intended architecture.
model = PyanNet(task=task)

# Define optimizer and scheduler
def configure_optimizers(self):
    """Build the SGD optimizer and its exponential learning-rate schedule.

    Intended to be bound to a model instance (``self``) so the training loop
    can query it for optimisation settings.

    Returns:
        dict: ``{"optimizer": SGD, "lr_scheduler": ExponentialLR}`` where the
        scheduler wraps the returned optimizer with a decay factor of 0.9.
    """
    # The optimizer must be created first and kept in a local: the scheduler
    # needs a reference to the very same object that is returned.
    # lr is passed explicitly because older torch releases have no default.
    optimizer = SGD(self.parameters(), lr=1e-3)
    lr_scheduler = ExponentialLR(optimizer, 0.9)
    return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

# Bind the function above to this model instance as its
# `configure_optimizers` method (instance-level override, no subclass needed).
model.configure_optimizers = MethodType(configure_optimizers, model)

# Define a Trainer (all default settings)
trainer = Trainer()

# Fine-tune the model
# NOTE(review): no dataloaders are passed here; presumably the model obtains
# its data from the task it was built with — confirm.
trainer.fit(model)


ImportError: cannot import name 'get_database' from 'pyannote.database' (/home/vaibh/anaconda3/envs/pyannote/lib/python3.8/site-packages/pyannote/database/__init__.py)

# Tried using a custom protocol

In [None]:
from pyannote.database import Protocol
from pyannote.audio.tasks import SpeakerEmbedding
import os

class CustomProtocol(Protocol):
    """Protocol that lists the ``.wav`` files of one dataset subset folder."""

    def __init__(self, folder_path, subset, database):
        """Remember where the subset lives and which database name to report.

        Args:
            folder_path: Root folder that contains the subset folders.
            subset: Name of the subset folder under ``folder_path``.
            database: Value stored under the ``'database'`` key of every
                yielded item.
        """
        super().__init__()
        self.folder_path = folder_path
        self.subset = subset
        self.database = database

    def train_iter(self):
        """Yield one dict per ``.wav`` file found in the subset folder.

        Each dict has the keys ``'uri'`` (full path of the file),
        ``'speaker_id'`` (third underscore-separated field of the file name)
        and ``'database'``.
        """
        subset_path = os.path.join(self.folder_path, self.subset)
        wav_names = (
            entry for entry in os.listdir(subset_path) if entry.endswith('.wav')
        )
        for wav_name in wav_names:
            yield {
                'uri': os.path.join(subset_path, wav_name),
                'speaker_id': wav_name.split('_')[2],
                'database': self.database,
            }

# Root folder of the dataset; the subset folders below live directly under it.
# NOTE(review): absolute, machine-specific path.
folder_path = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip'

# Define the subsets (folder names under `folder_path`)
train_subset = 'train'
test_subset = 'test'
val_subset = 'val'

# Specify the database name
# NOTE(review): 'your_database_name' looks like an unfilled placeholder.
database_name = 'your_database_name'

# Create instances of the custom protocol for each subset
train_protocol = CustomProtocol(folder_path, train_subset, database_name)
test_protocol = CustomProtocol(folder_path, test_subset, database_name)
val_protocol = CustomProtocol(folder_path, val_subset, database_name)

# Build one SpeakerEmbedding task per protocol.
# NOTE(review): the `seg_` prefix suggests segmentation, but these variables
# hold SpeakerEmbedding tasks.
seg_train = SpeakerEmbedding(protocol=train_protocol)
seg_test = SpeakerEmbedding(protocol=test_protocol)
seg_val = SpeakerEmbedding(protocol=val_protocol)


# Tried using a custom dataset

In [13]:
import os
import pyannote.database.protocol as protocol

class SpeakerEmbeddingProtocol(protocol.Protocol):
    """Protocol for the speaker embedding task."""

    def __init__(self, path):
        """Forward ``path`` to the base protocol as its ``uri``."""
        super(SpeakerEmbeddingProtocol, self).__init__(uri=path)

    def itertracks(self):
        """Yield ``(filename, speaker_id)`` for each ``.wav`` entry in ``self.uri``.

        The speaker ID is the third underscore-separated field of the file
        name, so names need at least three such fields.
        """
        wav_names = (
            entry for entry in os.listdir(self.uri) if entry.endswith('.wav')
        )
        for wav_name in wav_names:
            yield wav_name, wav_name.split('_')[2]

# Register the protocol
# NOTE(review): this call fails — the recorded output for this cell is
# "AttributeError: module 'pyannote.database.protocol' has no attribute
# 'register_protocol'". Nothing below this line in the cell gets executed.
protocol.register_protocol("SpeakerEmbedding", SpeakerEmbeddingProtocol)


import os
import torch
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR
from pyannote.database import FileFinder
from pyannote.audio.tasks import SpeakerEmbedding
from pyannote.audio.models.segmentation import PyanNet
from types import MethodType
from pyannote.database import get_protocol

class CustomSpeakerEmbeddingDataset(Dataset):
    """Dataset of ``(audio features, speaker ID)`` samples from a flat folder.

    Every ``.wav`` file directly inside the folder is one sample. The speaker
    ID is the third underscore-separated field of the file name.
    """

    def __init__(self, folder_path: str):
        """
        Initialize the dataset

        Args:
            folder_path (str): Path to the folder containing the dataset
        """
        self.folder_path = folder_path
        self.audio_files = self._load_data()

    def _load_data(self) -> List[Tuple[str, str]]:
        """
        Load data from the dataset folder

        Returns:
            List of tuples containing audio file paths and speaker IDs,
            ordered by file name

        Raises:
            IndexError: if a ``.wav`` file name has fewer than three
                underscore-separated fields
        """
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> sample mapping reproducible across runs and machines.
        return [
            (os.path.join(self.folder_path, filename), filename.split('_')[2])
            for filename in sorted(os.listdir(self.folder_path))
            if filename.endswith('.wav')
        ]

    def __len__(self) -> int:
        """Get the total number of samples in the dataset"""
        return len(self.audio_files)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, str]:
        """
        Get a sample from the dataset

        Args:
            idx (int): Index of the sample

        Returns:
            Tuple containing the audio features tensor and the speaker ID
        """
        audio_path, speaker_id = self.audio_files[idx]
        # Placeholder: the audio at `audio_path` is not read yet. A random
        # (1, 10) tensor stands in for real features until feature extraction
        # is implemented.
        audio_features = torch.rand(1, 10)  # Random tensor (replace with actual feature extraction)
        return audio_features, speaker_id


# Dataset split folders (train / test / val).
# NOTE(review): these are absolute, machine-specific paths.
train_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/train'
test_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/test'
val_folder = '/home/vaibh/Vaani/Speaker_ID/Dataset/shaip/val'

# Define file finders for train, test, and val folders
# NOTE(review): the data folders themselves are passed as FileFinder's first
# argument; presumably it expects a database configuration instead — confirm.
train_finder = FileFinder(train_folder)
test_finder = FileFinder(test_folder)
val_finder = FileFinder(val_folder)

# Create custom datasets (one per split folder)
train_dataset = CustomSpeakerEmbeddingDataset(train_folder)
test_dataset = CustomSpeakerEmbeddingDataset(test_folder)
val_dataset = CustomSpeakerEmbeddingDataset(val_folder)

# Fetch the protocol for the train, test, and val folders
# NOTE(review): get_protocol is given FileFinder objects here; presumably it
# expects a protocol name — confirm against the pyannote.database docs.
train_protocol = get_protocol(train_finder)
test_protocol = get_protocol(test_finder)
val_protocol = get_protocol(val_finder)

# Define the task as speaker embedding using the train protocol
task = SpeakerEmbedding(train_protocol)

# Build a PyanNet model for the task
# NOTE(review): no checkpoint is referenced here, so this does not obviously
# load pretrained weights — confirm.
model = PyanNet(task=task)

# Define optimizer and scheduler
def configure_optimizers(self):
    """Return the SGD optimizer and the exponential LR schedule wrapping it.

    Returns:
        dict: ``{"optimizer": ..., "lr_scheduler": ...}``.
    """
    # Learning rate of 1e-3; adjust as needed. The schedule decays it by 0.9.
    sgd = SGD(self.parameters(), lr=1e-3)
    return {"optimizer": sgd, "lr_scheduler": ExponentialLR(sgd, 0.9)}

# Bind the function above to this model instance as its
# `configure_optimizers` method (instance-level override, no subclass needed).
model.configure_optimizers = MethodType(configure_optimizers, model)

# Define data loaders (only the training loader is shuffled)
# NOTE(review): `test_loader` and `val_loader` are created but not used in the
# visible code.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

# Define a Trainer
# NOTE(review): `Trainer` is not imported in this cell; it is only available if
# an earlier cell that imports it from pytorch_lightning has been run.
trainer = Trainer()

# Fine-tune the model
# NOTE(review): the keyword name for the training loader may differ between
# pytorch_lightning versions — confirm against the installed release.
trainer.fit(model, train_dataloader=train_loader)

# Save the trained model's weights (state dict only) to the working directory
torch.save(model.state_dict(), 'speaker_embedding_model.pt')


AttributeError: module 'pyannote.database.protocol' has no attribute 'register_protocol'