In [11]:
import os

import pytorch_lightning as pl
from pyannote.audio import Inference
from pyannote.audio.models.segmentation import PyanNet
from pyannote.audio.pipelines import VoiceActivityDetection as VoiceActivityDetectionPipeline
from pyannote.audio.tasks import VoiceActivityDetection
from pyannote.core import Annotation
from pyannote.database import registry, get_protocol, FileFinder
from pyannote.metrics.detection import DetectionErrorRate
from pyannote.pipeline import Optimizer

from utils import Get_RTTM
from utils import TXT_RTTM_Transform

In [12]:
# Register the project's database YAML with pyannote.database and publish the
# same location through PYANNOTE_DATABASE_CONFIG. The path is assembled once
# and reused so the two consumers cannot drift apart.
database_yaml = os.path.join(
    '/mnt/', 'e', 'Files', 'Acoustic_Data', 'Datasets', 'yaml', 'My_Databases.yml'
)
registry.load_database(database_yaml)
# NOTE(review): the variable is set after pyannote.database was imported;
# confirm whether anything in this process still reads it at this point.
os.environ["PYANNOTE_DATABASE_CONFIG"] = database_yaml

'My_datasets.SpeakerDiarization.Detection' found in /mnt/e/Files/Acoustic_Data/Datasets/yaml/My_Databases.yml does not define the 'scope' of speaker labels (file, database, or global). Setting it to 'file'.


In [13]:
# Resolve the detection protocol; FileFinder is registered as the preprocessor
# for each file's "audio" key.
preprocessors = dict(audio=FileFinder())
cow_audio = get_protocol(
    'My_datasets.SpeakerDiarization.Detection',
    preprocessors=preprocessors,
)

# Pull the first item of the training subset.
training_files = cow_audio.train()
first_training_file = next(training_files)

In [14]:
# Train the VAD model
# Seed the Python, NumPy and torch RNGs before the model is built so that
# weight initialisation (and any other RNG use during training) starts from a
# known state and the run can be repeated after a kernel restart.
SEED = 42
pl.seed_everything(SEED, workers=True)

# Task definition: training chunks and batch size come from these arguments.
vad = VoiceActivityDetection(cow_audio, duration=1, batch_size=16)
model = PyanNet(sincnet={'stride': 10}, task=vad)
# NOTE(review): the Trainer below is also configured for GPU execution, so this
# explicit move is probably redundant — kept to preserve existing behaviour.
model.to("cuda")

# Lightning writes its logs/checkpoints underneath this directory.
output_directory = os.path.join('/mnt/', 'e', 'Files', 'Acoustic_Data', 'Datasets')
trainer = pl.Trainer(
    devices=1,
    accelerator="gpu",
    max_epochs=5,
    default_root_dir=output_directory,
)
trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /mnt/e/Files/Acoustic_Data/Datasets/lightning_logs


Protocol My_datasets.SpeakerDiarization.Detection does not precompute the output of torchaudio.info(): adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. See pyannote.database documentation on how to do that yourself.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type             | Params | In sizes      | Out sizes                                 
--------------------------------------------------------------------------------------------------------------------
0 | sincnet           | SincNet          | 42.6 K | [1, 1, 16000] | [1, 60, 56]                               
1 | lstm              | LSTM             | 589 K  | [1, 56, 60]   | [[1, 56, 256], [[4, 1, 128], [4, 1, 128]]]
2 | linear            | ModuleList       | 49.4 K | ?             | ?                                         
3 | classifier        | Linear           | 129    | [1, 56, 128]  | [1, 56, 1]                                
4 | activation        | Sigmoid          | 0      | [1, 56, 1]    | [1, 56, 1]                                
5 | validation_metric | MetricCollection | 0      | ?             | ?                                         
---------------------------------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [23]:
# Wrap the trained model for inference, then apply it to the first file of the
# test subset. `test_file` is reused by later cells.
inference = Inference(model)
test_files = cow_audio.test()
test_file = next(test_files)
vad_probability = inference(test_file)

In [27]:
# Build a VAD pipeline on top of the trained model and instantiate it with
# hand-picked starting hyper-parameters (reused later as the optimizer's
# warm start).
pipeline = VoiceActivityDetectionPipeline(segmentation=model)
initial_params = dict(
    onset=0.3,
    offset=0.2,
    min_duration_on=0.0,
    min_duration_off=0.0,
)
pipeline.instantiate(initial_params)

# Apply the pipeline to the test file and keep the timeline of its output.
test_speech = pipeline(test_file)
timeline = test_speech.get_timeline()

In [28]:
# One metric instance accumulates results over the whole test subset.
metric = DetectionErrorRate()

for file in cow_audio.test():
    # hypothesis: output of the voice activity detection pipeline
    speech = pipeline(file)

    # reference annotation, and the region of the file that should be scored
    reference = file['annotation']
    evaluated_region = file['annotated']

    _ = metric(reference, speech, uem=evaluated_region)

# abs(metric) aggregates the performance over every file seen above
detection_error_rate = abs(metric)
print(f'Detection error rate = {detection_error_rate * 100:.1f}%')

Detection error rate = 39.8%


In [25]:
# Write the RTTM for the test file while the pipeline still uses the initial
# (pre-optimization) parameters.
# NOTE(review): Get_RTTM.save_rttm is a project helper; presumably it applies
# `pipeline` to `test_file` and writes the result — confirm in utils.
before_rttm_path = os.path.join(
    '/mnt/', 'e', 'Files', 'Acoustic_Data', 'Datasets', 'output_rttms', 'before_optimization.rttm'
)
Get_RTTM.save_rttm(test_file, pipeline, before_rttm_path)

In [26]:
# Tune the pipeline hyper-parameters on the development subset, starting the
# search from the hand-picked initial values.
optimizer = Optimizer(pipeline)
development_files = list(cow_audio.development())
optimizer.tune(
    development_files,
    warm_start=initial_params,
    n_iterations=20,
    show_progress=False,
)

optimized_params = optimizer.best_params
print(optimized_params)

# Re-instantiate the pipeline so later cells use the tuned parameters.
pipeline.instantiate(optimized_params)

{'onset': 0.8103251822269171, 'offset': 0.6810206947236057, 'min_duration_on': 0.21402324390136784, 'min_duration_off': 0.12266215715079959}


<pyannote.audio.pipelines.voice_activity_detection.VoiceActivityDetection at 0x7f1e0ac05880>

In [19]:
# Write the RTTM for the same test file again, now that the pipeline has been
# re-instantiated with the optimized parameters.
after_rttm_path = os.path.join(
    '/mnt/', 'e', 'Files', 'Acoustic_Data', 'Datasets', 'output_rttms', 'after_optimization.rttm'
)
Get_RTTM.save_rttm(test_file, pipeline, after_rttm_path)

In [22]:
# Convert the post-optimization RTTM into a text file written next to it.
# TXT_RTTM_Transform is imported with the other imports at the top of the
# notebook so that all dependencies are visible in one place.
rttm_directory = os.path.join('/mnt/', 'e', 'Files', 'Acoustic_Data', 'Datasets', 'output_rttms')
input_file = os.path.join(rttm_directory, 'after_optimization.rttm')
output_file = os.path.join(rttm_directory, 'after_optimization.txt')
TXT_RTTM_Transform.rttm_to_txt(input_file, output_file)