In [None]:
!pip install -q pyannote.audio

[0m

In [None]:
import os
import torch
import torchaudio
import numpy as np

from sklearn.cluster import KMeans
from pyannote.audio import Pipeline, Model, Inference
from scipy.spatial.distance import cosine, cdist, euclidean

In [None]:
%%time

# Determining the device (CUDA or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face User Access Tokens
AUTH_TOKEN = "🔑🌟 AUTH_TOKEN 🌟🔑"

# Initialization of the diarization pipeline and model for obtaining embeddings
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=AUTH_TOKEN).to(device)

model = Model.from_pretrained("pyannote/embedding", use_auth_token=AUTH_TOKEN).to(device)
embedding_inference = Inference(model)


# Function for computing the embedding from an audio file
def compute_embedding(audio_path):
    """Return one embedding vector for an audio file.

    The module-level ``embedding_inference`` is called on ``audio_path`` and its
    output is averaged along axis 0. If that output is empty, a zero vector of
    length ``embedding_inference.model.dimension`` is returned instead.
    """
    window_embeddings = embedding_inference(audio_path)
    if len(window_embeddings) == 0:
        # Nothing to average: hand back an all-zero placeholder vector.
        return np.zeros(embedding_inference.model.dimension)
    return np.mean(window_embeddings, axis=0)


min_segment_duration = 5  # Minimum segment duration in seconds


# Function for extracting speaker embeddings from a podcast
def get_speaker_embeddings(podcast_path, min_duration=None):
    """Diarize a recording and embed every sufficiently long speech turn.

    Parameters
    ----------
    podcast_path
        Path of the audio file. It is passed to ``diarization_pipeline`` and to
        ``compute_embedding_for_segment``.
    min_duration : float, optional
        Turns shorter than this many seconds are skipped. When omitted, the
        module-level ``min_segment_duration`` is used (looked up at call time).

    Returns
    -------
    tuple(list, list)
        ``(embeddings, segments)``: two parallel lists holding, for each kept
        turn, the value returned by ``compute_embedding_for_segment`` and the
        segment object yielded by the diarization result.
    """
    if min_duration is None:
        min_duration = min_segment_duration

    diarization = diarization_pipeline(podcast_path)

    # Only the sample rate is needed here: every kept turn is read from disk by
    # compute_embedding_for_segment. Reading the file header avoids decoding the
    # whole recording and copying it to the device just to discard it.
    sample_rate = torchaudio.info(podcast_path).sample_rate

    embeddings = []
    segments = []
    # The speaker label is not used, so iterate over (segment, track) pairs only.
    for segment, _track in diarization.itertracks():
        if segment.duration < min_duration:
            continue
        embeddings.append(
            compute_embedding_for_segment(podcast_path, segment.start, segment.end, sample_rate)
        )
        segments.append(segment)
    return embeddings, segments


# Function for computing the embedding for a specific segment
def compute_embedding_for_segment(audio_path, start_time, end_time, sample_rate):
    """Return one embedding vector for the ``[start_time, end_time]`` excerpt of a file.

    The excerpt is read with ``torchaudio.load`` using a frame offset and frame
    count derived from the times (in seconds) and ``sample_rate`` (both products
    truncated to int), moved to the module-level ``device`` and passed to
    ``embedding_inference`` as a ``{'waveform', 'sample_rate'}`` mapping. The
    inference output is averaged along axis 0; an empty output yields a zero
    vector of length ``embedding_inference.model.dimension``.
    """
    first_frame = int(start_time * sample_rate)
    frame_count = int((end_time - start_time) * sample_rate)
    excerpt, _ = torchaudio.load(audio_path, frame_offset=first_frame, num_frames=frame_count)

    audio = {'waveform': excerpt.to(device), 'sample_rate': sample_rate}
    window_embeddings = embedding_inference(audio)

    if len(window_embeddings) == 0:
        # Nothing to average: hand back an all-zero placeholder vector.
        return np.zeros(embedding_inference.model.dimension)
    return np.mean(window_embeddings, axis=0)


# Function for extracting embeddings of all audio files in a folder
def get_embeddings_from_folder(folder_path):
    """Compute one embedding per ``.wav`` file found directly in ``folder_path``.

    File names are matched case-insensitively on the ``.wav`` suffix and are
    processed in sorted order: ``os.listdir`` returns entries in arbitrary
    order, so sorting keeps the result order reproducible across runs and
    machines.

    Returns
    -------
    list
        The values returned by ``compute_embedding`` for each matching file, in
        sorted file-name order. Empty when no file matches.
    """
    wav_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith('.wav')
    )
    return [compute_embedding(os.path.join(folder_path, name)) for name in wav_names]


# Extracting criminal embeddings: one embedding per .wav file in the reference folder
criminal_folder_path = "criminals"
criminal_embeddings = get_embeddings_from_folder(criminal_folder_path)

# Fail fast with a clear message: without any reference embedding the
# clustering step further down can only fail with a less helpful error.
if len(criminal_embeddings) == 0:
    raise ValueError(
        f"No .wav files found in '{criminal_folder_path}'; "
        "at least one reference recording is required."
    )


# Clustering of criminal embeddings
def cluster_criminals_embeddings(criminal_embeddings, n_clusters=3, random_state=0):
    """Cluster the reference embeddings with k-means and return the centres.

    Parameters
    ----------
    criminal_embeddings
        Sequence of equally sized embedding vectors (one row per recording).
    n_clusters : int
        Requested number of clusters. It is reduced to the number of
        embeddings when fewer are available, because k-means cannot fit more
        clusters than samples.
    random_state
        Seed forwarded to ``KMeans`` so that the centres are reproducible
        between runs. Pass ``None`` for a non-deterministic initialisation.

    Returns
    -------
    The fitted ``cluster_centers_`` array.

    Raises
    ------
    ValueError
        If ``criminal_embeddings`` is empty.
    """
    samples = np.asarray(criminal_embeddings)
    if samples.size == 0:
        raise ValueError("criminal_embeddings is empty; nothing to cluster.")

    n_clusters = min(n_clusters, len(samples))
    # n_init is spelled out so the number of restarts does not depend on the
    # default of the installed scikit-learn release.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    kmeans.fit(samples)
    return kmeans.cluster_centers_


# Cut-off used to flag a segment as a criminal match. Despite its name, the
# value is compared against a cosine *distance*, so smaller means more alike.
similarity_threshold = 0.4


# Comparison of the segment with clusters of criminals.
def is_criminal_segment(speaker_embedding, criminal_clusters, threshold=similarity_threshold):
    """Tell whether an embedding lies close to at least one criminal cluster centre.

    The cosine distance between ``speaker_embedding`` and every row of
    ``criminal_clusters`` is computed with ``cdist``. Returns ``True`` when at
    least one of those distances is strictly below ``threshold``, else ``False``.
    """
    distances_to_centres = cdist([speaker_embedding], criminal_clusters, metric="cosine")[0]
    return bool((distances_to_centres < threshold).any())


# Cluster centres of the reference (criminal) embeddings
criminal_clusters = cluster_criminals_embeddings(criminal_embeddings)


# Path to the podcast folder
podcast_folder_path = "podcasts"


# Processing of each podcast in the folder. Names are sorted so the report
# order does not depend on the arbitrary order returned by os.listdir.
for file_name in sorted(os.listdir(podcast_folder_path)):
    if not file_name.lower().endswith('.wav'):
        continue

    podcast_path = os.path.join(podcast_folder_path, file_name)
    # Note: the whole file is diarized and embedded here before any comparison.
    speaker_embeddings, segments = get_speaker_embeddings(podcast_path)

    # any() stops at the first matching segment, so the remaining segments of
    # this file are not compared against the criminal clusters.
    criminal_found = any(
        is_criminal_segment(speaker_embedding, criminal_clusters)
        for speaker_embedding in speaker_embeddings
    )

    if criminal_found:
        print(f"Criminal detected in the file {file_name}")


Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin`
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--embedding/snapshots/c6335d8f1cd77b30084387468a6cf26fea90009b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.0+cu118. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.0+cu118. Bad things might happen unless you revert torch to 1.x.
Зловмисник виявлений у файлі vik_roz_720_180.wav
Зловмисник виявлений у файлі vik_roz_540_180.wav
Зловмисник виявлений у файлі vik_roz_360_180.wav
Зловмисник виявлений у файлі vik_roz_180_180.wav
Зловмисник виявлений у файлі vik_roz_0_150.wav
Зловмисник виявлений у файлі yarema_dukh_490_180.wav
Зловмисник виявлений у файлі yarema_dukh_900_180.wav
Зловмисник виявлений у файлі yarema_dukh_720_180.wav
Зловмисник виявлений у файлі yarema_dukh_1080_180.wav
Зловмисник виявлений у файлі yarema_dukh_0_180.wav
З