In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
class VoiceDataset(Dataset):
    def __init__(self, feature_folder):
        self.feature_folder = feature_folder
        self.data = []
        self._prepare_data()

    def _prepare_data(self):
        # Collect features for each speaker
        speaker_dict = {}
        for root, _, files in os.walk(self.feature_folder):
            for file_name in files:
                if file_name.endswith('.npy'):
                    speaker = os.path.basename(root)
                    if speaker not in speaker_dict:
                        speaker_dict[speaker] = []
                    speaker_dict[speaker].append(os.path.join(root, file_name))

        # Create Positive and Negative Pairs
        speakers = list(speaker_dict.keys())
        for speaker in speakers:
            files = speaker_dict[speaker]

            # Positive Pairs (Same Speaker)
            for i in range(len(files)):
                for j in range(i + 1, len(files)):
                    self.data.append((files[i], files[j], 1))

            # Negative Pairs (Different Speakers)
            other_speakers = [s for s in speakers if s != speaker]
            for file in files:
                neg_speaker = np.random.choice(other_speakers)
                neg_file = np.random.choice(speaker_dict[neg_speaker])
                self.data.append((file, neg_file, 0))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file1, file2, label = self.data[idx]
        emb1 = np.load(file1)
        emb2 = np.load(file2)
        return torch.tensor(emb1, dtype=torch.float32), torch.tensor(emb2, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

# Path to Features
feature_folder = r"C:\Users\LENOVO\Desktop\RP\features"
dataset = VoiceDataset(feature_folder)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

print(f"Total Pairs: {len(dataset)}")

Total Pairs: 450


In [4]:
import torch.nn.functional as F

class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Dropout(0.3),  # Regularization
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
        )
        self.out = nn.Linear(128, 1)

    def forward(self, input1, input2):
        # Extract embeddings
        output1 = self.fc(input1)
        output2 = self.fc(input2)

        # Compute absolute difference
        diff = torch.abs(output1 - output2)

        # Predict similarity using Sigmoid activation
        score = torch.sigmoid(self.out(diff))
        return score


In [5]:
model = SiameseNetwork()
print(model)

SiameseNetwork(
  (fc): Sequential(
    (0): Linear(in_features=768, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): ReLU()
  )
  (out): Linear(in_features=128, out_features=1, bias=True)
)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [7]:
# Training Loop
def train_model(model, dataloader, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for emb1, emb2, labels in dataloader:
            emb1, emb2, labels = emb1.to(device), emb2.to(device), labels.to(device)
            optimizer.zero_grad()

            outputs = model(emb1, emb2).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(dataloader)}")

# Train the Model
train_model(model, dataloader)

Epoch 1/20, Loss: 0.67899569272995
Epoch 2/20, Loss: 0.6565511107444764
Epoch 3/20, Loss: 0.6459973017374675
Epoch 4/20, Loss: 0.644105064868927
Epoch 5/20, Loss: 0.6505756775538126
Epoch 6/20, Loss: 0.6286508917808533
Epoch 7/20, Loss: 0.6271797935167949
Epoch 8/20, Loss: 0.6266704638799031
Epoch 9/20, Loss: 0.6412771344184875
Epoch 10/20, Loss: 0.6380369385083516
Epoch 11/20, Loss: 0.6257404406865438
Epoch 12/20, Loss: 0.6200477162996928
Epoch 13/20, Loss: 0.6264653186003367
Epoch 14/20, Loss: 0.6506966312726339
Epoch 15/20, Loss: 0.6257165312767029
Epoch 16/20, Loss: 0.6246276080608368
Epoch 17/20, Loss: 0.6511308670043945
Epoch 18/20, Loss: 0.6450962543487548
Epoch 19/20, Loss: 0.6293618897596995
Epoch 20/20, Loss: 0.6383627017339071


In [8]:
torch.save(model.state_dict(), "voice_matching_model.pth")
print("Model saved successfully!")

Model saved successfully!


In [10]:
# Get a single batch from dataloader
sample_batch = next(iter(dataloader))
print(f"Batch length: {len(sample_batch)}")  # Should print 3 (emb1, emb2, labels)

Batch length: 3


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def evaluate_model(model, dataloader):
    model.eval()
    true_labels = []
    pred_scores = []

    with torch.no_grad():
        for emb1, emb2, labels in dataloader:
            emb1, emb2, labels = emb1.to(device), emb2.to(device), labels.to(device)  

            outputs = model(emb1, emb2).squeeze()

            true_labels.extend(labels.cpu().numpy())
            pred_scores.extend(outputs.cpu().numpy())

    # Compute precision-recall curve
    precisions, recalls, thresholds = precision_recall_curve(true_labels, pred_scores)

    # Find best threshold where precision & recall are balanced
    best_threshold = thresholds[(precisions + recalls).argmax() // 2]
    print(f"Best Threshold: {best_threshold}")

    predictions = [1 if score > best_threshold else 0 for score in pred_scores]

    # Calculate Metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    auc = roc_auc_score(true_labels, pred_scores)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"AUC Score: {auc:.4f}")

# Run evaluation on the test dataset
evaluate_model(model, dataloader)

Best Threshold: 0.5071067214012146
Accuracy: 0.6711
Precision: 0.6696
Recall: 1.0000
AUC Score: 0.4982


In [24]:
import numpy as np

def predict_similarity(model, audio1_path, audio2_path):
    model.eval()
    
    # Load audio embeddings
    emb1 = np.load(audio1_path)
    emb2 = np.load(audio2_path)

    # Convert to tensors
    emb1 = torch.tensor(emb1, dtype=torch.float32).unsqueeze(0).to(device)
    emb2 = torch.tensor(emb2, dtype=torch.float32).unsqueeze(0).to(device)

    # Get similarity score
    with torch.no_grad():
        similarity = model(emb1, emb2).item()

    print(f"Similarity Score: {similarity:.4f}")
    return similarity

# Example Usage
predict_similarity(model, r"C:\Users\LENOVO\Desktop\RP\features\English\A01_M\en_A01_M_01.npy", r"C:\Users\LENOVO\Desktop\RP\features\English\A09_F\en_A09_F_02.npy")

Similarity Score: 0.5762


0.5762473344802856