# control policy 1: synthetic unknown speakers (σ = 0.3)

In [None]:
import os
import numpy as np

# Root folder of the unknown (candidate) speakers; scanned below for one
# sub-folder per speaker ID, each holding .npy embedding files.
libritts_path = "./unknown_embedding_folder"
# Root folder of the target speakers; scanned below for one sub-folder per
# target ID, each holding .npy embedding files.
train3_path = "./target_embedding_folder"
# Tab-separated similarity table written at the end of this cell.
output_file = "./cosine_sim_unknown_synspk.txt"

def load_embeddings_from_folder(folder_path):
    """Load every ``.npy`` file located directly inside ``folder_path``.

    Files are read in sorted filename order so that the result (and in
    particular its first element) is reproducible; ``os.listdir`` on its own
    returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        ``np.ndarray`` built from the loaded arrays, one entry per file in
        filename order. The array is empty when the folder contains no
        ``.npy`` file. Assumes all embeddings in a folder share one shape.
    """
    npy_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".npy")
    )
    return np.array(
        [np.load(os.path.join(folder_path, name)) for name in npy_files]
    )

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like accepted by ``np.linalg.norm``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product of the two L2-normalised vectors. When either vector
        has zero norm the similarity is undefined; ``0.0`` is returned
        instead of dividing by zero and propagating NaN.
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    # Guard against division by zero for all-zero embeddings.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1 / norm1, vec2 / norm2)

# Target IDs are the sub-folders of train3_path. They are sorted because
# os.listdir returns entries in arbitrary order and this list fixes the
# column order of the output table.
target_ids = sorted(
    tid for tid in os.listdir(train3_path)
    if os.path.isdir(os.path.join(train3_path, tid))
)

# One mean embedding per target. Targets without any .npy file get no entry
# and are reported as "N/A" in the table.
target_mean_embeddings = {}
for target_id in target_ids:
    target_folder = os.path.join(train3_path, target_id)
    target_embeddings = load_embeddings_from_folder(target_folder)
    if len(target_embeddings) > 0:
        target_mean_embeddings[target_id] = np.mean(target_embeddings, axis=0)

# Explicit UTF-8 so the file does not depend on the platform's locale.
with open(output_file, "w", encoding="utf-8") as f:
    # Header row: Speaker_ID followed by one column per target ID.
    f.write("Speaker_ID\t" + "\t".join(target_ids) + "\n")

    # Sorted for a reproducible row order.
    for speaker_id in sorted(os.listdir(libritts_path)):
        speaker_folder = os.path.join(libritts_path, speaker_id)
        if not os.path.isdir(speaker_folder):
            continue

        speaker_embeddings = load_embeddings_from_folder(speaker_folder)
        if len(speaker_embeddings) == 0:
            continue
        # Only the first loaded embedding represents the speaker.
        speaker_embedding = speaker_embeddings[0]

        # Cosine similarity of this speaker against every target mean.
        similarity_scores = []
        for target_id in target_ids:
            if target_id in target_mean_embeddings:
                similarity_score = cosine_similarity(
                    speaker_embedding, target_mean_embeddings[target_id]
                )
                similarity_scores.append(f"{similarity_score:.4f}")
            else:
                similarity_scores.append("N/A")

        f.write(f"{speaker_id}\t" + "\t".join(similarity_scores) + "\n")


In [None]:
import pandas as pd

input_file = "./cosine_sim_unknown_synspk.txt"

df = pd.read_csv(input_file, sep="\t")
# Mean similarity of every candidate speaker across all target columns.
df['Average_Score'] = df.iloc[:, 1:].mean(axis=1)

# Keep only candidates whose average similarity to the targets is below 0.3.
filtered_df = df[df['Average_Score'] < 0.3]

# Number of speakers to select.
max_speakers = 12
selected_speakers = []
# Score columns only: drop the leading Speaker_ID and trailing Average_Score.
target_scores = filtered_df.iloc[:, 1:-1]

picked_speakers = set()


def _pick_first_unpicked(ordered_rows, target):
    """Select the first speaker of ``ordered_rows`` that is not picked yet.

    Appends the pick to ``selected_speakers`` and records it in
    ``picked_speakers``. Returns True when a speaker was added, False when
    every row was already picked.
    """
    for _, row in ordered_rows.iterrows():
        if row['Speaker_ID'] in picked_speakers:
            continue
        selected_speakers.append({
            "Speaker_ID": int(row['Speaker_ID']),
            "Target": target,
            "Score": row[target]
        })
        picked_speakers.add(row['Speaker_ID'])
        return True
    return False


# The ordering per target does not depend on the selection state, so sort once.
sorted_by_target = {
    target: filtered_df.sort_values(by=target)
    for target in target_scores.columns
}

# Round-robin over the targets: for each one take the least similar and then
# the most similar unpicked speaker. Stop at max_speakers, or as soon as a
# full pass adds nobody (otherwise the loop would never end when fewer than
# max_speakers candidates survive the filter).
made_progress = True
while len(selected_speakers) < max_speakers and made_progress:
    made_progress = False
    for target in target_scores.columns:
        if len(selected_speakers) >= max_speakers:
            break

        target_sorted = sorted_by_target[target]

        # Lowest-scoring unpicked speaker for this target.
        if _pick_first_unpicked(target_sorted, target):
            made_progress = True

        # Re-check the limit so the second pick cannot overshoot it.
        if len(selected_speakers) >= max_speakers:
            break

        # Highest-scoring unpicked speaker for this target.
        if _pick_first_unpicked(target_sorted.iloc[::-1], target):
            made_progress = True

if len(selected_speakers) < max_speakers:
    print(f"Warning: only {len(selected_speakers)} of {max_speakers} speakers could be selected.")

# Fixed columns keep the frame usable even when nothing was selected.
result_df = pd.DataFrame(selected_speakers, columns=["Speaker_ID", "Target", "Score"])

speaker_list = result_df['Speaker_ID'].astype(str).tolist()
formatted_speaker_list = ", ".join([f"'{s}'" for s in speaker_list])

print("Selected Speakers:")
print(result_df)

print("\nFormatted Speaker List:")
print(f"[{formatted_speaker_list}]")


# control policy 2: nearest-N target synthetic samples (N=3)

In [None]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Folder passed to load_speaker_embeddings(): one sub-folder per speaker ID,
# each holding .npy embedding files (presumably the synthetic samples).
gptsovits_emb_dir = "./target_spk_syn_emb"
# Folder passed to load_target_embeddings(): one sub-folder per target ID,
# each holding .npy embedding files (presumably the enrollment samples).
target_emb_dir = "./target_spk_enroll_emb"
# Tab-separated similarity table written by save_results().
output_file = "./cosine_sim_targetsyn.txt"

def load_target_embeddings(target_emb_dir):
    """Compute one mean embedding per target sub-folder.

    Every sub-directory of ``target_emb_dir`` is treated as a target ID. Its
    ``.npy`` files are loaded in sorted filename order and averaged along
    axis 0. Sub-directories without any ``.npy`` file are skipped.

    Args:
        target_emb_dir: Root directory containing one folder per target.

    Returns:
        A tuple ``(target_ids, target_means)``: the sorted list of target IDs
        that had embeddings, and an ``np.ndarray`` holding the matching mean
        embeddings in the same order.
    """
    ids, means = [], []
    for entry in sorted(os.listdir(target_emb_dir)):
        folder = os.path.join(target_emb_dir, entry)
        if not os.path.isdir(folder):
            continue
        vectors = [
            np.load(os.path.join(folder, fname))
            for fname in sorted(os.listdir(folder))
            if fname.endswith(".npy")
        ]
        if not vectors:
            continue
        means.append(np.mean(vectors, axis=0))
        ids.append(entry)
    return ids, np.array(means)

def load_speaker_embeddings(gptsovits_emb_dir):
    """Load every ``.npy`` embedding below ``gptsovits_emb_dir``.

    Each sub-directory is treated as a speaker ID. Speakers and their files
    are visited in sorted order.

    Args:
        gptsovits_emb_dir: Root directory containing one folder per speaker.

    Returns:
        A list of ``(speaker_id, embed_id, embedding)`` tuples, where
        ``embed_id`` is the filename without its trailing ``.npy`` extension
        and ``embedding`` is the array returned by ``np.load``.
    """
    suffix = ".npy"
    speaker_embeddings = []
    for speaker_id in sorted(os.listdir(gptsovits_emb_dir)):
        speaker_path = os.path.join(gptsovits_emb_dir, speaker_id)
        if not os.path.isdir(speaker_path):
            continue
        for file in sorted(os.listdir(speaker_path)):
            if not file.endswith(suffix):
                continue
            emb = np.load(os.path.join(speaker_path, file))
            # Strip only the extension; str.replace would also remove any
            # ".npy" occurring earlier in the filename.
            embed_id = file[:-len(suffix)]
            speaker_embeddings.append((speaker_id, embed_id, emb))
    return speaker_embeddings

def calculate_similarity(speaker_embeddings, target_ids, target_means):
    """Score every speaker embedding against all target mean embeddings.

    Args:
        speaker_embeddings: Iterable of ``(speaker_id, embed_id, embedding)``
            tuples.
        target_ids: Not used by this function; kept so the call signature
            stays unchanged.
        target_means: Target mean embeddings, passed as the second argument
            to ``cosine_similarity``.

    Returns:
        A list of ``(speaker_id, embed_id, similarities)`` tuples in input
        order, where ``similarities`` is the first row of the
        ``cosine_similarity`` result for that embedding.
    """
    return [
        (spk, utt, cosine_similarity([vector], target_means)[0])
        for spk, utt, vector in speaker_embeddings
    ]

def save_results(output_file, target_ids, results):
    """Write the similarity table as tab-separated text.

    The first line is a header (``Speaker_ID``, ``Embed_ID``, then one column
    per target ID). Each following line holds one result with its scores
    formatted to four decimal places.

    Args:
        output_file: Path of the file to create or overwrite.
        target_ids: Target IDs used as score column headers.
        results: Iterable of ``(speaker_id, embed_id, similarities)`` tuples.
    """
    with open(output_file, "w") as out:
        header = "Speaker_ID\tEmbed_ID\t" + "\t".join(target_ids) + "\n"
        out.write(header)

        for spk, utt, sims in results:
            scores = "\t".join([f"{value:.4f}" for value in sims])
            out.write(f"{spk}\t{utt}\t{scores}\n")

# Entry point: load both embedding sets, score every speaker embedding
# against every target mean, and write the resulting table to output_file.
if __name__ == "__main__":
    print("Loading target embeddings...")
    # target_means[i] is the mean embedding of target_ids[i].
    target_ids, target_means = load_target_embeddings(target_emb_dir)

    print("Loading speaker embeddings...")
    # List of (speaker_id, embed_id, embedding) tuples.
    speaker_embeddings = load_speaker_embeddings(gptsovits_emb_dir)

    print("Calculating cosine similarities...")
    # One (speaker_id, embed_id, similarities) tuple per loaded embedding.
    results = calculate_similarity(speaker_embeddings, target_ids, target_means)

    print(f"Saving results to {output_file}...")
    save_results(output_file, target_ids, results)

    print("✅ All cosine similarities have been calculated and saved.")


In [None]:
import pandas as pd

input_file = "./cosine_sim_targetsyn.txt"
# Read the two ID columns as text. Letting pandas infer a numeric dtype first
# would turn an ID such as "007" into 7 before it is cast back to a string.
df = pd.read_csv(input_file, sep="\t", dtype={"Speaker_ID": str, "Embed_ID": str})

df["Speaker_ID"] = df["Speaker_ID"].astype(str)
# Every column after Speaker_ID and Embed_ID is a per-target score column.
score_columns = df.columns[2:]
if len(score_columns) > 0:
    # Assign by column label so the columns are replaced and take the numeric
    # dtype; an iloc slice assignment writes into the existing columns and
    # may keep their previous dtype.
    df[score_columns] = df[score_columns].apply(pd.to_numeric, errors="coerce")

def find_top_n_sentences(df, speaker_id, target_col, n=3):
    """Return the ``n`` rows of one speaker with the highest score for a target.

    Args:
        df: Similarity table with ``Speaker_ID`` and ``Embed_ID`` columns plus
            one score column per target.
        speaker_id: Value matched against the ``Speaker_ID`` column.
        target_col: Name of the score column to rank by.
        n: Number of rows to return (default 3).

    Returns:
        A DataFrame with the columns ``Embed_ID`` and ``target_col`` holding
        the ``n`` largest scores of that speaker. An empty DataFrame is
        returned (after printing a message) when the speaker has no rows or
        the column does not exist. ``df`` itself is never modified.
    """
    speaker_data = df[df["Speaker_ID"] == speaker_id]
    if speaker_data.empty:
        print(f"No matching rows for Speaker_ID {speaker_id}")
        return pd.DataFrame()

    if target_col not in df.columns:
        print(f"Column {target_col} not found in dataframe!")
        return pd.DataFrame()

    # Work on an explicit copy: assigning a column on the filtered slice is
    # chained assignment and triggers SettingWithCopyWarning.
    speaker_data = speaker_data.copy()
    speaker_data[target_col] = pd.to_numeric(speaker_data[target_col], errors='coerce')

    top_n = speaker_data.nlargest(n, target_col)
    return top_n[["Embed_ID", target_col]]

# target synthetic samples

# One query per target speaker "0".."4": rank that speaker's own rows by the
# score column of the same name and keep the three best.
queries = [
    {"speaker_id": str(index), "target_col": str(index), "top_n": 3}
    for index in range(5)
]

separator = "-" * 40
for query in queries:
    speaker_id, target_col, top_n = (
        query[key] for key in ("speaker_id", "target_col", "top_n")
    )

    # Print the heading first so any message from the lookup appears below it.
    print(f"Top {top_n} sentences for {speaker_id} on target {target_col}:")
    top_sentences = find_top_n_sentences(df, speaker_id, target_col, top_n)
    print(top_sentences)
    print(separator)
