In [56]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from unqlite import UnQLite
from tqdm import tqdm
import numpy as np
import statistics
import pickle
import gdown

In [57]:
gdown.download(id="1oEQijUkn5QerZ3UvaJ1a5N6G7mbDpjeG", output="audio.db")

In [58]:
def retrive_data():
    db, data = UnQLite("audio.db"), []

    with db.cursor() as cursor:
        for key, value in cursor:
            record = pickle.loads(value)
            data.append({
                "filename": key,
                "partition": "",
                "embedding": pickle.loads(record["embedding"]).tolist() # 2048
            })
    return data

In [59]:
raw_data = retrive_data()

In [60]:
NUM_RECORD = 2000
EMBEDDING_LEN = 2048

KMEANS_FLOOR = 2
KMEANS_CEILING = 9
MIN_PARTITION_SIZE = 30

In [61]:

def adaptive_clustering(data, indices, threshold=0, layer=1):
    # partition size is small enough
    if len(indices) < MIN_PARTITION_SIZE:
        return []

    # iterative kmeans
    embeddings, scores = [data[idx]["embedding"] for idx in indices], []
    for n in range(KMEANS_FLOOR, KMEANS_CEILING):
        result = KMeans(n_clusters=n, random_state=0, n_init="auto").fit(embeddings)
        scores.append(silhouette_score(embeddings, result.labels_))

    # check whether need further clutering
    max_score, median_score = max(scores), statistics.median(scores)
    if max_score < threshold:
        return []

    # re-clustring
    k = scores.index(max_score) + 2
    indices_list = [[] for _ in range(k)]
    result = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(embeddings)

    # concate partition name
    for index, label in zip(indices, result.labels_.tolist()):
        indices_list[label].append(index)
        data[index]["partition"] += f"{str(label)}_"

    # recursive clustering
    centroids = []
    for indices in indices_list:
        centroids.append({
            "embedding": (np.mean([data[idx]["embedding"] for idx in indices], axis=0)).tolist(),
            "nexts": adaptive_clustering(data, indices, median_score * THRESHOLD_GROTH_RATE, layer + 1)
        })

    return centroids

In [62]:
def get_partition(embedding, centroids):
    max_idx, max_score = 0, 0
    for idx, centroid in enumerate(centroids):
        score = cosine_similarity([embedding],[centroid["embedding"]])
        if score > max_score:
            max_score = score
            max_idx = idx

    partition = f"{max_idx}_"
    if centroids[max_idx]["nexts"] != []:
        partition += get_partition(embedding, centroids[max_idx]["nexts"])

    return partition

In [63]:
partition_embeddings = []
def parse(centroid):
    next = centroid["nexts"]
    if next == []:
        partition_embeddings.append(centroid["embedding"])
        return 1

    cnt = 0
    for next in centroid["nexts"]:
        cnt += parse(next)
    return cnt

In [64]:
THRESHOLD_GROTH_RATE = 1
centroids = adaptive_clustering(raw_data, range(NUM_RECORD))

partition_cnt = 0
partition_embeddings = []
for centroid in centroids:
    partition_cnt += parse(centroid)

print(partition_cnt)

for record in tqdm(raw_data):
    get_partition(record["embedding"], centroids)

for record in tqdm(raw_data):
    max_idx, max_score = 0, 0
    for idx, embedding in enumerate(partition_embeddings):
        score = cosine_similarity([record["embedding"]], [embedding])
        if score > max_score:
            max_score = score
            max_idx = idx

112


100%|██████████| 2000/2000 [00:24<00:00, 80.02it/s]
100%|██████████| 2000/2000 [02:30<00:00, 13.33it/s]


In [65]:
THRESHOLD_GROTH_RATE = 1.2
centroids = adaptive_clustering(raw_data, range(NUM_RECORD))

partition_cnt = 0
partition_embeddings = []
for centroid in centroids:
    partition_cnt += parse(centroid)

print(partition_cnt)

for record in tqdm(raw_data):
    get_partition(record["embedding"], centroids)

for record in tqdm(raw_data):
    max_idx, max_score = 0, 0
    for idx, embedding in enumerate(partition_embeddings):
        score = cosine_similarity([record["embedding"]], [embedding])
        if score > max_score:
            max_score = score
            max_idx = idx

85


100%|██████████| 2000/2000 [00:15<00:00, 127.04it/s]
100%|██████████| 2000/2000 [01:28<00:00, 22.66it/s]


In [66]:
THRESHOLD_GROTH_RATE = 1.5
centroids = adaptive_clustering(raw_data, range(NUM_RECORD))

partition_cnt = 0
partition_embeddings = []
for centroid in centroids:
    partition_cnt += parse(centroid)

print(partition_cnt)

for record in tqdm(raw_data):
    get_partition(record["embedding"], centroids)

for record in tqdm(raw_data):
    max_idx, max_score = 0, 0
    for idx, embedding in enumerate(partition_embeddings):
        score = cosine_similarity([record["embedding"]], [embedding])
        if score > max_score:
            max_score = score
            max_idx = idx

61


100%|██████████| 2000/2000 [00:15<00:00, 131.12it/s]
100%|██████████| 2000/2000 [01:03<00:00, 31.57it/s]


In [67]:
THRESHOLD_GROTH_RATE = 1.8
centroids = adaptive_clustering(raw_data, range(NUM_RECORD))

partition_cnt = 0
partition_embeddings = []
for centroid in centroids:
    partition_cnt += parse(centroid)

print(partition_cnt)

for record in tqdm(raw_data):
    get_partition(record["embedding"], centroids)

for record in tqdm(raw_data):
    max_idx, max_score = 0, 0
    for idx, embedding in enumerate(partition_embeddings):
        score = cosine_similarity([record["embedding"]], [embedding])
        if score > max_score:
            max_score = score
            max_idx = idx

36


100%|██████████| 2000/2000 [00:11<00:00, 169.20it/s]
100%|██████████| 2000/2000 [00:36<00:00, 54.78it/s]


In [68]:
THRESHOLD_GROTH_RATE = 2
centroids = adaptive_clustering(raw_data, range(NUM_RECORD))

partition_cnt = 0
partition_embeddings = []
for centroid in centroids:
    partition_cnt += parse(centroid)

print(partition_cnt)

for record in tqdm(raw_data):
    get_partition(record["embedding"], centroids)

for record in tqdm(raw_data):
    max_idx, max_score = 0, 0
    for idx, embedding in enumerate(partition_embeddings):
        score = cosine_similarity([record["embedding"]], [embedding])
        if score > max_score:
            max_score = score
            max_idx = idx

36


100%|██████████| 2000/2000 [00:12<00:00, 160.25it/s]
100%|██████████| 2000/2000 [00:36<00:00, 54.40it/s]
