In [None]:
#   Are there clusters detected by DBSCAN?
#     No → NON-HUMAN
#     Yes
#       Is there a face in the profile picture?
#         Yes
#           Does the profile face match exactly one cluster (similarity > threshold)?
#             No → NON-HUMAN
#             Yes
#               Is that cluster dominant (its share of all thumbnail faces ≥ face_ratio AND its share of clustered, non-noise faces ≥ cluster_ratio)?
#                 Yes → HUMAN
#                 No → NON-HUMAN
#         No
#           Does any cluster satisfy dominance (its share of all thumbnail faces ≥ face_ratio AND its share of clustered, non-noise faces ≥ cluster_ratio)?
#             Yes → HUMAN
#             No → NON-HUMAN

In [None]:
import os
import numpy as np
from sklearn.cluster import DBSCAN
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from ground_truth import GROUND_TRUTH, GROUND_TRUTH_BRANDS, GROUND_TRUTH_CREATORS

# Root directory scanned by the grid search below: every entry is treated as a
# collaborator username, and its embeddings are read from
# <COLLAB_DIR>/<username>/embeddings_output2.npz.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
COLLAB_DIR = "/home/yash-sisodia/face-detection/collaborators_india/collaborators"

def normalize(vec):
    """Return *vec* divided by its L2 norm.

    A vector whose norm is not strictly positive (e.g. all zeros) is handed
    back unchanged so the caller never divides by zero.
    """
    length = np.linalg.norm(vec)
    if length > 0:
        return vec / length
    return vec

# Hyper-parameter grid:
#   eps_values     - DBSCAN neighbourhood radius (cosine distance)
#   cos_values     - minimum profile-to-centroid cosine similarity for a match
#   face_ratios    - minimum cluster size / number of thumbnail faces
#   cluster_ratios - minimum cluster size / number of clustered (non-noise) faces
eps_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
cos_values = [0.5, 0.6, 0.7, 0.8, 0.9]
face_ratios = [0.3, 0.4]
cluster_ratios = [0.3, 0.4]


def _load_labelled_embeddings():
    """Read every labelled collaborator's embeddings from disk exactly once.

    Returns a list of ``(username, prof_emb, thumb_emb)`` tuples in
    ``os.listdir`` order, where ``thumb_emb`` has already been L2-normalised
    row by row.  Entries that are not in ``GROUND_TRUTH``, have no embeddings
    file, or have no thumbnail embeddings are skipped.
    """
    loaded = []
    for username in os.listdir(COLLAB_DIR):
        if username not in GROUND_TRUTH:
            continue
        embeddings_file = os.path.join(COLLAB_DIR, username, "embeddings_output2.npz")
        if not os.path.exists(embeddings_file):
            continue

        # NOTE(review): allow_pickle=True unpickles object arrays and can run
        # arbitrary code; only load archives produced by a trusted pipeline.
        # The context manager closes the archive handle once both arrays have
        # been read into memory.
        with np.load(embeddings_file, allow_pickle=True) as data:
            prof_emb = data["profile_embeddings"]
            thumb_emb = data["thumbnail_embeddings"]
        if thumb_emb.size == 0:
            continue

        thumb_emb = np.array([normalize(e) for e in thumb_emb])
        loaded.append((username, prof_emb, thumb_emb))
    return loaded


def _summarise_clusters(prof_emb, thumb_emb, eps):
    """Cluster the thumbnails at *eps* and extract what the decision rule needs.

    Returns ``(sizes, sims)``:
      * ``sizes`` maps each DBSCAN cluster id to its member count (noise
        points, label -1, are excluded);
      * ``sims`` maps each cluster id to the cosine similarity between the
        first profile embedding and that cluster's centroid.  It is ``None``
        when there are no clusters or no profile embedding.
    """
    labels = DBSCAN(eps=eps, min_samples=2, metric="cosine").fit(thumb_emb).labels_

    members = defaultdict(list)
    for i, label in enumerate(labels):
        if label != -1:
            members[label].append(thumb_emb[i])
    sizes = {cid: len(vectors) for cid, vectors in members.items()}

    sims = None
    if members and prof_emb.size > 0:
        prof_vec = normalize(prof_emb[0])
        sims = {
            cid: cosine_similarity([prof_vec], [np.mean(vectors, axis=0)])[0, 0]
            for cid, vectors in members.items()
        }
    return sizes, sims


def _predict(sizes, sims, total_faces, cos, fr, cr):
    """Return "HUMAN" or "NON-HUMAN" for one collaborator and one parameter set.

    With a profile face, exactly one cluster must exceed the *cos* similarity
    threshold and that cluster must be dominant.  Without a profile face, any
    dominant cluster is enough.  No clusters at all means "NON-HUMAN".
    """
    if not sizes:
        return "NON-HUMAN"

    clustered_faces = sum(sizes.values())

    def is_dominant(cid):
        size = sizes[cid]
        return size / total_faces >= fr and size / clustered_faces >= cr

    if sims is None:
        return "HUMAN" if any(is_dominant(cid) for cid in sizes) else "NON-HUMAN"

    matches = [cid for cid, sim in sims.items() if sim > cos]
    if len(matches) == 1 and is_dominant(matches[0]):
        return "HUMAN"
    return "NON-HUMAN"


grid_results = []

# Disk reads and normalisation do not depend on any grid parameter, so they are
# done once up front instead of once per parameter combination.
_collaborators = _load_labelled_embeddings()

for eps in eps_values:
    # DBSCAN and the centroid similarities depend only on eps; compute them
    # once here and reuse them for every (cos, fr, cr) combination.
    summaries = [
        (
            "HUMAN" if username in GROUND_TRUTH_CREATORS else "NON-HUMAN",
            len(thumb_emb),
            *_summarise_clusters(prof_emb, thumb_emb, eps),
        )
        for username, prof_emb, thumb_emb in _collaborators
    ]

    for cos in cos_values:
        for fr in face_ratios:
            for cr in cluster_ratios:
                correct = 0
                total = 0

                for truth, total_faces, sizes, sims in summaries:
                    prediction = _predict(sizes, sims, total_faces, cos, fr, cr)
                    if prediction == truth:
                        correct += 1
                    total += 1

                # accuracy for this param set
                acc = correct / total if total else 0
                grid_results.append((eps, cos, fr, cr, acc))
                print(f"eps={eps}, cos={cos}, face_r={fr}, cluster_r={cr} -> acc={acc:.2f}")

# Highest-accuracy tuple; on ties max() keeps the first one encountered.
best = max(grid_results, key=lambda x: x[4])
print("\nBest params:", best)


In [5]:
# Split the mispredicted accounts by error direction, keeping the iteration
# order of `results`:
#   false negatives - ground-truth creators predicted "NON-HUMAN"
#   false positives - ground-truth brands predicted "HUMAN"
false_negatives = [
    name
    for name, label in results.items()
    if name in GROUND_TRUTH_CREATORS and label == "NON-HUMAN"
]
false_positives = [
    name
    for name, label in results.items()
    if name in GROUND_TRUTH_BRANDS and label == "HUMAN"
]

print("\nFalse Negatives (Humans => Non-Humans):", false_negatives)
print("\nFalse Positives (Non-Humans => Humans):", false_positives)



False Negatives (Humans => Non-Humans): ['__.anuraaag', '__estrellaaaa.__', '__lil.jannat', '___notyourcupoftea___', '__ishikasachdeva__', '_.nickyy75_', '___antara__ghosh___', '_.mile.y', '__.aaddyyyyyyyy', '_.hett', '__barbie_grl', '___aamu___05', '___dhanu07___', '_.minney__06', '__.mitara.b.__', '___simran__official___', '__.shammu_zz', '_.harshitha_.gowda._', '_.shrush_', '__jess_mavalia__', '__ishannnnn._', '__.soulzy', '___maxumilian___', '__.sannss', '__beauty__squad__', '_.priyanshiii01', '___twinkling___', '__hussainujjain', '__baruah__96', '_.srilakshmi.__']

False Positives (Non-Humans => Humans): ['staywrogn', 'biotique_world', 'bobbibrownindia', 'crayy.heads', 'myskinq', 'siyaramsindia', 'dermabayskincare', 'skinkraftshop', 'maincharacter_india', 'shopaamili', 'coloressenceofficial', 'layrrd', 'auliglow']
