In [None]:
import os
import requests
from PIL import Image, UnidentifiedImageError
from io import BytesIO
from dotenv import load_dotenv
from tqdm import tqdm
from urllib.parse import quote
from sklearn.cluster import DBSCAN
import numpy as np
import torchvision.transforms as T
from insightface.app import FaceAnalysis
from insightface.utils.face_align import norm_crop
import pillow_heif
from scipy.spatial.distance import cosine
import cv2

# -------------------------------
# Blurry detection
# -------------------------------
def is_blurry(pil_img, threshold=50):
    """Tell whether an image looks blurry.

    The image is converted to grayscale and the variance of its Laplacian is
    compared against ``threshold``.

    Args:
        pil_img: PIL image to inspect.
        threshold: Variance below which the image counts as blurry.

    Returns:
        A truthy value when the Laplacian variance is below ``threshold``.
    """
    gray = np.array(pil_img.convert("L"))
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    return laplacian.var() < threshold

# -------------------------------
# load token from .env
# -------------------------------
# Populate the process environment via load_dotenv(), then read the token.
load_dotenv()
ACCESS_TOKEN = os.environ.get("GRAPH_ACCESS_TOKEN")
if not ACCESS_TOKEN:
    # Fail fast: the token is needed for the Authorization header below.
    raise RuntimeError("GRAPH_ACCESS_TOKEN not found!")

# Authorization header used for Graph API requests.
HEADERS = dict(Authorization=f"Bearer {ACCESS_TOKEN}")

# -------------------------------
# InsightFace setup
# -------------------------------
# Shared face analysis instance using the 'buffalo_l' model pack; it is used
# by detect_and_align_faces() for detection and embeddings.
# NOTE(review): ctx_id=0 presumably selects the first GPU - confirm the
# behaviour on CPU-only machines.
app = FaceAnalysis(name='buffalo_l')
app.prepare(ctx_id=0, det_size=(640, 640))

# -------------------------------
# Helper functions
# -------------------------------
def is_image_item(item):
    """Decide whether a drive item is a downloadable image.

    An item qualifies when it has a ``"file"`` entry, carries an
    ``"@microsoft.graph.downloadUrl"`` and either reports an ``image/*`` MIME
    type or has a known image file extension.

    Args:
        item: Dict describing one drive item.

    Returns:
        ``True`` if the item should be treated as a photo, else ``False``.
    """
    if "file" not in item:
        return False

    mime_type = item.get("file", {}).get("mimeType", "")
    filename = item.get("name", "").lower()

    if "@microsoft.graph.downloadUrl" not in item:
        return False

    known_suffixes = (".jpg", ".jpeg", ".png", ".heic", ".heif")
    has_image_suffix = filename.endswith(known_suffixes)
    return mime_type.startswith("image/") or has_image_suffix

def list_photos_in_folder(folder_path):
    """List all image items directly inside a drive folder.

    The children listing is paged by the Graph API; every page is fetched by
    following ``@odata.nextLink`` until it is absent, so folders larger than
    one page are not silently truncated.

    Args:
        folder_path: Folder path relative to the drive root.

    Returns:
        List of item dicts for which ``is_image_item`` is true.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    encoded_path = quote(folder_path)
    url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{encoded_path}:/children"
    photos = []
    while url:
        # Same timeout as the download request so a stalled call cannot hang forever.
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        photos.extend(f for f in payload.get("value", []) if is_image_item(f))
        # Absent on the last page, which ends the loop.
        url = payload.get("@odata.nextLink")
    return photos

def load_photo_image(photo):
    """Download a drive item and decode it into an RGB PIL image.

    Pillow is tried first. If it cannot decode the data and the file name or
    the response content type mentions HEIC/HEIF, ``pillow_heif`` is tried as
    a fallback.

    Args:
        photo: Item dict containing ``"@microsoft.graph.downloadUrl"`` and,
            optionally, ``"name"``.

    Returns:
        The decoded image converted to RGB, or ``None`` when the data could
        not be decoded.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    # Local import: the file-level PIL import does not bring in ImageOps.
    from PIL import ImageOps

    download_url = photo["@microsoft.graph.downloadUrl"]
    r = requests.get(download_url, timeout=30)
    r.raise_for_status()
    data = r.content

    try:
        img = Image.open(BytesIO(data))
        # Apply the EXIF orientation so rotated photos reach the detector upright.
        img = ImageOps.exif_transpose(img)
        return img.convert("RGB")
    except (OSError, ValueError, SyntaxError):
        # UnidentifiedImageError is an OSError; truncated or malformed files
        # raise the other types. Skipping the file keeps the batch running.
        pass

    content_type = r.headers.get("Content-Type", "").lower()
    name = photo.get("name", "").lower()
    looks_like_heif = any(
        tag in text for tag in ("heic", "heif") for text in (name, content_type)
    )
    if not looks_like_heif:
        return None

    try:
        heif = pillow_heif.read_heif(data)
        return heif.to_pillow().convert("RGB")
    except Exception:
        # Best effort: an undecodable HEIF file is skipped like any other.
        return None

# -------------------------------
# Face preprocessing
# -------------------------------
# Resize to 256x256, convert to a tensor, then normalise each of the three
# channels with mean 0.5 and std 0.5.
face_transform = T.Compose(
    [
        T.Resize((256, 256)),
        T.ToTensor(),
        T.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

def detect_and_align_faces(img_pil, size=256, min_face_size=50):
    """Detect faces in an image and return aligned crops with embeddings.

    Faces whose detected bounding box is narrower or shorter than
    ``min_face_size`` pixels in the source image are skipped, as are faces
    whose aligned crop is blurry. The size check uses the bounding box
    because the aligned crop is always ``size`` x ``size`` and so can never
    be "too small".

    Args:
        img_pil: RGB PIL image.
        size: Edge length of the aligned crop passed to ``norm_crop``.
        min_face_size: Minimum bounding-box width and height, in source
            pixels, for a face to be kept.

    Returns:
        List of dicts with keys ``"aligned_pil"`` (aligned RGB crop) and
        ``"embedding"`` (embedding scaled to unit length).
    """
    # RGB -> BGR; materialise the reversed view once as a contiguous array.
    img_np = np.ascontiguousarray(np.array(img_pil)[:, :, ::-1])
    faces = app.get(img_np)
    results = []
    for face in faces:
        # Filter on the face's size in the source image before cropping.
        x1, y1, x2, y2 = face.bbox[:4]
        if (x2 - x1) < min_face_size or (y2 - y1) < min_face_size:
            continue

        aligned_bgr = norm_crop(img_np, face.kps, image_size=size, mode='arcface')
        aligned_rgb = aligned_bgr[:, :, ::-1]
        aligned_pil = Image.fromarray(aligned_rgb)

        if is_blurry(aligned_pil):
            continue

        emb = face.embedding
        # The epsilon avoids a division by zero for an all-zero embedding.
        emb = emb / (np.linalg.norm(emb) + 1e-12)

        results.append({
            "aligned_pil": aligned_pil,
            "embedding": emb,
        })
    return results

def save_face_image(pil_img, out_path):
    """Resize a face crop through ``face_transform`` and save it as a JPEG.

    The normalisation applied by ``face_transform`` is undone before the
    tensor is converted back to an image.

    Args:
        pil_img: PIL image of the face crop.
        out_path: Destination file path. Missing parent directories are
            created; a bare file name is written to the current directory.
    """
    tensor = face_transform(pil_img)
    # Undo Normalize(mean=0.5, std=0.5) and clip to the valid [0, 1] range.
    tensor = (tensor * 0.5 + 0.5).clamp(0, 1)
    pil_out = T.ToPILImage()(tensor)
    out_dir = os.path.dirname(out_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pil_out.save(out_path, format="JPEG", quality=95)

# -------------------------------
# Cluster merge helper
# -------------------------------
# Tuning: if too many clusters get merged, lower the threshold (e.g. 0.22);
# if too few are merged, raise it.
def merge_close_clusters(centroids, threshold=0.25):
    """Group centroids whose cosine distance to a group's anchor is small.

    Centroids are visited in order. Each centroid not yet assigned starts a
    new group as its anchor and absorbs every later, still unassigned
    centroid whose cosine distance to the anchor is below ``threshold``.
    Every index therefore ends up in exactly one group. Merging is not
    transitive: members are compared with the anchor only.

    Args:
        centroids: Sequence of 1-D vectors.
        threshold: Cosine distance below which a centroid joins a group.

    Returns:
        Dict mapping each anchor index to the list of indices in its group,
        with the anchor first.
    """
    merged = {}
    used = set()
    for i, anchor in enumerate(centroids):
        if i in used:
            continue
        merged[i] = [i]
        for j in range(i + 1, len(centroids)):
            # Skip centroids already claimed by an earlier group; otherwise
            # one index could appear in several groups.
            if j in used:
                continue
            if cosine(anchor, centroids[j]) < threshold:
                merged[i].append(j)
                used.add(j)
    return merged

# -------------------------------
# Main loop
# -------------------------------
# Drive folders (relative to the drive root) that are scanned for photos.
folders = ["Bilder/Eigene Aufnahmen/2023"]

# Output root; one sub-directory per person is created underneath it.
ML_DIR = "../ml_photos"
os.makedirs(ML_DIR, exist_ok=True)

all_faces = []  # unit-length embedding of every kept face
meta = []       # parallel to all_faces: crop and origin of each face

# ---- Collect faces from all photos ----
for folder in folders:
    photos = list_photos_in_folder(folder)
    for photo in tqdm(photos):
        img_pil = load_photo_image(photo)
        if img_pil is None:
            # Photo could not be decoded.
            continue
        faces = detect_and_align_faces(img_pil)
        for idx_face, f in enumerate(faces):
            all_faces.append(f["embedding"])
            meta.append(
                dict(
                    aligned_pil=f["aligned_pil"],
                    photo_name=photo["name"],
                    face_index=idx_face,
                )
            )

if not all_faces:
    print("No faces found.")
else:
    X = np.stack(all_faces, axis=0)

    # ---- DBSCAN (cosine metric); label -1 marks noise ----
    clustering = DBSCAN(eps=0.35, min_samples=3, metric="cosine").fit(X)
    labels = clustering.labels_

    # ---- One unit-length centroid per cluster ----
    cluster_ids = sorted(set(labels) - {-1})
    centroids = []
    for cid in cluster_ids:
        mean_vec = X[labels == cid].mean(axis=0)
        centroids.append(mean_vec / (np.linalg.norm(mean_vec) + 1e-12))

    # ---- Merge clusters with nearby centroids ----
    merged = merge_close_clusters(centroids, threshold=0.25)

    # ---- Mapping old -> new: DBSCAN cluster id to consecutive person id ----
    cluster_to_person = {
        cluster_ids[old_id]: new_id
        for new_id, old_ids in enumerate(merged.values())
        for old_id in old_ids
    }

    new_labels = np.array(
        [-1 if lbl == -1 else cluster_to_person[lbl] for lbl in labels]
    )

    # ---- Save faces, numbered per target directory ----
    counters = {}
    for face_meta, label in zip(meta, new_labels):
        folder_name = "unknown" if label == -1 else f"person_{label:02d}"
        person_dir = os.path.join(ML_DIR, folder_name)

        idx = counters.get(person_dir, 0) + 1
        counters[person_dir] = idx

        base = face_meta["photo_name"]
        out_name = f"{os.path.splitext(base)[0]}_face{face_meta['face_index']:02d}_{idx:04d}.jpg"

        save_face_image(face_meta["aligned_pil"], os.path.join(person_dir, out_name))

In [None]:
import os
import shutil

# CONFIG: name of the folder inside ML_DIR that holds your own face crops.
# reorganize_ml_photos() renames this folder to "me".
MY_PERSON_FOLDER = "person_00"

# Directory that contains the per-person folders to reorganize.
ML_DIR = "../ml_photos"

def _unique_destination(directory, filename):
    """Return a path for *filename* inside *directory* that does not exist yet."""
    candidate = os.path.join(directory, filename)
    if not os.path.exists(candidate):
        return candidate
    stem, ext = os.path.splitext(filename)
    suffix = 1
    while True:
        candidate = os.path.join(directory, f"{stem}_{suffix}{ext}")
        if not os.path.exists(candidate):
            return candidate
        suffix += 1


def reorganize_ml_photos(ml_dir, my_folder):
    """Collapse the per-person folders in ``ml_dir`` into ``me`` and ``others``.

    * ``my_folder`` is renamed to ``me``, replacing any existing ``me``
      folder. If ``my_folder`` already is the ``me`` folder, it is left alone.
    * The ``unknown`` folder is deleted.
    * The contents of every other sub-directory are moved into ``others``
      and the emptied directory is removed. A file whose name already exists
      in ``others`` gets a numeric suffix instead of overwriting it.

    Args:
        ml_dir: Directory containing the per-person folders.
        my_folder: Name of the folder inside ``ml_dir`` to turn into ``me``.
    """
    me_path = os.path.join(ml_dir, my_folder)
    me_target = os.path.join(ml_dir, "me")
    if os.path.exists(me_path):
        already_me = os.path.exists(me_target) and os.path.samefile(me_path, me_target)
        # Without this guard the folder would be deleted and the rename would fail.
        if not already_me:
            if os.path.exists(me_target):
                shutil.rmtree(me_target)
            os.rename(me_path, me_target)
            print(f"Renamed {my_folder} to 'me'")

    others_target = os.path.join(ml_dir, "others")
    os.makedirs(others_target, exist_ok=True)

    # Sorted so that collision suffixes are assigned deterministically.
    for folder in sorted(os.listdir(ml_dir)):
        folder_path = os.path.join(ml_dir, folder)
        if folder in ("me", "others") or not os.path.isdir(folder_path):
            continue
        if folder == "unknown":
            shutil.rmtree(folder_path)
            print("Deleted folder 'unknown'")
            continue
        for file in sorted(os.listdir(folder_path)):
            src = os.path.join(folder_path, file)
            # Never overwrite a same-named file moved from another folder.
            dst = _unique_destination(others_target, file)
            shutil.move(src, dst)
        shutil.rmtree(folder_path)
        print(f"Moved all images from {folder} to 'others' and removed {folder}")

# Run the reorganization: this renames, moves and deletes folders under ML_DIR.
reorganize_ml_photos(ML_DIR, MY_PERSON_FOLDER)