In [None]:
import kagglehub
import numpy as np
import cv2
import dlib
from pathlib import Path
import urllib.request
import bz2
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import time
import itertools

# Download the dataset
path = kagglehub.dataset_download("sidharthangn/celebrity-face-dataset-augmented")
print("Dataset:", path)

# Configuration
SIZE = 256  # side length in pixels used when resizing images (was 128)
ALPHA_INTRA = 0.5  # blend weight passed to morph_faces in stage 1
ALPHA_INTER = 0.5  # blend weight passed to morph_faces in stage 2
NUM_INTRA_PER_PERSON = 30  # stage-1 morph attempts per person
MIN_IMAGES_PER_PERSON = 6  # folders with fewer images are ignored

# Output folders
OUTPUT_DIR = Path("./2stage_morphed_database_HQ")
INTRA_DIR = OUTPUT_DIR / "stage1_intra_morphs"
INTER_DIR = OUTPUT_DIR / "stage2_inter_morphs"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
INTRA_DIR.mkdir(parents=True, exist_ok=True)
INTER_DIR.mkdir(parents=True, exist_ok=True)

LOCAL_DATA_DIR = Path("./dlib_models")
LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)
PREDICTOR_PATH = LOCAL_DATA_DIR / "shape_predictor_68_face_landmarks.dat"

# Dlib landmark model: download and decompress it once.
# The archive is decompressed into a ".part" file that is renamed only after
# the whole stream has been written, so an interrupted run cannot leave a
# truncated model that the exists() check would accept on the next run.
if not PREDICTOR_PATH.exists():
    print("Téléchargement Dlib...")
    url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    compressed = LOCAL_DATA_DIR / "temp.bz2"
    partial = LOCAL_DATA_DIR / (PREDICTOR_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(url, compressed)
        with bz2.BZ2File(compressed, 'rb') as f_in, open(partial, 'wb') as f_out:
            # Stream in 1 MiB chunks instead of loading the whole model in memory.
            for chunk in iter(lambda: f_in.read(1 << 20), b""):
                f_out.write(chunk)
        partial.replace(PREDICTOR_PATH)
    finally:
        # Remove temporary files whether or not the download succeeded.
        for leftover in (compressed, partial):
            if leftover.exists():
                leftover.unlink()

detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(str(PREDICTOR_PATH))
print("[OK] Dlib chargé")

In [None]:
# Map each folder name to the image paths it directly contains, keeping only
# folders that hold at least MIN_IMAGES_PER_PERSON .jpg/.png files.
persons = defaultdict(list)
for candidate in Path(path).rglob("*"):
    if not candidate.is_dir():
        continue
    found = [*candidate.glob("*.jpg"), *candidate.glob("*.png")]
    if len(found) < MIN_IMAGES_PER_PERSON:
        continue
    persons[candidate.name] = [str(image_path) for image_path in found]

person_list = list(persons)
n_persons = len(person_list)

print(f"\n[OK] {n_persons} personnes trouvées")

In [None]:
def morph_triangle(img1, img2, img_morphed, t1, t2, t_morphed, alpha):
    """Warp one triangle of each source image onto the target triangle and blend it.

    The two warped patches are combined as a weighted average
    ``(1 - alpha) * patch1 + alpha * patch2`` and composited into
    ``img_morphed`` through an anti-aliased triangle mask.

    Args:
        img1, img2: Source images the triangles are sampled from.
        img_morphed: Destination image; modified in place.
        t1, t2, t_morphed: Three (x, y) vertices of the triangle in ``img1``,
            ``img2`` and the destination respectively.
        alpha: Weight of ``img2`` in the blend.

    Returns:
        None. Returns early, without drawing, when a bounding rectangle is
        degenerate or a source crop is empty.
    """
    r1 = cv2.boundingRect(np.float32([t1]))
    r2 = cv2.boundingRect(np.float32([t2]))
    r = cv2.boundingRect(np.float32([t_morphed]))

    # Any rectangle with a non-positive width or height cannot be warped.
    if min(r1[2], r1[3], r2[2], r2[3], r[2], r[3]) <= 0:
        return

    # Triangle vertices expressed relative to their bounding rectangle.
    t1_rect = [(t1[i][0] - r1[0], t1[i][1] - r1[1]) for i in range(3)]
    t2_rect = [(t2[i][0] - r2[0], t2[i][1] - r2[1]) for i in range(3)]
    t_rect = [(t_morphed[i][0] - r[0], t_morphed[i][1] - r[1]) for i in range(3)]

    img1_rect = img1[r1[1]:r1[1]+r1[3], r1[0]:r1[0]+r1[2]]
    img2_rect = img2[r2[1]:r2[1]+r2[3], r2[0]:r2[0]+r2[2]]

    if img1_rect.size == 0 or img2_rect.size == 0:
        return

    x, y, w_rect, h_rect = r
    size_rect = (w_rect, h_rect)

    warp_img1 = apply_affine_transform(img1_rect, t1_rect, t_rect, size_rect)
    warp_img2 = apply_affine_transform(img2_rect, t2_rect, t_rect, size_rect)

    # Weighted average of the two warped patches, kept inside [0, 255].
    img_rect = np.clip((1.0 - alpha) * warp_img1 + alpha * warp_img2, 0, 255)

    mask = np.zeros((h_rect, w_rect), dtype=np.float32)
    cv2.fillConvexPoly(mask, np.int32(t_rect), 1.0, cv2.LINE_AA, 0)

    # Composite every channel at once; the extra axis broadcasts the mask
    # over the channel dimension.
    coverage = mask[:, :, np.newaxis]
    target = img_morphed[y:y+h_rect, x:x+w_rect]
    target[...] = target * (1 - coverage) + img_rect * coverage

In [None]:
def morph_faces(img_path_a, img_path_b, alpha=0.5):
    """Morph two face images into a single SIZE x SIZE BGR image.

    Both images are read and resized to SIZE x SIZE, and their landmark points
    are obtained from prepare_points_for_image. The points are blended with
    weight ``alpha`` and triangulated with cv2.Subdiv2D, and each triangle is
    warped and blended by morph_triangle onto a canvas initialised to white.
    The result is brightened when its mean grey level is below 100, then
    CLAHE is applied to the L channel.

    Args:
        img_path_a: Path of the first image (weight ``1 - alpha``).
        img_path_b: Path of the second image (weight ``alpha``).
        alpha: Blend weight used for both the geometry and the colours.

    Returns:
        The morphed image as a uint8 array, or None when an image cannot be
        read, landmark points are missing, or the result is almost black.
    """
    imgA = cv2.imread(str(img_path_a))
    imgB = cv2.imread(str(img_path_b))

    if imgA is None or imgB is None:
        return None

    imgA_resized = cv2.resize(imgA, (SIZE, SIZE), interpolation=cv2.INTER_LANCZOS4)
    imgB_resized = cv2.resize(imgB, (SIZE, SIZE), interpolation=cv2.INTER_LANCZOS4)

    imgA_gray = cv2.cvtColor(imgA_resized, cv2.COLOR_BGR2GRAY)
    imgB_gray = cv2.cvtColor(imgB_resized, cv2.COLOR_BGR2GRAY)

    # Work in float32 on the 0-255 scale (no normalisation).
    imgA_color = imgA_resized.astype(np.float32)
    imgB_color = imgB_resized.astype(np.float32)

    ptsA = prepare_points_for_image(imgA_gray, SIZE, SIZE)
    ptsB = prepare_points_for_image(imgB_gray, SIZE, SIZE)
    # NOTE(review): prepare_points_for_image is defined outside this cell; this
    # guard assumes it may return None when no landmarks are found - confirm.
    if ptsA is None or ptsB is None:
        return None

    points_morphed = (1.0 - alpha) * ptsA + alpha * ptsB
    points_morphed = clamp_points(points_morphed, SIZE, SIZE)

    rect = (0, 0, SIZE, SIZE)
    subdiv = cv2.Subdiv2D(rect)

    for p in points_morphed:
        x, y = float(p[0]), float(p[1])
        if 0 <= x < SIZE and 0 <= y < SIZE:
            try:
                subdiv.insert((x, y))
            except cv2.error:
                # Best effort: skip a point OpenCV rejects, keep the others.
                continue

    triangle_list = subdiv.getTriangleList()

    # Convert each triangle's vertex coordinates back to landmark indices so
    # the same triangle can be addressed in ptsA, ptsB and points_morphed.
    tri_indices = []
    for t in triangle_list:
        tri_pts = [(t[0], t[1]), (t[2], t[3]), (t[4], t[5])]
        inds = []
        valid = True
        for p in tri_pts:
            idx = find_point_index(points_morphed, p, tol=3.0)
            if idx is None:
                valid = False
                break
            inds.append(idx)
        if valid and len(set(inds)) == 3:
            tri_indices.append(tuple(inds))

    tri_indices = list(set(tri_indices))

    # Canvas starts white (255), so pixels covered by no triangle stay white.
    img_morphed = np.full_like(imgA_color, 255.0, dtype=np.float32)

    for tri in tri_indices:
        i1, i2, i3 = tri
        tA = [ptsA[i1], ptsA[i2], ptsA[i3]]
        tB = [ptsB[i1], ptsB[i2], ptsB[i3]]
        tM = [points_morphed[i1], points_morphed[i2], points_morphed[i3]]

        if not (triangle_completely_inside(tA, SIZE, SIZE) and
                triangle_completely_inside(tB, SIZE, SIZE) and
                triangle_completely_inside(tM, SIZE, SIZE)):
            continue

        morph_triangle(imgA_color, imgB_color, img_morphed, tA, tB, tM, alpha)

    morph_result = np.clip(img_morphed, 0, 255).astype(np.uint8)

    # Post-processing
    # 1. Scale up dark results (mean grey < 100); the gain is capped at 2.
    mean_brightness = np.mean(cv2.cvtColor(morph_result, cv2.COLOR_BGR2GRAY))
    if mean_brightness < 100:
        factor = 100.0 / max(mean_brightness, 1)
        morph_result = cv2.convertScaleAbs(morph_result, alpha=min(factor, 2.0), beta=0)

    # 2. Adaptive histogram equalisation on the lightness channel only.
    morph_lab = cv2.cvtColor(morph_result, cv2.COLOR_BGR2LAB)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    morph_lab[:,:,0] = clahe.apply(morph_lab[:,:,0])
    morph_result = cv2.cvtColor(morph_lab, cv2.COLOR_LAB2BGR)

    # Final check: reject an almost black result.
    if np.mean(morph_result) < 10:
        return None

    return morph_result

In [None]:
def stage1_intra_morphing():
    """Stage 1: morph random pairs of images belonging to the same person.

    For every person in ``person_list``, NUM_INTRA_PER_PERSON attempts are
    made: two distinct images are drawn at random, morphed with ALPHA_INTRA
    and written to INTRA_DIR as ``<clean_name>_intra_<n>.png``.

    Returns:
        (intra_morphs, stats): ``intra_morphs`` maps each person name to the
        list of written file paths; ``stats`` holds the ``"successful"`` and
        ``"failed"`` counters.
    """

    print(f"\n{'='*60}")
    print("ÉTAPE 1 - MORPHING INTRA-PERSONNE (HAUTE QUALITÉ)")
    print(f"{'='*60}\n")

    intra_morphs = defaultdict(list)
    stats = {"successful": 0, "failed": 0}
    max_reported = 5  # show only the first few errors so the output stays short
    reported = 0

    for person_name in tqdm(person_list, desc="Étape 1 - Intra"):
        clean_name = sanitize_name(person_name)
        imgs = persons[person_name]

        for n in range(1, NUM_INTRA_PER_PERSON + 1):
            try:
                img_a, img_b = np.random.choice(imgs, size=2, replace=False)
                morph = morph_faces(img_a, img_b, alpha=ALPHA_INTRA)

                if morph is None:
                    stats["failed"] += 1
                    continue

                filepath = INTRA_DIR / f"{clean_name}_intra_{n}.png"
                # cv2.imwrite returns False when nothing was written; such a
                # path must not be handed to stage 2.
                if not cv2.imwrite(str(filepath), morph, [cv2.IMWRITE_PNG_COMPRESSION, 3]):
                    stats["failed"] += 1
                    continue

                intra_morphs[person_name].append(str(filepath))
                stats["successful"] += 1

            except Exception as exc:
                stats["failed"] += 1
                if reported < max_reported:
                    tqdm.write(f"[WARN] {person_name} #{n}: {type(exc).__name__}: {exc}")
                    reported += 1

    print(f"\n[ÉTAPE 1 TERMINÉE]")
    print(f"   - Images générées: {stats['successful']}")
    print(f"   - Échecs: {stats['failed']}")

    return intra_morphs, stats

In [None]:
def stage2_inter_morphing(intra_morphs):
    """Stage 2: morph stage-1 images of every pair of distinct persons.

    For each pair from ``person_list``, every stage-1 morph of the first
    person is morphed with every stage-1 morph of the second one using
    ALPHA_INTER. Results are written to INTER_DIR as
    ``<clean_a>-<clean_b>_<i>_<j>.png``.

    Args:
        intra_morphs: Mapping person name -> list of stage-1 image paths.
            Pairs where either person has no entry are skipped.

    Returns:
        dict with the ``"successful"`` and ``"failed"`` counters.
    """

    print(f"\n{'='*60}")
    print("ÉTAPE 2 - MORPHING INTER-PERSONNES (HAUTE QUALITÉ)")
    print(f"{'='*60}\n")

    stats = {"successful": 0, "failed": 0}
    person_pairs = list(itertools.combinations(person_list, 2))
    max_reported = 5  # show only the first few errors so the output stays short
    reported = 0

    print(f"   - Paires: {len(person_pairs)}\n")

    for person_a, person_b in tqdm(person_pairs, desc="Étape 2 - Inter"):
        morphs_a = intra_morphs.get(person_a, [])
        morphs_b = intra_morphs.get(person_b, [])

        if not morphs_a or not morphs_b:
            continue

        clean_a = sanitize_name(person_a)
        clean_b = sanitize_name(person_b)

        for i, morph_a in enumerate(morphs_a, 1):
            for j, morph_b in enumerate(morphs_b, 1):
                try:
                    hybrid = morph_faces(morph_a, morph_b, alpha=ALPHA_INTER)

                    if hybrid is None:
                        stats["failed"] += 1
                        continue

                    filepath = INTER_DIR / f"{clean_a}-{clean_b}_{i}_{j}.png"
                    # Count the image only when the file was actually written.
                    if cv2.imwrite(str(filepath), hybrid, [cv2.IMWRITE_PNG_COMPRESSION, 3]):
                        stats["successful"] += 1
                    else:
                        stats["failed"] += 1

                except Exception as exc:
                    stats["failed"] += 1
                    if reported < max_reported:
                        tqdm.write(
                            f"[WARN] {person_a}/{person_b} ({i},{j}): "
                            f"{type(exc).__name__}: {exc}"
                        )
                        reported += 1

    print(f"\n[ÉTAPE 2 TERMINÉE]")
    print(f"   - Images générées: {stats['successful']}")
    print(f"   - Échecs: {stats['failed']}")

    return stats

In [None]:
# Run both stages and time the whole pipeline.
start_time = time.time()
intra_morphs, stats1 = stage1_intra_morphing()
stats2 = stage2_inter_morphing(intra_morphs)
elapsed = time.time() - start_time

# Final summary, one printed line per entry.
for summary_line in (
    f"\n{'='*60}",
    "TERMINÉ - HAUTE QUALITÉ",
    f"{'='*60}",
    f"ÉTAPE 1: {stats1['successful']} images",
    f"ÉTAPE 2: {stats2['successful']} images",
    f"TOTAL: {stats1['successful'] + stats2['successful']}",
    f"Temps: {elapsed/60:.1f} min",
    f"{'='*60}",
):
    print(summary_line)

In [None]:
import cv2
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from collections import defaultdict
import shutil

# Source folder holding the images to analyse.
# NOTE(review): hard-coded absolute, machine-specific path. The generation
# cell writes stage-1 morphs to ./2stage_morphed_database_HQ/stage1_intra_morphs;
# confirm both point to the same directory before running elsewhere.
DATA_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\2stage_morphed_database_HQ\stage1_intra_morphs")

print("="*60)
print("DETECTION DES IMAGES DEFORMEES")
print("="*60)

# Collect every .png and .jpg file directly inside DATA_DIR (non-recursive).
image_files = list(DATA_DIR.glob("*.png")) + list(DATA_DIR.glob("*.jpg"))
print(f"Images a analyser: {len(image_files)}")

def detect_artifacts(img):
    """
    Compute heuristic artifact metrics for one image.

    The image is converted to grayscale when it has three dimensions, then
    the following values are measured and returned in a dict:
    - edge_density: fraction of pixels marked by Canny(50, 150)
    - black_ratio: fraction of pixels darker than 10
    - var_of_var: standard deviation of the 15x15 local variance map
    - laplacian_var: variance of the Laplacian
    - n_lines: number of segments found by the probabilistic Hough transform
    - asymmetry: mean absolute difference between the left half and the
      mirrored right half
    - gradient_ratio: maximum Sobel gradient magnitude over its mean
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img
    gray_f32 = gray.astype(np.float32)

    # Edge map, shared by the density metric and the Hough line count.
    edges = cv2.Canny(gray, 50, 150)
    segments = cv2.HoughLinesP(edges, 1, np.pi/180, 50, minLineLength=30, maxLineGap=10)

    # Local variance via E[x^2] - E[x]^2 over a 15x15 box filter.
    window = (15, 15)
    mean_local = cv2.blur(gray_f32, window)
    mean_sq_local = cv2.blur(gray_f32**2, window)
    variance_map = mean_sq_local - mean_local**2

    # Left half against the horizontally flipped right half, cropped to the
    # narrower of the two widths.
    _, width = gray.shape
    left = gray[:, :width//2]
    mirrored = cv2.flip(gray[:, width//2:], 1)
    common = min(left.shape[1], mirrored.shape[1])
    half_diff = left[:, :common].astype(np.float32) - mirrored[:, :common].astype(np.float32)

    # Sobel gradient magnitude.
    grad_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
    magnitude = np.sqrt(grad_x**2 + grad_y**2)

    return {
        'edge_density': np.sum(edges > 0) / edges.size,
        'black_ratio': np.sum(gray < 10) / gray.size,
        'var_of_var': np.std(variance_map),
        'laplacian_var': np.var(cv2.Laplacian(gray, cv2.CV_64F)),
        'n_lines': len(segments) if segments is not None else 0,
        'asymmetry': np.mean(np.abs(half_diff)),
        # The small epsilon avoids dividing by zero on a flat image.
        'gradient_ratio': np.max(magnitude) / (np.mean(magnitude) + 1e-6),
    }

# Score every image; unreadable files are skipped.
print("\nAnalyse en cours...")
results = []

for filepath in image_files:
    img = cv2.imread(str(filepath))
    if img is not None:
        # Metrics are computed on a 128x128 copy so they are comparable.
        scores = detect_artifacts(cv2.resize(img, (128, 128)))
        scores.update(filepath=filepath, filename=filepath.name)
        results.append(scores)

print(f"[OK] {len(results)} images analysees")

In [None]:
import pandas as pd

# One DataFrame row per analysed image.
df = pd.DataFrame(results)

print("="*60)
print("STATISTIQUES DES METRIQUES")
print("="*60)

# Summary statistics of each metric.
metrics = ['edge_density', 'black_ratio', 'var_of_var', 'laplacian_var', 'asymmetry', 'gradient_ratio']

for metric in metrics:
    column = df[metric]
    print(f"\n{metric}:")
    print(f"   Min: {column.min():.4f}")
    print(f"   Max: {column.max():.4f}")
    print(f"   Mean: {column.mean():.4f}")
    print(f"   Std: {column.std():.4f}")


def _standardize(column):
    """Return the z-score of a column (mean removed, divided by its std)."""
    return (column - column.mean()) / column.std()


# Combined score: sum of the z-scores of four metrics. According to the
# notebook author, deformed images tend to show a high edge density, a high
# asymmetry, a high gradient ratio and an irregular local variance, so a
# higher score means a more suspicious image.
df['quality_score'] = (
    _standardize(df['edge_density']) +
    _standardize(df['asymmetry']) +
    _standardize(df['gradient_ratio']) +
    _standardize(df['var_of_var'])
)

# Images scoring above mean + 1.5 std are flagged as suspicious.
threshold = df['quality_score'].mean() + 1.5 * df['quality_score'].std()

is_suspicious = df['quality_score'] > threshold
is_acceptable = df['quality_score'] <= threshold
bad_images = df[is_suspicious]
good_images = df[is_acceptable]

print(f"\n" + "="*60)
print(f"RESULTAT")
print(f"="*60)
print(f"   - Seuil de qualite: {threshold:.4f}")
print(f"   - Images OK: {len(good_images)}")
print(f"   - Images suspectes: {len(bad_images)}")

In [None]:
# Show the 20 images with the highest (most suspicious) score.
worst_images = df.nlargest(20, 'quality_score')

print("="*60)
print("TOP 20 IMAGES LES PLUS SUSPECTES")
print("="*60)

n_cols = 5
n_rows = 4
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12))
axes = axes.flatten()

# zip stops at the shorter sequence, so at most n_rows * n_cols images are drawn.
for ax, (_, row) in zip(axes, worst_images.iterrows()):
    img = cv2.imread(str(row['filepath']))
    if img is None:
        # The file was moved or became unreadable since the analysis: leave the cell empty.
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (128, 128))

    ax.imshow(img)
    ax.set_title(f"{row['filename'][:15]}\nScore: {row['quality_score']:.2f}", fontsize=8, color='red')

# Hide the ticks of every cell, including those left empty.
for ax in axes:
    ax.axis('off')

plt.suptitle("Images Suspectes (a verifier)", fontsize=14)
plt.tight_layout()
plt.savefig("suspicious_images.png", dpi=150)
plt.show()

print("\nFichiers suspects:")
for _, row in worst_images.iterrows():
    print(f"   - {row['filename']} (score: {row['quality_score']:.2f})")

In [None]:
# Show the 20 images with the lowest score, as a reference.
best_images = df.nsmallest(20, 'quality_score')

print("="*60)
print("TOP 20 MEILLEURES IMAGES")
print("="*60)

# Grid size is set here so the cell does not depend on values left behind by
# other cells.
n_rows = 4
n_cols = 5
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 12))
axes = axes.flatten()

# zip stops at the shorter sequence, so at most n_rows * n_cols images are drawn.
for ax, (_, row) in zip(axes, best_images.iterrows()):
    img = cv2.imread(str(row['filepath']))
    if img is None:
        # The file was moved or became unreadable since the analysis: leave the cell empty.
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (128, 128))

    ax.imshow(img)
    ax.set_title(f"{row['filename'][:15]}\nScore: {row['quality_score']:.2f}", fontsize=8, color='green')

# Hide the ticks of every cell, including those left empty.
for ax in axes:
    ax.axis('off')

plt.suptitle("Bonnes Images (reference)", fontsize=14)
plt.tight_layout()
plt.savefig("good_images.png", dpi=150)
plt.show()

In [None]:
# Manual verification: display the images whose score is above the median,
# most suspicious first, in pages of 25 thumbnails.
print("="*60)
print("VERIFICATION MANUELLE")
print("="*60)
print("Regardez les images et notez celles a supprimer")

# Keep rows above the median score, sorted by decreasing score
suspicious = df[df['quality_score'] > df['quality_score'].median()].sort_values('quality_score', ascending=False)

# Paginate in batches of 25 (ceiling division gives the page count)
n_per_page = 25
n_pages = (len(suspicious) + n_per_page - 1) // n_per_page

for page in range(min(3, n_pages)):  # Only the first 3 pages are displayed
    start = page * n_per_page
    end = min(start + n_per_page, len(suspicious))
    
    subset = suspicious.iloc[start:end]
    
    # NOTE: n_rows / n_cols are notebook-level names, so these values remain
    # set after this cell has run.
    n_rows = 5
    n_cols = 5
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15))
    axes = axes.flatten()
    
    for i, (_, row) in enumerate(subset.iterrows()):
        img = cv2.imread(str(row['filepath']))
        # Unreadable files leave their grid cell empty
        if img is not None:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (128, 128))
            axes[i].imshow(img)
            axes[i].set_title(f"{row['filename'][:20]}\n{row['quality_score']:.1f}", fontsize=7)
        axes[i].axis('off')
    
    # Hide the cells of a partially filled last page
    for i in range(len(subset), len(axes)):
        axes[i].axis('off')
    
    plt.suptitle(f"Page {page+1}/{n_pages} - Images a verifier", fontsize=14)
    plt.tight_layout()
    # Each page is also saved as verification_page_<n>.png in the working directory
    plt.savefig(f"verification_page_{page+1}.png", dpi=150)
    plt.show()

In [None]:
# Output folders for the kept and the rejected images.
# NOTE(review): hard-coded absolute, machine-specific paths.
CLEAN_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\cleaned_morphed_database_v2")
REJECTED_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\rejected_images_v2")

CLEAN_DIR.mkdir(parents=True, exist_ok=True)
REJECTED_DIR.mkdir(parents=True, exist_ok=True)

print("="*60)
print("NETTOYAGE AUTOMATIQUE")
print("="*60)

# Stricter threshold than the report above: reject scores > mean + 1 * std.
strict_threshold = df['quality_score'].mean() + 1.0 * df['quality_score'].std()

good_df = df[df['quality_score'] <= strict_threshold]
bad_df = df[df['quality_score'] > strict_threshold]

print(f"Seuil strict: {strict_threshold:.4f}")
print(f"   - Images a garder: {len(good_df)}")
print(f"   - Images a rejeter: {len(bad_df)}")

# Kept images are re-encoded at 128x128 into CLEAN_DIR.
copied = 0
for _, row in good_df.iterrows():
    filepath = row['filepath']
    img = cv2.imread(str(filepath))
    if img is None:
        continue
    img = cv2.resize(img, (128, 128))
    new_path = CLEAN_DIR / filepath.name
    # cv2.imwrite returns False on failure: count only files actually written.
    if cv2.imwrite(str(new_path), img):
        copied += 1

# Rejected images are copied unchanged, for reference.
rejected = 0
for _, row in bad_df.iterrows():
    filepath = row['filepath']
    if filepath.exists():
        shutil.copy(str(filepath), str(REJECTED_DIR / filepath.name))
        rejected += 1

print(f"\n[OK] Nettoyage termine!")
print(f"   - Images copiees: {copied}")
print(f"   - Images rejetees: {rejected}")
print(f"   - Dossier propre: {CLEAN_DIR}")
print(f"   - Dossier rejete: {REJECTED_DIR}")

In [None]:
# Inspect the cleaned data: how many images remain for each person.
MIN_IMAGES = 6  # minimum number of images for a person to be considered valid
clean_files = [*CLEAN_DIR.glob("*.png"), *CLEAN_DIR.glob("*.jpg")]

# Group files per person. Expected stems: PersonName_intra_N or PersonName_N;
# the person is everything before "intra", or before the last "_" field.
clean_persons = defaultdict(list)
for f in clean_files:
    parts = f.stem.split("_")
    cut = parts.index("intra") if "intra" in parts else len(parts) - 1
    clean_persons["_".join(parts[:cut])].append(f)

print("="*60)
print("DATA NETTOYEE - DISTRIBUTION")
print("="*60)

print(f"\nPersonnes: {len(clean_persons)}")
print(f"Total images: {len(clean_files)}")

# Largest groups first.
print(f"\nDistribution par personne:")
for person, files in sorted(clean_persons.items(), key=lambda x: -len(x[1])):
    status = "OK" if len(files) >= MIN_IMAGES else "PEU"
    print(f"   {person}: {len(files)} images [{status}]")

# Keep only the persons with enough images.
valid_persons = {
    person: files
    for person, files in clean_persons.items()
    if len(files) >= MIN_IMAGES
}
print(f"\nPersonnes avec {MIN_IMAGES}+ images: {len(valid_persons)}")
print(f"Total images valides: {sum(len(v) for v in valid_persons.values())}")

In [None]:
import cv2
import numpy as np
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt

# Source folder holding the stage-2 images.
# NOTE(review): hard-coded absolute, machine-specific path.
SOURCE_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\2stage_morphed_database_HQ\stage2_inter_morphs")

image_files = list(SOURCE_DIR.glob("*.png")) + list(SOURCE_DIR.glob("*.jpg"))
print(f"Images source: {len(image_files)}")

# Analyse a random sample (at most 500 images) to pick sensible thresholds.
# No random seed is set here, so the sample differs between runs.
sample_size = min(500, len(image_files))
sample_files = np.random.choice(image_files, sample_size, replace=False)

print(f"\nAnalyse de {sample_size} images...")

# One list of values per metric; later cells read this dict to plot the
# distributions and to derive percentile thresholds.
stats = {
    'brightness': [],
    'edge_ratio': [],
    'asymmetry': [],
    'laplacian_var': []
}

for filepath in sample_files:
    img = cv2.imread(str(filepath))
    if img is None:
        continue
    
    # Metrics are computed on a 128x128 grayscale copy
    img = cv2.resize(img, (128, 128))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Brightness: mean grey level
    stats['brightness'].append(np.mean(gray))
    
    # Edge ratio: fraction of pixels marked by Canny
    edges = cv2.Canny(gray, 50, 150)
    stats['edge_ratio'].append(np.sum(edges > 0) / edges.size)
    
    # Asymmetry: mean absolute difference between the left half and the
    # mirrored right half
    h, w = gray.shape
    left = gray[:, :w//2]
    right = cv2.flip(gray[:, w//2:], 1)
    min_w = min(left.shape[1], right.shape[1])
    stats['asymmetry'].append(np.mean(np.abs(left[:, :min_w].astype(float) - right[:, :min_w].astype(float))))
    
    # Laplacian variance (blur detection)
    lap = cv2.Laplacian(gray, cv2.CV_64F)
    stats['laplacian_var'].append(np.var(lap))

# Print the summary statistics of each metric
print("\n" + "="*60)
print("STATISTIQUES DES IMAGES")
print("="*60)

for metric, values in stats.items():
    values = np.array(values)
    print(f"\n{metric}:")
    print(f"   Min: {values.min():.4f}")
    print(f"   Max: {values.max():.4f}")
    print(f"   Mean: {values.mean():.4f}")
    print(f"   Std: {values.std():.4f}")
    print(f"   Percentile 10%: {np.percentile(values, 10):.4f}")
    print(f"   Percentile 90%: {np.percentile(values, 90):.4f}")

In [None]:
# One histogram per metric on a 2x2 grid, with the mean and the 95th
# percentile drawn as dashed vertical lines.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, (metric, values) in zip(axes.flat, stats.items()):
    mean_value = np.mean(values)
    p95_value = np.percentile(values, 95)

    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution: {metric}')
    ax.set_xlabel(metric)
    ax.set_ylabel('Frequence')
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.3f}')
    ax.axvline(p95_value, color='green', linestyle='--', label=f'95%: {p95_value:.3f}')
    ax.legend()

plt.tight_layout()
plt.savefig("stats_distribution.png", dpi=150)
plt.show()

print("\n[INFO] Regardez les distributions pour choisir les seuils")

In [None]:
# Sort a subset of the sample by edge ratio to compare the two extremes.
sample_with_stats = []

for filepath in sample_files[:200]:
    img = cv2.imread(str(filepath))
    if img is None:
        continue

    img = cv2.resize(img, (128, 128))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    edges = cv2.Canny(gray, 50, 150)
    edge_ratio = np.sum(edges > 0) / edges.size

    sample_with_stats.append((filepath, edge_ratio, img))

# Ascending edge ratio.
sample_with_stats.sort(key=lambda x: x[1])


def _show_edge_ratio_grid(entries, title):
    """Draw up to 10 (filepath, edge_ratio, image) entries on a 2x5 grid.

    Fewer than 10 entries simply leave the remaining cells empty.
    """
    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    axes = axes.flatten()

    for ax, (_, edge_ratio, img) in zip(axes, entries):
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(f"edge={edge_ratio:.3f}", fontsize=9)

    for ax in axes:
        ax.axis('off')

    plt.suptitle(title, fontsize=14)
    plt.tight_layout()
    plt.show()


# Lowest edge ratios (probably good images).
print("="*60)
print("IMAGES AVEC EDGE_RATIO BAS (bonnes)")
print("="*60)

_show_edge_ratio_grid(sample_with_stats[:10], "Edge Ratio BAS (probablement bonnes)")

# Highest edge ratios, largest first (possibly bad images).
print("\n" + "="*60)
print("IMAGES AVEC EDGE_RATIO HAUT (possiblement mauvaises)")
print("="*60)

_show_edge_ratio_grid(sample_with_stats[::-1][:10], "Edge Ratio HAUT (possiblement mauvaises)")

In [None]:
import shutil

# Destination folder.
# NOTE(review): hard-coded absolute, machine-specific path.
CLEAN_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\final_clean_dataset")
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

# Remove the files left by a previous run. Sub-directories are skipped
# because Path.unlink() cannot remove them and would raise.
for f in CLEAN_DIR.glob("*"):
    if f.is_file():
        f.unlink()

print("="*60)
print("NETTOYAGE AVEC SEUILS AJUSTES")
print("="*60)

# Thresholds derived from the percentiles of the sampled statistics.
edge_threshold = np.percentile(stats['edge_ratio'], 90)  # reject above the 90th percentile
brightness_min = np.percentile(stats['brightness'], 5)   # reject the darkest 5%
asymmetry_threshold = np.percentile(stats['asymmetry'], 95)  # reject above the 95th percentile

print(f"\nSeuils utilises:")
print(f"   - Edge ratio max: {edge_threshold:.4f}")
print(f"   - Brightness min: {brightness_min:.4f}")
print(f"   - Asymmetry max: {asymmetry_threshold:.4f}")

# Classify every source image.
good_images = []
bad_images = []

for filepath in image_files:
    img = cv2.imread(str(filepath))
    if img is None:
        bad_images.append((filepath, "corrupted"))
        continue

    img_resized = cv2.resize(img, (128, 128))
    gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)

    # Same metrics as the sampling cell, so the thresholds are comparable.
    brightness = np.mean(gray)

    edges = cv2.Canny(gray, 50, 150)
    edge_ratio = np.sum(edges > 0) / edges.size

    h, w = gray.shape
    left = gray[:, :w//2]
    right = cv2.flip(gray[:, w//2:], 1)
    min_w = min(left.shape[1], right.shape[1])
    asymmetry = np.mean(np.abs(left[:, :min_w].astype(float) - right[:, :min_w].astype(float)))

    # Rejection criteria; only the first one that matches is recorded.
    if brightness < brightness_min:
        bad_images.append((filepath, "too_dark"))
    elif edge_ratio > edge_threshold:
        bad_images.append((filepath, "distorted"))
    elif asymmetry > asymmetry_threshold:
        bad_images.append((filepath, "asymmetric"))
    else:
        good_images.append(filepath)

print(f"\nResultats:")
print(f"   - Images OK: {len(good_images)}")
print(f"   - Images rejetees: {len(bad_images)}")

# Number of rejections per reason.
reasons = defaultdict(int)
for _, reason in bad_images:
    reasons[reason] += 1
print(f"\nRaisons de rejet:")
for reason, count in reasons.items():
    print(f"   - {reason}: {count}")

In [None]:
print("\n" + "="*60)
print("COPIE DES IMAGES VALIDES")
print("="*60)


def _person_from_stem(name):
    """Return the class label encoded in an image file stem.

    Handled formats:
    - ``<person>_intra_<n>``  -> ``<person>``
    - ``<person>_inter_<n>``  -> ``<person>``
    - ``<a>-<b>_<i>_<j>``     -> ``<a>-<b>`` (stage-2 morphs; both numeric
      suffixes are removed so all morphs of one pair share one label)
    - ``<person>_<n>``        -> ``<person>``
    - anything else           -> the stem unchanged
    """
    if "_intra_" in name:
        return name.split("_intra_")[0]
    if "_inter_" in name:
        return name.split("_inter_")[0]

    # Stage-2 files carry two indices; removing only the last one would turn
    # every "<a>-<b>_<i>" into a separate class.
    parts = name.rsplit("_", 2)
    if len(parts) == 3 and "-" in parts[0] and parts[1].isdigit() and parts[2].isdigit():
        return parts[0]

    parts = name.rsplit("_", 1)
    return parts[0] if len(parts) == 2 and parts[1].isdigit() else name


# Group the accepted images by person.
persons = defaultdict(list)

for filepath in good_images:
    persons[_person_from_stem(filepath.stem)].append(filepath)

print(f"Personnes detectees: {len(persons)}")

# Distribution, largest groups first.
print("\nDistribution par personne:")
for person, files in sorted(persons.items(), key=lambda x: -len(x[1])):
    print(f"   {person}: {len(files)} images")

# Keep only the persons with enough images.
MIN_IMAGES = 6
valid_persons = {k: v for k, v in persons.items() if len(v) >= MIN_IMAGES}

print(f"\nPersonnes avec {MIN_IMAGES}+ images: {len(valid_persons)}")
print(f"Total images valides: {sum(len(v) for v in valid_persons.values())}")

In [None]:
# Copy the images of every valid person into CLEAN_DIR.
MAX_PER_PERSON = 30  # cap per person, to keep the classes balanced

copied = 0
for person, files in valid_persons.items():
    # Only the first MAX_PER_PERSON files of each person are used.
    selected = files[:MAX_PER_PERSON]

    for i, filepath in enumerate(selected, 1):
        img = cv2.imread(str(filepath))
        if img is None:
            continue
        img = cv2.resize(img, (128, 128))
        new_filename = f"{person}_{i}.png"
        new_path = CLEAN_DIR / new_filename
        # cv2.imwrite returns False on failure: count only files actually written.
        if cv2.imwrite(str(new_path), img):
            copied += 1

print(f"\n[OK] {copied} images copiees dans {CLEAN_DIR}")

# Verify what is actually on disk.
final_files = list(CLEAN_DIR.glob("*.png"))
print(f"[OK] Verification: {len(final_files)} fichiers")

# Count per person; the person is the stem without its last "_<n>" field.
final_persons = defaultdict(int)
for f in final_files:
    person = f.stem.rsplit("_", 1)[0]
    final_persons[person] += 1

print(f"\nDataset final:")
print(f"   - Personnes: {len(final_persons)}")
print(f"   - Images totales: {len(final_files)}")
if final_persons:
    print(f"   - Images par personne: {min(final_persons.values())} - {max(final_persons.values())}")
else:
    # min()/max() raise on an empty collection; report the empty result instead.
    print("   - Images par personne: n/a (aucune image copiee)")

In [None]:
# Show a random sample of the final dataset.
# A preview.png written into the dataset folder by an earlier run is not a
# sample and is excluded from the listing.
final_files = [p for p in CLEAN_DIR.glob("*.png") if p.name != "preview.png"]

if final_files:
    n_samples = min(20, len(final_files))
    sample_files = np.random.choice(final_files, n_samples, replace=False)

    rows = (n_samples + 4) // 5
    fig, axes = plt.subplots(rows, 5, figsize=(15, 3*rows))
    axes = axes.flatten()

    for ax, filepath in zip(axes, sample_files):
        img = cv2.imread(str(filepath))
        if img is None:
            # Unreadable file: leave the cell empty.
            continue
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(filepath.stem[:15], fontsize=8)

    # Hide the ticks of every cell, including those left empty.
    for ax in axes:
        ax.axis('off')

    plt.suptitle(f"Dataset Final - {len(final_files)} images", fontsize=14)
    plt.tight_layout()
    # The preview is saved next to the dataset folder, not inside it, so a
    # later "*.png" listing of the dataset does not pick it up as an image.
    preview_path = CLEAN_DIR.parent / f"{CLEAN_DIR.name}_preview.png"
    plt.savefig(preview_path, dpi=150)
    plt.show()

    print(f"\n[OK] Dataset pret dans: {CLEAN_DIR}")
    print(f"[OK] Vous pouvez maintenant lancer l'entrainement CNN!")

In [None]:
from pathlib import Path
import cv2
import numpy as np
from collections import defaultdict, Counter

# Source folder
DATA_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\final_clean_dataset")


def person_from_stem(name):
    """Return the person (class) label encoded in an image file stem.

    - "<person>_intra_<...>" or "<person>_inter_<...>" -> "<person>"
    - "<person>_<digits>" -> "<person>"
    - anything else -> the stem unchanged
    """
    for marker in ("_intra_", "_inter_"):
        if marker in name:
            return name.split(marker)[0]
    parts = name.rsplit("_", 1)
    return parts[0] if len(parts) == 2 and parts[1].isdigit() else name


# Count images per class
files = list(DATA_DIR.glob("*.png")) + list(DATA_DIR.glob("*.jpg"))

persons = defaultdict(int)
for f in files:
    persons[person_from_stem(f.stem)] += 1

print("="*60)
print("DIAGNOSTIC DU DATASET")
print("="*60)
print(f"\nImages totales: {len(files)}")
print(f"Classes (personnes): {len(persons)}")
if persons:
    print(f"Images par classe: {min(persons.values())} - {max(persons.values())}")
    print(f"Moyenne: {np.mean(list(persons.values())):.1f}")
else:
    # min()/max() raise ValueError on an empty sequence; report it instead
    print(f"[ATTENTION] Aucune image trouvee dans {DATA_DIR}")

# Distribution, largest classes first
print(f"\nDistribution:")
for person, count in sorted(persons.items(), key=lambda x: -x[1])[:20]:
    print(f"   {person}: {count}")

if len(persons) > 20:
    print(f"   ... et {len(persons) - 20} autres classes")

# Warn when there are more than 50 classes
if len(persons) > 50:
    print(f"\n[PROBLEME] Trop de classes ({len(persons)})!")
    print("   -> Le modele a du mal a apprendre")
    print("   -> Solution: Reduire le nombre de classes")

In [None]:
# Keep only the TOP_N people with the most images
TOP_N = 14  # Same number of classes as the original Celebrity dataset

# Rank people by image count, largest first
sorted_persons = sorted(persons.items(), key=lambda x: -x[1])
selected_persons = [p[0] for p in sorted_persons[:TOP_N]]
selected_lookup = set(selected_persons)  # constant-time membership in the copy loop

print("="*60)
print(f"SELECTION DES TOP {TOP_N} PERSONNES")
print("="*60)

for i, (person, count) in enumerate(sorted_persons[:TOP_N]):
    print(f"   {i+1}. {person}: {count} images")

# Create the output folder
NEW_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\dataset_top14")
NEW_DIR.mkdir(parents=True, exist_ok=True)

# Remove files left by a previous run. unlink() cannot remove directories,
# so anything that is not a regular file is left untouched.
for f in NEW_DIR.glob("*"):
    if f.is_file():
        f.unlink()

# Copy the images of the selected people, resized to 64x64
copied = 0
# Per-person running number for output names. Counting in memory avoids
# re-scanning NEW_DIR for every image and avoids "<person>_*.png" also
# matching files of another person whose name starts with "<person>_".
written_per_person = defaultdict(int)
for f in files:
    name = f.stem
    if "_intra_" in name:
        person = name.split("_intra_")[0]
    elif "_inter_" in name:
        person = name.split("_inter_")[0]
    else:
        parts = name.rsplit("_", 1)
        person = parts[0] if len(parts) == 2 and parts[1].isdigit() else name

    if person not in selected_lookup:
        continue
    img = cv2.imread(str(f))
    if img is None:
        # cv2.imread returns None when the file cannot be decoded
        continue
    img = cv2.resize(img, (64, 64))
    # New name: "<person>_<n>.png"
    idx = written_per_person[person] + 1
    new_path = NEW_DIR / f"{person}_{idx}.png"
    # Only count the image when cv2.imwrite reports success
    if cv2.imwrite(str(new_path), img):
        written_per_person[person] = idx
        copied += 1

print(f"\n[OK] {copied} images copiees dans {NEW_DIR}")

# Verify by counting what is on disk
final_counts = defaultdict(int)
for f in NEW_DIR.glob("*.png"):
    person = f.stem.rsplit("_", 1)[0]
    final_counts[person] += 1

print(f"\nDataset final:")
for person, count in sorted(final_counts.items()):
    print(f"   {person}: {count} images")

In [None]:
# NOTE: this cell repeats the top-N selection cell above; running it rebuilds
# NEW_DIR from scratch.
# Keep only the TOP_N people with the most images
TOP_N = 14  # Same number of classes as the original Celebrity dataset

# Rank people by image count, largest first
sorted_persons = sorted(persons.items(), key=lambda x: -x[1])
selected_persons = [p[0] for p in sorted_persons[:TOP_N]]
selected_lookup = set(selected_persons)  # constant-time membership in the copy loop

print("="*60)
print(f"SELECTION DES TOP {TOP_N} PERSONNES")
print("="*60)

for i, (person, count) in enumerate(sorted_persons[:TOP_N]):
    print(f"   {i+1}. {person}: {count} images")

# Create the output folder
NEW_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\dataset_top14")
NEW_DIR.mkdir(parents=True, exist_ok=True)

# Remove files left by a previous run. unlink() cannot remove directories,
# so anything that is not a regular file is left untouched.
for f in NEW_DIR.glob("*"):
    if f.is_file():
        f.unlink()

# Copy the images of the selected people, resized to 64x64
copied = 0
# Per-person running number for output names. Counting in memory avoids
# re-scanning NEW_DIR for every image and avoids "<person>_*.png" also
# matching files of another person whose name starts with "<person>_".
written_per_person = defaultdict(int)
for f in files:
    name = f.stem
    if "_intra_" in name:
        person = name.split("_intra_")[0]
    elif "_inter_" in name:
        person = name.split("_inter_")[0]
    else:
        parts = name.rsplit("_", 1)
        person = parts[0] if len(parts) == 2 and parts[1].isdigit() else name

    if person not in selected_lookup:
        continue
    img = cv2.imread(str(f))
    if img is None:
        # cv2.imread returns None when the file cannot be decoded
        continue
    img = cv2.resize(img, (64, 64))
    # New name: "<person>_<n>.png"
    idx = written_per_person[person] + 1
    new_path = NEW_DIR / f"{person}_{idx}.png"
    # Only count the image when cv2.imwrite reports success
    if cv2.imwrite(str(new_path), img):
        written_per_person[person] = idx
        copied += 1

print(f"\n[OK] {copied} images copiees dans {NEW_DIR}")

# Verify by counting what is on disk
final_counts = defaultdict(int)
for f in NEW_DIR.glob("*.png"):
    person = f.stem.rsplit("_", 1)[0]
    final_counts[person] += 1

print(f"\nDataset final:")
for person, count in sorted(final_counts.items()):
    print(f"   {person}: {count} images")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# Load the dataset
DATA_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\dataset_top14")
IMG_SIZE = 64

images = []
labels = []

# glob() does not guarantee any ordering. Sorting fixes the sample order, so
# the seeded train/test split that indexes into `images` is the same on
# every run and every machine.
for filepath in sorted(DATA_DIR.glob("*.png")):
    # File stems have the form "<person>_<n>"
    person_name = filepath.stem.rsplit("_", 1)[0]

    img = cv2.imread(str(filepath))
    if img is not None:
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
        images.append(img)
        labels.append(person_name)

if not images:
    # Fail here with a clear message instead of later with a shape error
    raise FileNotFoundError(f"Aucune image .png lisible dans {DATA_DIR}")

# Stack into one float32 array scaled to [0, 1]
images = np.array(images, dtype=np.float32) / 255.0

# Map class names to integer ids in alphabetical order
unique_labels = sorted(set(labels))
label_to_idx = {name: idx for idx, name in enumerate(unique_labels)}
idx_to_label = {idx: name for name, idx in label_to_idx.items()}

y = np.array([label_to_idx[name] for name in labels])
NUM_CLASSES = len(unique_labels)

print("="*60)
print("DATASET CHARGE")
print("="*60)
print(f"   - Images: {images.shape}")
print(f"   - Classes: {NUM_CLASSES}")

for idx, name in idx_to_label.items():
    count = np.sum(y == idx)
    print(f"   {idx}: {name} ({count} images)")

In [None]:
# Group sample positions by class id so each class is split separately
indices_by_class = defaultdict(list)
for sample_pos, class_id in enumerate(y):
    indices_by_class[class_id].append(sample_pos)

train_indices, test_indices = [], []

print("\n" + "="*60)
print("SPLIT 80% TRAIN / 20% TEST")
print("="*60)

for class_idx, members in sorted(indices_by_class.items()):
    class_indices = list(members)
    n_total = len(class_indices)
    n_train = int(0.8 * n_total)

    # The generator is re-seeded for every class before shuffling its members
    np.random.seed(42)
    np.random.shuffle(class_indices)

    # First 80% (rounded down) go to train, the remainder to test
    train_indices += class_indices[:n_train]
    test_indices += class_indices[n_train:]

    print(f"   {idx_to_label[class_idx]}: {n_train} train / {n_total - n_train} test")

# Mix the classes inside the training set; the test set keeps class order
np.random.seed(42)
np.random.shuffle(train_indices)

X_train, y_train = images[train_indices], y[train_indices]
X_test, y_test = images[test_indices], y[test_indices]

# One-hot targets for categorical cross-entropy
y_train_cat = to_categorical(y_train, NUM_CLASSES)
y_test_cat = to_categorical(y_test, NUM_CLASSES)

print(f"\n[OK] Train: {X_train.shape[0]} | Test: {X_test.shape[0]}")

In [None]:
# Simple CNN: three Conv2D + MaxPooling2D stages (32, 64, 128 filters)
# followed by a 512-unit dense layer, dropout and a softmax output
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(NUM_CLASSES, activation='softmax'))

# One-hot targets -> categorical cross-entropy
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

rule = "="*60
print(rule)
print("MODELE CNN")
print(rule)
print(f"   - Classes: {NUM_CLASSES}")
print(f"   - Parametres: {model.count_params():,}")
model.summary()

In [None]:
rule = "="*60
print("\n" + rule)
print("ENTRAINEMENT")
print(rule)

EPOCHS = 50
BATCH_SIZE = 32

# The held-out test split doubles as the validation set during fitting
fit_options = dict(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test_cat),
    verbose=1,
)
history = model.fit(X_train, y_train_cat, **fit_options)

In [None]:
# Score the trained model on both splits (silent evaluation)
train_loss, train_acc = model.evaluate(X_train, y_train_cat, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)

report_lines = (
    "\n" + "="*60,
    "RESULTATS",
    "="*60,
    f"   - Train Accuracy: {train_acc*100:.2f}%",
    f"   - Test Accuracy: {test_acc*100:.2f}%",
    f"   - Overfitting: {(train_acc - test_acc)*100:.2f}%",
)
for line in report_lines:
    print(line)

# Class probabilities for the test set, then the most likely class per sample
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

In [None]:
# Score the trained model on both splits (silent evaluation)
train_loss, train_acc = model.evaluate(X_train, y_train_cat, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)

report_lines = (
    "\n" + "="*60,
    "RESULTATS",
    "="*60,
    f"   - Train Accuracy: {train_acc*100:.2f}%",
    f"   - Test Accuracy: {test_acc*100:.2f}%",
    f"   - Overfitting: {(train_acc - test_acc)*100:.2f}%",
)
for line in report_lines:
    print(line)

# Class probabilities for the test set, then the most likely class per sample
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

In [None]:
# Confusion matrix of true vs predicted class ids on the test set
cm = confusion_matrix(y_test, y_pred_classes)

plt.figure(figsize=(10, 8))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

# Class names clipped to 10 characters label both axes
short_names = [idx_to_label[k][:10] for k in range(NUM_CLASSES)]
tick_marks = np.arange(NUM_CLASSES)
plt.xticks(tick_marks, short_names, rotation=45, ha='right')
plt.yticks(tick_marks, short_names)

# Write each count in its cell; white text on cells above half the maximum
thresh = cm.max() / 2.
for (i, j), cell_count in np.ndenumerate(cm):
    text_color = "white" if cell_count > thresh else "black"
    plt.text(j, i, format(cell_count, 'd'),
             ha="center", va="center",
             color=text_color)

plt.ylabel('True')
plt.xlabel('Predicted')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=150)
plt.show()

In [None]:
rule = "="*60
print("\n" + rule)
print("CLASSIFICATION REPORT")
print(rule + "\n")

# Class names ordered by class id, as classification_report expects
target_names = []
for class_id in range(NUM_CLASSES):
    target_names.append(idx_to_label[class_id])

report = classification_report(y_test, y_pred_classes, target_names=target_names)
print(report)

In [None]:
# Show up to 16 random test images with their predicted label.
# Sampling without replacement cannot draw more items than the test set
# holds, so the sample size is capped at len(X_test).
n_samples = min(16, len(X_test))
indices = np.random.choice(len(X_test), n_samples, replace=False)

fig, axes = plt.subplots(4, 4, figsize=(12, 12))
axes = axes.flatten()

correct = 0
for i, idx in enumerate(indices):
    img = X_test[idx]
    true_label = idx_to_label[y_test[idx]]
    pred_label = idx_to_label[y_pred_classes[idx]]
    conf = np.max(y_pred[idx]) * 100  # highest class probability, in percent

    axes[i].imshow(img)

    if true_label == pred_label:
        color = 'green'
        title = f"OK: {pred_label[:10]}\n({conf:.0f}%)"
        correct += 1
    else:
        color = 'red'
        title = f"Pred: {pred_label[:8]}\nTrue: {true_label[:8]}"

    axes[i].set_title(title, fontsize=9, color=color)
    axes[i].axis('off')

# Blank the panels left unused when fewer than 16 samples are available
for ax in axes[n_samples:]:
    ax.axis('off')

# Guard the percentage against an empty test set
pct_correct = correct / n_samples * 100 if n_samples else 0.0
plt.suptitle(f'Predictions: {correct}/{n_samples} correct ({pct_correct:.0f}%)', fontsize=14)
plt.tight_layout()
plt.savefig('predictions.png', dpi=150)
plt.show()

In [None]:
# NOTE: this cell repeats the prediction preview above; it draws a new random
# sample and overwrites 'predictions.png'.
# Sampling without replacement cannot draw more items than the test set
# holds, so the sample size is capped at len(X_test).
n_samples = min(16, len(X_test))
indices = np.random.choice(len(X_test), n_samples, replace=False)

fig, axes = plt.subplots(4, 4, figsize=(12, 12))
axes = axes.flatten()

correct = 0
for i, idx in enumerate(indices):
    img = X_test[idx]
    true_label = idx_to_label[y_test[idx]]
    pred_label = idx_to_label[y_pred_classes[idx]]
    conf = np.max(y_pred[idx]) * 100  # highest class probability, in percent

    axes[i].imshow(img)

    if true_label == pred_label:
        color = 'green'
        title = f"OK: {pred_label[:10]}\n({conf:.0f}%)"
        correct += 1
    else:
        color = 'red'
        title = f"Pred: {pred_label[:8]}\nTrue: {true_label[:8]}"

    axes[i].set_title(title, fontsize=9, color=color)
    axes[i].axis('off')

# Blank the panels left unused when fewer than 16 samples are available
for ax in axes[n_samples:]:
    ax.axis('off')

# Guard the percentage against an empty test set
pct_correct = correct / n_samples * 100 if n_samples else 0.0
plt.suptitle(f'Predictions: {correct}/{n_samples} correct ({pct_correct:.0f}%)', fontsize=14)
plt.tight_layout()
plt.savefig('predictions.png', dpi=150)
plt.show()

In [None]:
rule = "="*70
print("\n" + rule)
print("RESUME FINAL")
print(rule)

# Recap of dataset size, split, architecture and accuracies
summary = f"""
DATASET:
   - Images: {len(images)}
   - Classes: {NUM_CLASSES}
   - Size: {IMG_SIZE}x{IMG_SIZE}

SPLIT:
   - Train: {len(X_train)} (80%)
   - Test: {len(X_test)} (20%)

MODEL:
   - Conv2D(32) -> Conv2D(64) -> Conv2D(128) -> Dense(512)
   - Parameters: {model.count_params():,}

RESULTS:
   - Train Accuracy: {train_acc*100:.2f}%
   - Test Accuracy: {test_acc*100:.2f}%
"""
print(summary)

# Verdict: first threshold (highest first) that the test accuracy reaches
verdict_table = (
    (0.80, "EXCELLENT!"),
    (0.60, "BON!"),
    (0.40, "MOYEN - peut etre ameliore"),
)
verdict = "FAIBLE - verifiez les donnees"
for threshold, message in verdict_table:
    if test_acc >= threshold:
        verdict = message
        break
print(verdict)

print(rule)

In [None]:
import numpy as np
import cv2
from pathlib import Path
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import warnings
# Install a global 'ignore' filter: from here on every Python warning is
# suppressed, including deprecation notices raised by the imported libraries.
warnings.filterwarnings('ignore')

# Report the TensorFlow version in use
print(f"TensorFlow version: {tf.__version__}")

In [None]:
# Source folder
SOURCE_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\final_clean_dataset")

# If it holds no PNG files, fall back to the stage-2 inter-morph folder
# NOTE(review): only "*.png" is checked here although "*.jpg" files are also
# loaded below, so a JPEG-only folder triggers the fallback - confirm intent.
if not list(SOURCE_DIR.glob("*.png")):
    SOURCE_DIR = Path(r"C:\Users\marwa\OneDrive\Desktop\moprh\2stage_morphed_database_HQ\stage2_inter_morphs")

files = list(SOURCE_DIR.glob("*.png")) + list(SOURCE_DIR.glob("*.jpg"))

print("="*60)
print("ANALYSE DU DATASET SOURCE")
print("="*60)
print(f"Dossier: {SOURCE_DIR}")
print(f"Images: {len(files)}")

# Group the files by person.
# NOTE: `persons` maps each name to a list of paths here, whereas the earlier
# diagnostic cell bound the same name to integer counts.
persons = defaultdict(list)
for f in files:
    name = f.stem
    # "<person>_intra_..." / "<person>_inter_..." -> "<person>";
    # "<person>_<digits>" -> "<person>"; otherwise the whole stem is the label
    if "_intra_" in name:
        person = name.split("_intra_")[0]
    elif "_inter_" in name:
        person = name.split("_inter_")[0]
    else:
        parts = name.rsplit("_", 1)
        person = parts[0] if len(parts) == 2 and parts[1].isdigit() else name
    persons[person].append(f)

print(f"Personnes detectees: {len(persons)}")

# Show the 20 people with the most images, largest first
print("\nDistribution (top 20):")
for person, files_list in sorted(persons.items(), key=lambda x: -len(x[1]))[:20]:
    print(f"   {person}: {len(files_list)} images")