In [1]:
import cv2
import numpy as np
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq

from ast import literal_eval
from typing import List, Tuple
from deepface import DeepFace
from tqdm import tqdm
import sqlite3
import time

## Compute embeddings

Train/test set

In [2]:
classification_path = '../b__data-collection-with-web-scraping/datasets/news/UK-NL__news_faces'
countries = ['NL', 'UK']

In [3]:
# For Deepface model choice
embedding_model = "VGG-Face"
embedding_model2 = "Facenet512"
embedding_model3 = "SFace"

In [9]:
img_path = '../b__data-collection-with-web-scraping/datasets/train_test/NL/Caroline_van_der_Plas/0d1e1220_Caroline van der Plas_0.jpg'
dummy_img = cv2.imread(img_path)
if dummy_img is None:
    dummy_img = (255 * np.random.rand(224, 224, 3)).astype(np.uint8)
    cv2.imwrite(img_path, dummy_img)

embedding_objs = DeepFace.represent(img_path, model_name="Facenet512")
embedding_vector = embedding_objs[0]['embedding']
print(f"The dimensionality of the {embedding_model2} embedding vector is: {len(embedding_vector)}")

The dimensionality of the Facenet512 embedding vector is: 512


In [None]:
#def simple_convert(x):
    """
    Ensures all embeddings are numpy arrays in their correct dimensionality
    """
    #return np.array(literal_eval(str(x).strip("[]")))

In [4]:
def compute_face_embeddings(base_path, output_path, country, model):
    """
    Computes face embeddings for a country-specific dataset, optimized for performance and correctness.
    """
    input_path = os.path.join(base_path, country)
    output_file = os.path.join(output_path, f"{country}_{model}_news_embeddings.parquet")
    
    results_list = []
    
    outlets = [out for out in os.listdir(input_path) 
                 if os.path.isdir(os.path.join(input_path, out))]

    for outlet in outlets:
        outlet_path = os.path.join(input_path, outlet)
        if not os.path.isdir(outlet_path):
            print(f"Could not find a path at {outlet_path}...")
            continue
        
        for img_file in tqdm(os.listdir(outlet_path), desc=f"Processing {country}, outlet: {outlet}"):
            if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue
            
            img_path = os.path.join(outlet_path, img_file)

            try:
                embedding = DeepFace.represent(
                    img_path=img_path, 
                    model_name=model,
                    enforce_detection=False, 
                    detector_backend="opencv"
                )

                if embedding and len(embedding) > 0:
                    embedding = embedding[0]["embedding"]
                else:
                    embedding = None

            except Exception as e:
                print(f"Error processing {img_path}: {e}")
                embedding = None

            results_list.append({
                'country': country,
                'outlet': outlet,
                'image_path': img_path,
                f'embedding_{model}': embedding
            })
    
    if not results_list:
        print("No images were processed. No output file will be created.")
        return None
    
    df = pd.DataFrame(results_list)
    
    print(f"Processed {len(df)} images.")
    df.to_parquet(output_file)

    print(f".. & saved the embeddings to {output_file}!!")
    return df

### NL

In [7]:
compute_face_embeddings(
    base_path=classification_path,
    output_path='datasets',
    country="NL",
    model=embedding_model
)

Processing NL, outlet: NU: 100%|██████████| 10573/10573 [18:07<00:00,  9.72it/s]
Processing NL, outlet: NOS: 100%|██████████| 13345/13345 [24:16<00:00,  9.16it/s]


Processed 23916 images.
Saved embeddings to datasets/NL_VGG-Face_news_embeddings.csv and datasets/NL_VGG-Face_news_embeddings.pkl


Unnamed: 0,country,outlet,image_path,embedding_VGG-Face
0,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0032962336907719847, 0.0, 0...."
...,...,...,...,...
23911,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02506085..."
23912,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
23913,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.00400828309998943,..."
23914,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [11]:
compute_face_embeddings(
    base_path=classification_path,
    output_path='datasets',
    country="NL",
    model=embedding_model2
)

Processing NL, outlet: NU: 100%|██████████| 10573/10573 [14:48<00:00, 11.91it/s]
Processing NL, outlet: NOS: 100%|██████████| 13345/13345 [18:50<00:00, 11.80it/s]


Processed 23916 images.
.. & saved the embeddings to datasets/NL_Facenet512_news_embeddings.parquet!!


Unnamed: 0,country,outlet,image_path,embedding_Facenet512
0,NL,NU,../b__data-collection-with-web-scraping/datase...,"[1.0682164430618286, -0.748464822769165, -0.79..."
1,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.08321316540241241, -0.44820237159729004, -0..."
2,NL,NU,../b__data-collection-with-web-scraping/datase...,"[1.8530863523483276, 0.6035377383232117, 1.351..."
3,NL,NU,../b__data-collection-with-web-scraping/datase...,"[-0.3412712812423706, -0.12857837975025177, -0..."
4,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.41409143805503845, -0.16773490607738495, -0..."
...,...,...,...,...
23911,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.41583675146102905, -0.6303825378417969, 2.4..."
23912,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.43812406063079834, -0.3838798701763153, -0...."
23913,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.46805739402770996, 0.07853040099143982, -0...."
23914,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.6519553661346436, -0.6085307598114014, -0.1..."


In [5]:
compute_face_embeddings(
    base_path=classification_path,
    output_path='datasets',
    country="NL",
    model=embedding_model3
)

Processing NL, outlet: NU: 100%|██████████| 10573/10573 [02:51<00:00, 61.80it/s]
Processing NL, outlet: NOS: 100%|██████████| 13345/13345 [03:58<00:00, 56.02it/s]


Processed 23916 images.
.. & saved the embeddings to datasets/NL_SFace_news_embeddings.parquet!!


Unnamed: 0,country,outlet,image_path,embedding_SFace
0,NL,NU,../b__data-collection-with-web-scraping/datase...,"[-0.18348920345306396, 0.48804083466529846, 0...."
1,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.8692858815193176, -0.4954947829246521, 0.64..."
2,NL,NU,../b__data-collection-with-web-scraping/datase...,"[-1.5757060050964355, 0.5245978236198425, -1.2..."
3,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.41594991087913513, 0.8120556473731995, 0.28..."
4,NL,NU,../b__data-collection-with-web-scraping/datase...,"[0.3604530394077301, -0.11650972068309784, -0...."
...,...,...,...,...
23911,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.4304094612598419, 0.36411628127098083, -0.3..."
23912,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.8647322058677673, -0.06492351740598679, 0.2..."
23913,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.11627650260925293, 0.6066292524337769, -0.0..."
23914,NL,NOS,../b__data-collection-with-web-scraping/datase...,"[0.6450523734092712, -0.2024608701467514, -0.0..."


### UK

In [6]:
compute_face_embeddings(
    base_path=classification_path,
    output_path='datasets',
    country="UK",
    model=embedding_model
)

Processing UK, outlet: BBC: 100%|██████████| 58459/58459 [1:33:53<00:00, 10.38it/s]  
Processing UK, outlet: The Guardian: 100%|██████████| 66477/66477 [1:46:35<00:00, 10.39it/s]  


Processed 124934 images.
.. & saved the embeddings to datasets/UK_VGG-Face_news_embeddings.parquet!!


Unnamed: 0,country,outlet,image_path,embedding_VGG-Face
0,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
124929,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
124930,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
124931,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.02381529704791354, 0.0, 0.0, 0.0, 0.0,..."
124932,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
compute_face_embeddings(
    base_path=classification_path,
    output_path='datasets',
    country="UK",
    model=embedding_model2
)

Processing UK, outlet: BBC: 100%|██████████| 58459/58459 [1:21:56<00:00, 11.89it/s]
Processing UK, outlet: The Guardian: 100%|██████████| 66477/66477 [1:28:27<00:00, 12.52it/s]


Processed 124934 images.
.. & saved the embeddings to datasets/UK_Facenet512_news_embeddings.parquet!!


Unnamed: 0,country,outlet,image_path,embedding_Facenet512
0,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[-0.8497318029403687, 0.5972830653190613, 1.64..."
1,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[-0.08039365708827972, -0.31670698523521423, -..."
2,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[2.0386507511138916, 0.6402831673622131, -0.60..."
3,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.4088406562805176, 0.2692851722240448, -1.24..."
4,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.6954588890075684, -0.7732324004173279, -0.3..."
...,...,...,...,...
124929,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[-0.3406186103820801, -0.24740342795848846, -0..."
124930,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[0.27545300126075745, -0.11634090542793274, -0..."
124931,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[-1.2595386505126953, -0.08434528112411499, -0..."
124932,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[-0.32622385025024414, -0.9246070981025696, -0..."


In [6]:
compute_face_embeddings(
    base_path=classification_path,
    output_path='datasets',
    country="UK",
    model=embedding_model3
)

Processing UK, outlet: BBC: 100%|██████████| 58459/58459 [14:40<00:00, 66.37it/s]
Processing UK, outlet: The Guardian: 100%|██████████| 66477/66477 [15:35<00:00, 71.04it/s]


Processed 124934 images.
.. & saved the embeddings to datasets/UK_SFace_news_embeddings.parquet!!


Unnamed: 0,country,outlet,image_path,embedding_SFace
0,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.2584936022758484, 0.34092748165130615, 0.90..."
1,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.10981295257806778, -0.7946093678474426, 0.5..."
2,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.6044485569000244, 0.8303433060646057, -0.30..."
3,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[0.5933582186698914, 1.303056001663208, 1.2680..."
4,UK,BBC,../b__data-collection-with-web-scraping/datase...,"[1.4054539203643799, -0.21764147281646729, 0.8..."
...,...,...,...,...
124929,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[0.6175006628036499, -0.9750760197639465, -0.0..."
124930,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[-1.2949422597885132, 0.6245490312576294, -0.0..."
124931,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[-0.4016883969306946, -0.683716893196106, 0.75..."
124932,UK,The Guardian,../b__data-collection-with-web-scraping/datase...,"[-0.5594007968902588, 0.03218391537666321, 0.4..."
