#### Necessary imports

In [2]:
# For necessary data processing and calculations
import numpy as np
import pandas as pd

# For reading and writing files
import json
import pickle
import sys
import os
import glob
import shutil
import io
from io import BytesIO
from pathlib import Path
import hashlib

# For image processing
from PIL import Image, UnidentifiedImageError
import face_recognition
import dlib
import cv2
from insightface.app import FaceAnalysis

# For face-embedding calculation
from keras_facenet import FaceNet as FN

# For machine learning
import tensorflow as tf

# For web scraping
import html
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse

# For tracking programming progress
from tqdm.notebook import tqdm
import time

  check_for_updates()


In [18]:
# Show full cell contents (the image URLs are long) instead of truncating them.
# The fully-qualified option key is used rather than the abbreviated pattern.
pd.set_option('display.max_colwidth', None)

# UK data collection for train/test image scraping
This notebook contains the scraping methods that obtain image references from three different search engines (DuckDuckGo, Bing and Ecosia).

##### Set-up web driver

In [4]:
# Start a Firefox WebDriver session and close it again right away.
# NOTE(review): presumably a driver smoke test; the scraper functions in this
# notebook drive Safari, not Firefox — confirm which browser is intended.
wd = webdriver.Firefox()
wd.quit()

### Define methods 
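`get_image_links_{engine}` (`ddg`, `bing`, `ecosia`): use the automated web browser to retrieve the image links returned for a search query <br>
`download_image`: download the image behind a retrieved URL (defined in the training-data section further below, where `process_images` goes over all retrieved URLs)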

In [11]:
def get_image_links_ddg(search_query, num_results, headless=True):
    """
    DuckDuckGo image scraper

    Opens the DuckDuckGo image results for `search_query` in a Safari
    WebDriver session, scrolls down (at most 5 times) so more results load,
    and collects the `src` / `data-src` URL of up to `num_results` matching
    image elements.

    Parameters
    ----------
    search_query : str
        Free-text query (in this notebook: a politician's name). It is
        URL-encoded before being placed in the query string.
    num_results : int
        Maximum number of image links to return.
    headless : bool, default True
        When True, '--headless' is added to the driver options.
        NOTE(review): confirm that safaridriver honours this argument.

    Returns
    -------
    tuple[list, list]
        `(politician_array, links)`. On success `politician_array` repeats
        `search_query` once per link. If an exception occurs while scraping,
        `([search_query], [])` is returned, so the two lists differ in length;
        callers should check `links` before extending parallel lists.
    """
    # Local import: the notebook's import cell only brings in `urlparse`.
    from urllib.parse import quote_plus

    # Encode the query so characters such as '&', '#' or '+' cannot break the URL
    ddg_search_url = f'https://duckduckgo.com/?q={quote_plus(search_query)}&iax=images&ia=images'
    image_selector = 'img[data-src], img[src*="external-content.duckduckgo.com"]'

    options = webdriver.SafariOptions()
    if headless:
        options.add_argument('--headless')

    driver = webdriver.Safari(options=options)

    try:
        print(f"Fetching DuckDuckGo results for: {search_query}")
        driver.get(ddg_search_url)
        time.sleep(2)

        # Scroll
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempts = 0

        while scroll_attempts < 5:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Stop when the page no longer grows (bottom reached)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

            # Stop as soon as enough image elements are present
            image_elements = driver.find_elements(By.CSS_SELECTOR, image_selector)
            if len(image_elements) >= num_results:
                break
            scroll_attempts += 1

        # Find all image elements
        image_elements = driver.find_elements(By.CSS_SELECTOR, image_selector)

        # Extract src, falling back to data-src
        links = []
        for img in image_elements[:num_results]:
            src = img.get_attribute('src') or img.get_attribute('data-src')
            if src:
                # Protocol-relative URL -> absolute https URL
                if src.startswith('//'):
                    src = 'https:' + src
                links.append(src)

        politician_array = [search_query] * len(links)

        print(f"Found {len(links)} images for {search_query}")
        return politician_array, links

    except Exception as e:
        print(f"An error occurred while scraping DuckDuckGo for {search_query}: {str(e)}")
        return [search_query], []

    finally:
        # Always release the browser session, also on error
        driver.quit()

In [23]:
def get_image_links_bing(search_query, num_results, max_retries=3, headless=True):
    """
    Bing image scraper

    Opens the Bing image results for `search_query` in a Safari WebDriver
    session, scrolls down (at most 5 times) so more results load, and collects
    the `src` URL of up to `num_results` `img.mimg` elements, skipping inline
    `data:` sources. A fresh driver is started for every attempt.

    Parameters
    ----------
    search_query : str
        Free-text query (in this notebook: a politician's name). It is
        URL-encoded before being placed in the query string.
    num_results : int
        Maximum number of image links to return.
    max_retries : int, default 3
        Number of attempts; the pause between attempts grows by 5 seconds
        each time (5 s, 10 s, ...).
    headless : bool, default True
        When True, '--headless' is added to the driver options.
        NOTE(review): confirm that safaridriver honours this argument.

    Returns
    -------
    tuple[list, list]
        `(politician_array, links)`. On success `politician_array` repeats
        `search_query` once per link. When every attempt fails or yields no
        links, `([search_query], [])` is returned (lists of different length).
    """
    # Local import: the notebook's import cell only brings in `urlparse`.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and also escapes '&', '#', ...
    bing_search_url = f'https://www.bing.com/images/search?q={quote_plus(search_query)}'
    # One selector for both counting and extraction, so the scroll loop
    # does not stop early on elements that have no usable src yet
    image_selector = 'img.mimg[src]:not([src^="data:"])'

    options = webdriver.SafariOptions()
    if headless:
        options.add_argument('--headless')

    for attempt in range(max_retries):
        driver = None
        try:
            driver = webdriver.Safari(options=options)
            driver.get(bing_search_url)
            print(f"Attempt {attempt + 1}/{max_retries}: Fetching Bing results for '{search_query}'")

            time.sleep(2)

            # Scroll
            last_height = driver.execute_script("return document.body.scrollHeight")
            images_collected = 0
            scroll_attempts = 0

            while scroll_attempts < 5 and images_collected < num_results:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Stop when the page no longer grows (bottom reached)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

                # Count the usable image elements loaded so far
                image_elements = driver.find_elements(By.CSS_SELECTOR, image_selector)
                images_collected = len(image_elements)
                scroll_attempts += 1

            # Extract image URLs
            image_elements = driver.find_elements(By.CSS_SELECTOR, image_selector)

            links = []
            for img in image_elements[:num_results]:
                src = img.get_attribute('src')
                if src and not src.startswith('data:'):
                    if src.startswith('//'):
                        # Protocol-relative URL: only the scheme is missing
                        src = f'https:{src}'
                    elif src.startswith('/'):
                        # Site-relative URL: prefix the Bing host
                        src = f'https://www.bing.com{src}'
                    links.append(src)

            if not links:
                print(f"No image links found for '{search_query}' (attempt {attempt + 1})")
                if attempt < max_retries - 1:
                    time.sleep(5 * (attempt + 1))
                    continue
                return [search_query], []

            politician_array = [search_query] * len(links)
            print(f"Successfully collected {len(links)} images for '{search_query}'")
            return politician_array, links

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for '{search_query}': {str(e)}")
            if attempt < max_retries - 1:
                wait_time = 5 * (attempt + 1)
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)

        finally:
            # Runs on return, continue and exception alike: never leak a browser
            if driver:
                driver.quit()

    return [search_query], []

In [30]:
def get_image_links_ecosia(search_query, num_results, max_retries=3, headless=True):
    """
    Ecosia image scraper

    Opens the Ecosia image results for `search_query` in a Safari WebDriver
    session, waits (up to 10 s) for the first result wrapper, scrolls down
    (at most 5 times) so more results load, and collects one link per result:
    the `href` of the result anchor, or the `src` of the result image when the
    anchor has no `href`. A fresh driver is started for every attempt.

    Parameters
    ----------
    search_query : str
        Free-text query (in this notebook: a politician's name). It is
        URL-encoded before being placed in the query string.
    num_results : int
        Maximum number of image links to return.
    max_retries : int, default 3
        Number of attempts; the pause between attempts grows by 5 seconds
        each time (5 s, 10 s, ...).
    headless : bool, default True
        When True, '--headless' is added to the driver options.
        NOTE(review): confirm that safaridriver honours this argument.

    Returns
    -------
    tuple[list, list]
        `(politician_array, links)`. On success `politician_array` repeats
        `search_query` once per link. When every attempt fails or yields no
        links, `([search_query], [])` is returned (lists of different length).
    """
    # Local import: the notebook's import cell only brings in `urlparse`.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and also escapes '&', '#', ...
    ecosia_search_url = f'https://www.ecosia.org/images?q={quote_plus(search_query)}'
    wrapper_selector = '.image-result__link-wrapper'

    options = webdriver.SafariOptions()
    if headless:
        options.add_argument('--headless')

    for attempt in range(max_retries):
        driver = None
        try:
            driver = webdriver.Safari(options=options)
            print(f"Attempt {attempt + 1}/{max_retries}: Fetching Ecosia results for '{search_query}'")

            driver.set_page_load_timeout(30)
            driver.get(ecosia_search_url)

            # Wait until at least one result wrapper is present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wrapper_selector))
            )

            # Scroll
            last_height = driver.execute_script("return document.body.scrollHeight")
            images_collected = 0
            scroll_attempts = 0

            while scroll_attempts < 5 and images_collected < num_results:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Stop when the page no longer grows (bottom reached)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

                # Count the number of results loaded so far
                image_elements = driver.find_elements(By.CSS_SELECTOR, wrapper_selector)
                images_collected = len(image_elements)
                scroll_attempts += 1

            # Extract href (preferred) or src (fallback) per result
            image_wrappers = driver.find_elements(By.CSS_SELECTOR, wrapper_selector)

            links = []
            for wrapper in image_wrappers[:num_results]:
                try:
                    # First: href
                    link = wrapper.find_element(By.CSS_SELECTOR, 'a.image-result__link')
                    href = link.get_attribute('href')

                    # Fallback: src
                    if not href:
                        img = wrapper.find_element(By.CSS_SELECTOR, 'img.image-result__image')
                        href = img.get_attribute('src')

                    if href:
                        links.append(href)
                except Exception as e:
                    # One broken result must not abort the whole page
                    print(f"Warning: Couldn't extract link from one result: {str(e)}")
                    continue

            if not links:
                print(f"No image links found for '{search_query}' (attempt {attempt + 1})")
                if attempt < max_retries - 1:
                    time.sleep(5 * (attempt + 1))
                    continue
                return [search_query], []

            politician_array = [search_query] * len(links)
            print(f"Successfully collected {len(links)} images for '{search_query}'")
            return politician_array, links

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for '{search_query}': {str(e)}")
            if attempt < max_retries - 1:
                wait_time = 5 * (attempt + 1)
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)

        finally:
            # Runs on return, continue and exception alike: never leak a browser
            if driver:
                driver.quit()

    return [search_query], []

### Create database with image links
Create a list of politician names and set a limit on the number of image links to be extracted. <br>
For the UK, this list is based on: https://en.wikipedia.org/wiki/2024_United_Kingdom_general_election


In [9]:
# Politician names used as search queries; trailing comments give party name / number of seats
uk_politicians_24 = [   # Party name / #seats
    'Keir Starmer',     # Labour / 411
    'Rishi Sunak',      # Conservative / 121
    'Nigel Farage',     # Reform UK / 5
    'Ed Davey',         # Liberal Democrats / 72
    'Carla Denyer',     # Green Party of England and Wales / 4
    'Adrian Ramsay',    # Green Party of England and Wales / 4
    'John Swinney',     # Scottish National Party / 9
    'Mary Lou McDonald',# Sinn Féin / 7
    'George Galloway',  # Workers Party / 0 (new)
    'Rhun ap Iorwerth', # Plaid Cymru / 4
    'Gavin Robinson',   # Democratic Unionist / 5
    'Naomi Long',       # Alliance / 1
    'Doug Beattie',     # Ulster Unionist / 1
    'Patrick Harvie',   # Scottish Greens / 0
    'Lorna Slater',     # Scottish Greens / 0
    'Colum Eastwood',   # Social Democratic & Labour / 2
    'Jim Allister'      # Traditional Unionist Voice / 1
    ]

# Maximum number of image links requested per politician from each search engine
num_results = 125
# Parallel accumulators filled by the DuckDuckGo loop (one entry per image link)
image_link_list = []
politician_list = []

#### DuckDuckGo image links

In [None]:
# Collect DuckDuckGo image links for every politician.
# NOTE: the accumulators are created in an earlier cell, so re-running this
# cell without re-running that one appends duplicate entries.
for p in uk_politicians_24:
    start_time = time.time()

    politician_array_ddg, image_links_ddg = get_image_links_ddg(p, num_results, headless=False)

    elapsed_time = time.time() - start_time
    minutes, seconds = divmod(elapsed_time, 60)

    print(f'Completed the extraction of {len(image_links_ddg)} for politician {p} in {int(minutes)} minutes and {seconds:.2f} seconds')

    # On failure the scraper returns ([p], []): extending both lists with that
    # would leave them with different lengths and break the DataFrame build.
    if image_links_ddg:
        image_link_list.extend(image_links_ddg)
        politician_list.extend(politician_array_ddg)
    else:
        print(f"Warning: No images found for {p}")

Fetching DuckDuckGo results for: Keir Starmer
Found 125 images for Keir Starmer
Completed the extraction of 125 for politician Keir Starmer in 0 minutes and 5.63 seconds
Fetching DuckDuckGo results for: Rishi Sunak
Found 125 images for Rishi Sunak
Completed the extraction of 125 for politician Rishi Sunak in 0 minutes and 5.56 seconds
Fetching DuckDuckGo results for: Nigel Farage
Found 125 images for Nigel Farage
Completed the extraction of 125 for politician Nigel Farage in 0 minutes and 5.63 seconds
Fetching DuckDuckGo results for: Ed Davey
Found 125 images for Ed Davey
Completed the extraction of 125 for politician Ed Davey in 0 minutes and 5.61 seconds
Fetching DuckDuckGo results for: Carla Denyer
Found 125 images for Carla Denyer
Completed the extraction of 125 for politician Carla Denyer in 0 minutes and 5.59 seconds
Fetching DuckDuckGo results for: Adrian Ramsay
Found 125 images for Adrian Ramsay
Completed the extraction of 125 for politician Adrian Ramsay in 0 minutes and 5.62 

In [13]:
# Both lists become columns of the same DataFrame, so they must stay in sync
assert len(politician_list) == len(image_link_list), "politician_list and image_link_list differ in length"
print(len(politician_list), len(image_link_list))

2125 2125


In [35]:
# One row per DuckDuckGo image link, tagged with the engine it came from
df_ddg = pd.DataFrame(
    {
        'politician': politician_list,
        'img_link': image_link_list,
    }
).assign(engine='ddg')
df_ddg['img_link'].tail(5)

2120    https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse4.mm.bing.net%2Fth%2Fid%2FOIP.tkI6RR-3JKkzhB7Gdp2O6AHaEM%3Fpid%3DApi&f=1&ipt=89abd46d271323ae4748d347037aab5e463efccbb51d33886b829c4dbff33128&ipo=images
2121    https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse2.mm.bing.net%2Fth%2Fid%2FOIP.vanofBJQhNrJfQLjVpUfvwHaFj%3Fpid%3DApi&f=1&ipt=bcb97ef62a9fca75fb1657ac6911bc3d9381724112c7b4b1333572d935cf11cb&ipo=images
2122    https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse1.mm.bing.net%2Fth%2Fid%2FOIP.CbAua9lU7aVBZ-4CRIG5YAHaJ4%3Fpid%3DApi&f=1&ipt=e371835a42090f07f0c17c416a297010da6b30d16f64c7cf05587e68a8cea824&ipo=images
2123    https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse4.mm.bing.net%2Fth%2Fid%2FOIP.785cSPtZs2i8l9Lk9lWJiwHaFb%3Fpid%3DApi&f=1&ipt=b7bd8144e50a310298fe222e8ea397fb06fc3d966ce1c296b2cb48f37f1ff4d1&ipo=images
2124    https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse4.mm.bing.net%2Fth%2Fid%2FOIP.8oD

#### Bing image links

In [24]:
# Fresh, independent accumulators for the Bing results (links and matching names)
image_link_list_bing, politician_list_bing = [], []

In [None]:
# Collect Bing image links for every politician.
for p in uk_politicians_24:
    start_time = time.time()
    # Reset per iteration so a failure never reports the previous politician's links
    image_links_bing = []

    try:
        politician_array_bing, image_links_bing = get_image_links_bing(
            p,
            num_results,
            headless=False
        )
    except Exception as e:
        print(f"Critical error processing {p}: {str(e)}")

    # Timing is computed outside the try so the summary line below is always valid
    elapsed_time = time.time() - start_time
    minutes, seconds = divmod(elapsed_time, 60)

    if image_links_bing:
        print(f"Success: {len(image_links_bing)} images for {p} in {int(minutes)}m {seconds:.1f}s")
        image_link_list_bing.extend(image_links_bing)
        politician_list_bing.extend(politician_array_bing)
    else:
        # Loop variable is `p`; the former `{politician}` was an undefined name
        print(f"Warning: No images found for {p}")

    print(f'Completed the extraction of {len(image_links_bing)} for politician {p} in {int(minutes)} minutes and {seconds:.2f} seconds')

Attempt 1/3: Fetching Bing results for 'Keir Starmer'
Successfully collected 125 images for 'Keir Starmer'
Success: 125 images for Keir Starmer in 0m 10.4s
Completed the extraction of 125 for politician Keir Starmer in 0 minutes and 10.37 seconds
Attempt 1/3: Fetching Bing results for 'Rishi Sunak'
Successfully collected 125 images for 'Rishi Sunak'
Success: 125 images for Rishi Sunak in 0m 10.3s
Completed the extraction of 125 for politician Rishi Sunak in 0 minutes and 10.31 seconds
Attempt 1/3: Fetching Bing results for 'Nigel Farage'
Successfully collected 125 images for 'Nigel Farage'
Success: 125 images for Nigel Farage in 0m 10.5s
Completed the extraction of 125 for politician Nigel Farage in 0 minutes and 10.52 seconds
Attempt 1/3: Fetching Bing results for 'Ed Davey'
Successfully collected 125 images for 'Ed Davey'
Success: 125 images for Ed Davey in 0m 10.3s
Completed the extraction of 125 for politician Ed Davey in 0 minutes and 10.33 seconds
Attempt 1/3: Fetching Bing resul

In [26]:
# Both lists become columns of the same DataFrame, so they must stay in sync
assert len(image_link_list_bing) == len(politician_list_bing), "Bing link and politician lists differ in length"
print(len(image_link_list_bing), len(politician_list_bing))

2050 2050


In [36]:
# One row per Bing image link, tagged with the engine it came from
df_bing = pd.DataFrame(
    {
        'politician': politician_list_bing,
        'img_link': image_link_list_bing,
    }
).assign(engine='bing')
df_bing['img_link'].tail(5)

2045    https://th.bing.com/th/id/OIP.lJSznue74RkLp3ZtLAfGQgHaFc?w=233&h=180&c=7&r=0&o=7&dpr=2&pid=1.7&rm=3
2046    https://th.bing.com/th/id/OIP.0yua4KKGazN8zPJHTl-aGgHaEK?w=274&h=180&c=7&r=0&o=7&dpr=2&pid=1.7&rm=3
2047    https://th.bing.com/th/id/OIP.Tp_KgxyxCdr33NYoLR0cEgHaL4?w=115&h=180&c=7&r=0&o=7&dpr=2&pid=1.7&rm=3
2048    https://th.bing.com/th/id/OIP.wwBNFiaYvqM4r97exJbMrgHaF2?w=192&h=180&c=7&r=0&o=7&dpr=2&pid=1.7&rm=3
2049    https://th.bing.com/th/id/OIP.1rNWIlLtumm2FF87MWghOQHaEK?w=272&h=180&c=7&r=0&o=7&dpr=2&pid=1.7&rm=3
Name: img_link, dtype: object

#### Ecosia image links

In [31]:
# Fresh, independent accumulators for the Ecosia results (links and matching names)
image_link_list_ecosia, politician_list_ecosia = [], []

In [32]:
# Collect Ecosia image links for every politician.
for p in uk_politicians_24:
    start_time = time.time()

    try:
        politician_array_ecosia, image_links_ecosia = get_image_links_ecosia(
            p,
            num_results,
            headless=False
        )

        elapsed = time.time() - start_time
        mins, secs = divmod(elapsed, 60)

        if image_links_ecosia:
            # Report this iteration's own timing (`mins`/`secs`), not the
            # `minutes`/`seconds` variables left over from another cell
            print(f"Success: {len(image_links_ecosia)} images for {p} in {int(mins)}m {secs:.1f}s")
            image_link_list_ecosia.extend(image_links_ecosia)
            politician_list_ecosia.extend(politician_array_ecosia)
        else:
            print(f"Warning: No images found for {p}")

    except Exception as e:
        print(f"Critical error processing {p}: {str(e)}")

Attempt 1/3: Fetching Ecosia results for 'Keir Starmer'
Successfully collected 125 images for 'Keir Starmer'
Success: 125 images for Keir Starmer in 0m 10.4s
Attempt 1/3: Fetching Ecosia results for 'Rishi Sunak'
Successfully collected 125 images for 'Rishi Sunak'
Success: 125 images for Rishi Sunak in 0m 10.4s
Attempt 1/3: Fetching Ecosia results for 'Nigel Farage'
Successfully collected 125 images for 'Nigel Farage'
Success: 125 images for Nigel Farage in 0m 10.4s
Attempt 1/3: Fetching Ecosia results for 'Ed Davey'
Successfully collected 125 images for 'Ed Davey'
Success: 125 images for Ed Davey in 0m 10.4s
Attempt 1/3: Fetching Ecosia results for 'Carla Denyer'
Successfully collected 125 images for 'Carla Denyer'
Success: 125 images for Carla Denyer in 0m 10.4s
Attempt 1/3: Fetching Ecosia results for 'Adrian Ramsay'
Successfully collected 125 images for 'Adrian Ramsay'
Success: 125 images for Adrian Ramsay in 0m 10.4s
Attempt 1/3: Fetching Ecosia results for 'John Swinney'
Successf

In [33]:
# Both lists become columns of the same DataFrame, so they must stay in sync
assert len(image_link_list_ecosia) == len(politician_list_ecosia), "Ecosia link and politician lists differ in length"
print(len(image_link_list_ecosia), len(politician_list_ecosia))

2125 2125


In [37]:
# One row per Ecosia image link, tagged with the engine it came from
df_ecosia = pd.DataFrame(
    {
        'politician': politician_list_ecosia,
        'img_link': image_link_list_ecosia,
    }
).assign(engine='ecosia')
df_ecosia['img_link'].tail(5)

2120                                                https://www.irishnews.com/resizer/v2/BI2CBC5MBNA2TC5EZAOSG2NJSU.jpg?smart=true&auth=3e24a792c0a60599ab7b446f2d6674c793f89fe43d385b5075bcd42f70b52ebc&width=1200&height=630
2121                                                https://www.irishnews.com/resizer/v2/VYM2AWE3A5IJNBALYLQC63YOGM.jpg?smart=true&auth=7c13aba9e15507b0aee0f7bff3f94cc972dcd03a0d47f6c8b3bdc56a69fa68f0&width=1200&height=630
2122    https://media.gettyimages.com/id/832883764/photo/jim-allister-the-democratic-unionist-partys-sole-mep-announces-his-resignation-from-the-dup-at.jpg?s=612x612&w=gi&k=20&c=7NfUiSRRV3G27bFJWW9-b_J3PPxugYb7tc_nFabrlG0=
2123    https://media.gettyimages.com/id/832883828/photo/jim-allister-the-democratic-unionist-partys-sole-mep-announces-his-resignation-from-the-dup-at.jpg?s=612x612&w=gi&k=20&c=JTnFgtUsXoGuRohUBrzEZa-Yv8QgCVUx1oafZEicMKE=
2124                                                                                                        

### Combine DuckDuckGo, Bing and Ecosia dataframes

In [38]:
# Stack the three per-engine frames, order rows by politician then engine,
# and renumber the index from 0
uk_imgref_df = (
    pd.concat([df_ecosia, df_bing, df_ddg], axis=0, ignore_index=True)
    .sort_values(by=['politician', 'engine'])
    .reset_index(drop=True)
)

In [51]:
# Persist the combined link table as pickle and CSV.
# The target directory is created first so a fresh checkout does not fail here.
uk_output_dir = Path('datasets/train_test/UK')
uk_output_dir.mkdir(parents=True, exist_ok=True)
uk_imgref_df.to_pickle(uk_output_dir / 'uk_searchengine_imgref_results.pkl')
uk_imgref_df.to_csv(uk_output_dir / 'uk_searchengine_imgref_results.csv')

### Collect training data

In [45]:
# Build the InsightFace analysis object from the 'buffalo_l' model pack; `app`
# is used as a module-level global by process_images() for face detection.
# CUDA is requested alongside CPU; the recorded output of this cell shows that
# only CPUExecutionProvider was applied on the machine that ran it.
app = FaceAnalysis(
    name='buffalo_l',
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
)
# NOTE(review): ctx_id=0 presumably selects the first device and det_size the
# detector input resolution — confirm against the InsightFace documentation.
app.prepare(ctx_id=0, det_size=(640, 640))



Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None

In [None]:
def generate_filename(url, p, face_index):
    """
    Build the JPEG file name for one face crop.

    The name is `<hash>_<p>_<face_index>.jpg`, where `<hash>` is the first
    8 hex characters of the MD5 digest of `url`.
    """
    digest = hashlib.md5(url.encode()).hexdigest()
    short_hash = digest[:8]
    return "{}_{}_{}.jpg".format(short_hash, p, face_index)

def download_image(url, timeout=15):
    """
    Download `url` and decode it into an OpenCV colour image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : int or float, default 15
        Timeout passed to `requests.get`.

    Returns
    -------
    numpy.ndarray or None
        The image decoded with `cv2.IMREAD_COLOR`, or None when the request
        fails, the response's Content-Type does not contain 'image', or the
        payload cannot be decoded. Errors are printed, never raised.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # The response is opened with stream=True; the `with` block makes sure
        # the connection is released even when one of the checks below raises
        # before the body has been read.
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Verify that the ref leads to an image
            if 'image' not in response.headers.get('Content-Type', ''):
                raise ValueError("Oops... no image found here!")

            payload = response.content

        # View the raw bytes as uint8 without an intermediate bytearray copy
        image = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)

        if image is None:
            raise ValueError("Failed to decode image")

        return image
    except Exception as e:
        print(f"Error downloading {url[:50]}...: {str(e)}")
        return None

def process_images(df, output_base, min_det_score=0.6, max_attempts=3):
    """
    Download every referenced image, detect faces and save one JPEG crop per face.

    For each row of `df` the image behind `img_link` is downloaded (with
    retries), faces are detected with the module-level InsightFace `app`,
    and every face above the confidence threshold is cropped with padding
    and written to `<output_base>/<sanitised politician name>/`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'politician' and 'img_link'.
    output_base : str
        Root directory; one sub-directory per politician is created in it.
    min_det_score : float, default 0.6
        Faces whose `det_score` is not above this value are discarded.
    max_attempts : int, default 3
        Number of download attempts per image, with a 1 s pause in between.

    Returns
    -------
    None
        Results are written to disk; failures are printed and skipped.
    """
    pbar = tqdm(df.iterrows(), total=len(df), desc="Processing training images")

    for _, row in pbar:
        politician = row['politician']
        img_url = row['img_link']

        # Directory name: every non-alphanumeric character becomes "_"
        politician_safe = "".join(c if c.isalnum() else "_" for c in politician)
        politician_dir = os.path.join(output_base, politician_safe)
        os.makedirs(politician_dir, exist_ok=True)

        # Download with retry; pause only between attempts, not after the last
        img = None
        for attempt in range(max_attempts):
            img = download_image(img_url)
            if img is not None:
                break
            if attempt < max_attempts - 1:
                time.sleep(1)

        if img is None:
            continue

        # Detect faces using the Insightface app and keep the confident ones
        faces = [face for face in app.get(img) if face.det_score > min_det_score]

        if not faces:
            continue

        img_h, img_w = img.shape[0], img.shape[1]

        # Process faces
        for i, face in enumerate(faces):
            # Bounding box as plain Python ints
            x1, y1, x2, y2 = (int(v) for v in face.bbox.astype(int))
            w, h = x2 - x1, y2 - y1

            # A degenerate box has nothing to crop (and would divide by zero below)
            if w <= 0 or h <= 0:
                continue

            # Padding is 30% of the box, reduced for large faces so that the
            # pad along the longer side never exceeds about 100 px
            padding_factor = min(0.3, 100 / max(w, h))
            pad_x = int(padding_factor * w)
            pad_y = int(padding_factor * h)

            # Clamp the padded box to the image borders
            x1 = max(0, x1 - pad_x)
            y1 = max(0, y1 - pad_y)
            x2 = min(img_w, x2 + pad_x)
            y2 = min(img_h, y2 + pad_y)

            # Crop
            face_crop = img[y1:y2, x1:x2]
            if face_crop.size == 0:
                continue

            # Generate filename and save
            face_filename = generate_filename(img_url, politician, i)
            save_path = os.path.join(politician_dir, face_filename)

            try:
                saved = cv2.imwrite(save_path, face_crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
            except Exception as e:
                print(f"Error saving {save_path}: {str(e)}")
                continue

            # cv2.imwrite signals most failures through its return value
            if not saved:
                print(f"Error saving {save_path}: cv2.imwrite returned False")

NOTE: this function is run only once. After it has run and the facial images have been collected, a manual correction is applied: using drag-and-drop, the image samples are moved into the folders that carry the correct label (here: the politician's name). <br>
DO NOT RUN THIS FUNCTION AGAIN; the faces are already collected and corrected!

In [1]:
#process_images(, )