In [5]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

import os
import shutil
from pathlib import Path

### Set VGG-Face configs

In [6]:
# Country configs (VGG-Face)
# Load the results table and keep only the rows for the VGG-Face embedder,
# the test split and the 'class' threshold setting.
configs = pd.read_parquet('../c__embeddings-model-development/datasets/NL-UK__train_dev_test_results_global_and_class_t.parquet')
vgg_test_class_rows = (
    configs['embedder'].eq('VGG-Face')
    & configs['split'].eq('test')
    & configs['threshold'].eq('class')
)
country_configs = configs[vgg_test_class_rows]

# SVC keyword arguments per country and embedder (currently the same for NL and UK).
# Each country gets its own freshly built dict, so editing one does not affect the other.
COUNTRY_CONFIGS = {
    country_code: {
        'VGG-Face': {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
    }
    for country_code in ('NL', 'UK')
}

# Per-class thresholds for both countries (VGG-Face)
# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this project.
COUNTRY_THRESHOLDS = pd.read_pickle('../c__embeddings-model-development/datasets/UK-NL_VGG-Face_country_class_thresholds.pkl')

In [7]:
# Display the per-class thresholds stored for the UK / VGG-Face combination
COUNTRY_THRESHOLDS['UK']['VGG-Face']

{np.int64(0): np.float64(0.525),
 np.int64(1): np.float64(0.7750000000000002),
 np.int64(2): np.float64(0.6250000000000001),
 np.int64(3): np.float64(0.6750000000000002),
 np.int64(4): np.float64(0.5750000000000001),
 np.int64(5): np.float64(0.525),
 np.int64(6): np.float64(0.5750000000000001),
 np.int64(7): np.float64(0.7500000000000002),
 np.int64(8): np.float64(0.525),
 np.int64(9): np.float64(0.5750000000000001),
 np.int64(10): np.float64(0.55),
 np.int64(11): np.float64(0.5),
 np.int64(12): np.float64(0.6000000000000001),
 np.int64(13): np.float64(0.525),
 np.int64(14): np.float64(0.7000000000000002),
 np.int64(15): np.float64(0.5),
 np.int64(16): np.float64(0.6000000000000001)}

### Set data structure with train/test embeddings

In [8]:
# Countries of interest for this project
COUNTRIES = ['NL', 'UK']

# Per-country container used to store data and facilitate analysis.
# Every country starts with empty slots and its own (unshared) 'embedders' dict.
COUNTRY_DATA = {
    country_code: {
        'embeddings': None,
        'Y': None,
        'label_encoder': None,
        'politician': None,
        'embedders': {},
    }
    for country_code in COUNTRIES
}

# Embedders to test
EMBEDDING_MODELS = ['VGG-Face']

# Names of the DataFrame columns that hold each embedder's vectors
EMBEDDING_COLS = [f'embedding_{embedder}' for embedder in EMBEDDING_MODELS]

# Sentinel label ids for faces that are rejected (unknown) or only a near miss (possible)
UNKNOWN_LABEL = 99
POSSIBLE_LABEL = 100

In [9]:
# Load and process data for each country
def create_data_structure(countries=COUNTRIES, country_data=COUNTRY_DATA, embedding_models=EMBEDDING_MODELS):
    """Load each country's embeddings parquet and fill ``country_data`` in place.

    For every country the file
    ``../b__data-collection-with-web-scraping/datasets/train_test/{country}_embeddings.parquet``
    is read, rows without a usable embedding are dropped, the ``politician``
    column is label-encoded, and the results are stored under
    ``country_data[country]``.

    Parameters
    ----------
    countries : iterable of str
        Country codes to load.
    country_data : dict
        Container that is mutated in place. The default is the module-level
        ``COUNTRY_DATA``, so calling this function updates that global. Entries
        for countries not yet present are created.
    embedding_models : list of str
        Embedder names; the column ``embedding_<name>`` is expected in the parquet.

    Returns
    -------
    dict
        The same ``country_data`` object, with per country: ``embeddings``
        (filtered DataFrame plus a ``politician_encoded`` column), ``Y`` (encoded
        labels), ``label_encoder``, ``politician`` (the encoder's classes) and
        ``embedders[model]['train']`` holding ``X``, ``y`` and ``politician``.
    """
    # Derive the columns from the argument rather than the global EMBEDDING_COLS,
    # so a custom ``embedding_models`` list is filtered and stored consistently.
    embedding_cols = [f'embedding_{model}' for model in embedding_models]

    for country in countries:
        # Load data
        embeddings = pd.read_parquet(f'../b__data-collection-with-web-scraping/datasets/train_test/{country}_embeddings.parquet')

        # Convert to numpy arrays
        for col in embedding_cols:
            # Filter out rows where the embedding is not a list or has an invalid dimension
            is_valid = embeddings[col].apply(lambda x: isinstance(x, (list, np.ndarray)) and len(x) > 1)
            # Explicit copy: assigning into a filtered slice triggers
            # SettingWithCopyWarning and may not write through.
            embeddings = embeddings[is_valid].copy()
            # Numpy mapping
            embeddings[col] = embeddings[col].map(np.array)

        # Encode labels
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(embeddings['politician'])
        embeddings['politician_encoded'] = y

        # Store in dictionary (create the entry if this country is new to the container)
        entry = country_data.setdefault(country, {})
        entry['embeddings'] = embeddings
        entry['Y'] = y
        entry['label_encoder'] = label_encoder
        entry['politician'] = label_encoder.classes_

        # Extract and store each embedding type with names
        embedders = entry.setdefault('embedders', {})
        for model, col_name in zip(embedding_models, embedding_cols):
            embedders[model] = {
                'train': {
                    'X': embeddings[col_name],
                    'y': embeddings['politician_encoded'].tolist(),
                    'politician': embeddings['politician'].tolist(),
                }
            }
    return country_data

In [10]:
# Build the per-country structure with the default arguments (reads one embeddings parquet per country)
country_data = create_data_structure()

In [11]:
# Show the first five politician names stored for the UK / VGG-Face training data
country_data['UK']['embedders']['VGG-Face']['train']['politician'][:5]

['Adrian_Ramsay',
 'Adrian_Ramsay',
 'Adrian_Ramsay',
 'Adrian_Ramsay',
 'Adrian_Ramsay']

### Classify news embeddings

In [12]:
def copy_classified_images_flex(country, embedding_model, classified_embedding_dir, image_dir):
    """Copy classified news images into one folder per predicted label.

    Reads ``{country}_{embedding_model}_classified_news_embeddings.parquet`` from
    ``classified_embedding_dir`` and, for every row whose ``final_label_name`` is
    not ``'UNKNOWN'``, copies the file at ``image_path`` to
    ``image_dir/<embedding_model>/<country>/<label>/``.

    Parameters
    ----------
    country : str
        Country code; used in the parquet file name and the output folder.
    embedding_model : str
        Embedder name; used in the parquet file name and the output folder.
    classified_embedding_dir : str
        Folder holding the classified-embeddings parquet (created if missing).
    image_dir : str
        Root folder for the copied images (created if missing).

    Returns
    -------
    None
        Missing sources and failed copies are reported with ``print`` and skipped.
    """
    # Paths
    country_embedding_path = os.path.join(classified_embedding_dir, f'{country}_{embedding_model}_classified_news_embeddings.parquet')
    model_country_dir = os.path.join(image_dir, embedding_model, country)

    # parents=True so nested or not-yet-existing roots do not raise FileNotFoundError.
    # The country folder itself is created here; the original created the model
    # folder twice and never the country folder.
    Path(classified_embedding_dir).mkdir(parents=True, exist_ok=True)
    Path(model_country_dir).mkdir(parents=True, exist_ok=True)

    # Read data
    classified_embeddings = pd.read_parquet(country_embedding_path)

    # Filter out UNKNOWN, create folders
    labelled = classified_embeddings[classified_embeddings['final_label_name'] != 'UNKNOWN']
    for label_name, group in labelled.groupby('final_label_name'):
        # Keep only characters that are safe in a folder name
        safe_label_name = "".join(c for c in label_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
        label_dir = os.path.join(model_country_dir, safe_label_name)

        # Create directory
        Path(label_dir).mkdir(parents=True, exist_ok=True)

        # Copy files
        for src_path in group['image_path']:
            # A null / non-string path would make os.path.exists raise TypeError
            if not isinstance(src_path, str) or not os.path.exists(src_path):
                print(f"Source file missing: {src_path}")
                continue

            dest_path = os.path.join(label_dir, os.path.basename(src_path))

            try:
                shutil.copy2(src_path, dest_path)
            except OSError as e:
                # Best effort: report and carry on with the remaining images
                print(f"Failed to copy {src_path}: {str(e)}")


In [13]:
def classify_news_embeddings(country, country_data, embedder, params, class_thresholds, UNKNOWN_LABEL, POSSIBLE_LABEL, tolerance=0.10):
    """Classify news-image embeddings for one country with an SVC and per-class thresholds.

    An SVC is fitted on the country's training embeddings, class probabilities are
    predicted for every news embedding, and the top-1 class is then accepted,
    flagged as "possible", or rejected as "unknown" by comparing its probability
    with the threshold stored for that class. A summary is printed.

    Parameters
    ----------
    country : str
        Key into ``country_data``, ``params`` and ``class_thresholds``; also part of
        the news-embeddings parquet file name.
    country_data : dict
        Reads ``[country]['embedders'][embedder]['train']['X']`` / ``['y']`` and
        ``[country]['label_encoder']``.
    embedder : str
        Embedder name; the news parquet must contain ``embedding_<embedder>`` and
        ``image_path`` columns.
    params : dict
        ``params[country][embedder]`` holds the keyword arguments passed to ``SVC``.
    class_thresholds : dict
        ``class_thresholds[country][embedder]`` maps class id -> probability threshold.
    UNKNOWN_LABEL, POSSIBLE_LABEL : int
        Label ids written to ``final_label`` for rejected and near-miss predictions.
    tolerance : float, default 0.10
        A prediction below its threshold but within ``tolerance`` of it is "possible".

    Returns
    -------
    pandas.DataFrame
        One row per news embedding: ``image_path``, the identifiers parsed from it
        (``outlet``, ``id``, ``id_unique``, ``face_i``), ``max_proba``, ``margin``,
        ``predicted_class``, ``final_label``, their ``*_name`` columns,
        ``class_threshold`` and the ``is_politician`` / ``is_possible`` /
        ``is_unknown`` flags.

    Raises
    ------
    ValueError
        If there are no news embeddings, or if ``UNKNOWN_LABEL`` / ``POSSIBLE_LABEL``
        coincide with an encoded class id.
    """

    # --------------------------------------------------
    # Load data
    # --------------------------------------------------
    news_embeddings = pd.read_parquet(
        f'../c__embeddings-model-development/datasets/{country}_{embedder}_news_embeddings.parquet'
    )
    if news_embeddings.empty:
        # np.vstack below would raise a less helpful ValueError for the same input
        raise ValueError(f"No news embeddings to classify for {country} / {embedder}")

    image_identifiers = news_embeddings['image_path'].str.extract(
            r'.*/(?:UK|NL)/(?:[^/]+/)?'     # Match country and optional subfolder
            r'(?P<outlet>[^_]+)'            # Outlet (now handles spaces)
            r'_(?P<id>[^_]+)'               # Article ID (alphanumeric)
            r'_(?P<id_unique>[^_]+)'        # Face ID (alphanumeric)
            r'_(?P<face_i>\d+)'             # Instance number (digits only)
            r'\.jpg$'
        )
    image_identifiers['outlet'] = image_identifiers['outlet'].str.replace(' ', '')

    # --------------------------------------------------
    # Label mapping (validated up front, before the model is fitted)
    # --------------------------------------------------
    le = country_data[country]['label_encoder']
    id_to_name = {int(i): n for i, n in enumerate(le.classes_)}
    clashing_ids = {UNKNOWN_LABEL, POSSIBLE_LABEL} & set(id_to_name)
    if clashing_ids:
        # Otherwise the sentinel entries would silently overwrite a real class name
        raise ValueError(
            f"UNKNOWN_LABEL/POSSIBLE_LABEL collide with encoded class id(s): {sorted(clashing_ids)}"
        )
    label_mapping = {**id_to_name, UNKNOWN_LABEL: "UNKNOWN", POSSIBLE_LABEL: "POSSIBLE"}

    train = country_data[country]['embedders'][embedder]['train']
    X_train = np.vstack(train['X'])
    y_train = np.array(train['y'])
    X_news = np.vstack(news_embeddings[f'embedding_{embedder}'].values)

    # --------------------------------------------------
    # Scaling
    # --------------------------------------------------
    # L2 normalisation is applied per sample
    scaler = Normalizer(norm='l2')
    X_train = scaler.fit_transform(X_train)
    X_news = scaler.transform(X_news)

    # --------------------------------------------------
    # Set params
    # --------------------------------------------------
    svc_params = params[country][embedder]
    base_model = SVC(**svc_params, probability=True, random_state=42)
    base_model.fit(X_train, y_train)

    # --------------------------------------------------
    # Predict
    # --------------------------------------------------
    probas = base_model.predict_proba(X_news)
    classes = base_model.classes_
    best_idx = probas.argmax(axis=1)
    max_probas = probas[np.arange(probas.shape[0]), best_idx]

    # Margin between the two highest class probabilities (1.0 when there is a single class)
    sorted_probas = np.sort(probas, axis=1)
    margins = sorted_probas[:, -1] - sorted_probas[:, -2] if probas.shape[1] >= 2 else np.ones(len(max_probas))

    predicted_classes = classes[best_idx]

    # --------------------------------------------------
    # Fetch per-class thresholds
    # --------------------------------------------------
    # Normalise numpy scalar keys/values to plain int/float
    per_class_thresholds = {
        int(k): float(v) for k, v in class_thresholds[country][embedder].items()
    }

    missing_thresholds = sorted({int(c) for c in predicted_classes} - set(per_class_thresholds))
    if missing_thresholds:
        # A missing threshold becomes NaN below; every comparison with NaN is False,
        # so those rows end up as UNKNOWN. Make that visible instead of silent.
        print(f"Warning: no threshold for class id(s) {missing_thresholds}; their predictions are labelled UNKNOWN")

    thresholds = np.array(
        [per_class_thresholds.get(int(c), np.nan) for c in predicted_classes], dtype=float
    )

    # --------------------------------------------------
    # Mask Politicians versus Unknowns
    # --------------------------------------------------
    is_politician = (max_probas >= thresholds)

    # --------------------------------------------------
    # Assign Unknown, Possible labels
    # --------------------------------------------------
    is_possible = (~is_politician) & (max_probas >= (thresholds - float(tolerance)))
    is_unknown = ~(is_politician | is_possible)

    # --------------------------------------------------
    # Final labeling
    # --------------------------------------------------
    y_pred = predicted_classes.astype(int).copy()
    y_pred[is_possible] = POSSIBLE_LABEL
    y_pred[is_unknown] = UNKNOWN_LABEL

    # --------------------------------------------------
    # DF output
    # --------------------------------------------------
    classified = news_embeddings[['image_path']].copy().join(image_identifiers)
    classified['max_proba'] = max_probas
    classified['margin'] = margins
    classified['predicted_class'] = predicted_classes
    classified['final_label'] = y_pred
    classified['predicted_class_name'] = classified['predicted_class'].map(id_to_name)
    classified['final_label_name'] = classified['final_label'].map(label_mapping)
    classified['class_threshold'] = thresholds
    classified['is_politician'] = is_politician
    classified['is_possible'] = is_possible
    classified['is_unknown'] = is_unknown

    # --------------------------------------------------
    # Summary report
    # --------------------------------------------------
    print(f"\n[{country} | {embedder}] summary:")
    print("Politician:", int(is_politician.sum()))
    print("Possible:  ", int(is_possible.sum()))
    print("Unknown:   ", int(is_unknown.sum()))
    print("Top-1 prob (median/mean):", np.median(max_probas).round(3), "/", np.mean(max_probas).round(3))
    print("Top-1-Top-2 margin (median/mean):", np.median(margins).round(3), "/", np.mean(margins).round(3))

    print(f"Threshold range: {thresholds.min():.3f} - {thresholds.max():.3f}")
    print(f"Mean threshold: {thresholds.mean():.3f}")

    accepted = pd.Series(predicted_classes[is_politician]).value_counts().sort_index()
    if not accepted.empty:
        print("\nAccepted per class (id: count):")
        for class_id, count in accepted.items():
            # Accepted rows passed a non-NaN threshold, so the lookup always succeeds
            threshold_val = per_class_thresholds[int(class_id)]
            print(f"  Class {class_id}: {count} (threshold: {threshold_val:.3f})")

    return classified

In [14]:
# Output folders for the classified embeddings and the per-label image copies.
# parents=True so a fresh checkout without a datasets/ folder does not raise FileNotFoundError.
classified_embedding_dir = "datasets/classified_embeddings"
Path(classified_embedding_dir).mkdir(parents=True, exist_ok=True)

image_dir = "datasets/news_images_classified"
Path(image_dir).mkdir(parents=True, exist_ok=True)

In [15]:
# Classify the news embeddings of every country with the same settings and store
# each result next to the other classified embeddings. The sentinel labels and
# the output folder come from the constants defined earlier instead of being
# repeated as literals (99 / 100 and the folder path).
news_classifications_VGG = {}
for country in COUNTRIES:
    classified = classify_news_embeddings(
        country=country,
        country_data=country_data,
        embedder='VGG-Face',
        params=COUNTRY_CONFIGS,
        class_thresholds=COUNTRY_THRESHOLDS,
        UNKNOWN_LABEL=UNKNOWN_LABEL,
        POSSIBLE_LABEL=POSSIBLE_LABEL)
    classified.to_parquet(
        os.path.join(classified_embedding_dir, f'{country}_VGG-Face_classified_news_embeddings.parquet')
    )
    news_classifications_VGG[country] = classified

# Keep the original per-country names available
nl_news_classifications_VGG = news_classifications_VGG['NL']
uk_news_classifications_VGG = news_classifications_VGG['UK']


[NL | VGG-Face] summary:
Politician: 1437
Possible:   1085
Unknown:    21394
Top-1 prob (median/mean): 0.321 / 0.369
Top-1-Top-2 margin (median/mean): 0.139 / 0.213
Threshold range: 0.500 - 0.900
Mean threshold: 0.729

Accepted per class (id: count):
  Class 0: 49 (threshold: 0.550)
  Class 1: 14 (threshold: 0.775)
  Class 2: 63 (threshold: 0.675)
  Class 3: 250 (threshold: 0.600)
  Class 4: 38 (threshold: 0.900)
  Class 5: 114 (threshold: 0.600)
  Class 6: 212 (threshold: 0.550)
  Class 7: 182 (threshold: 0.650)
  Class 8: 63 (threshold: 0.800)
  Class 9: 42 (threshold: 0.750)
  Class 10: 31 (threshold: 0.500)
  Class 11: 42 (threshold: 0.775)
  Class 12: 37 (threshold: 0.900)
  Class 13: 42 (threshold: 0.650)
  Class 14: 117 (threshold: 0.775)
  Class 15: 59 (threshold: 0.650)
  Class 16: 82 (threshold: 0.700)

[UK | VGG-Face] summary:
Politician: 8498
Possible:   9410
Unknown:    107026
Top-1 prob (median/mean): 0.296 / 0.338
Top-1-Top-2 margin (median/mean): 0.121 / 0.186
Threshol

In [2]:
# Integer class id -> politician name for NL; ids follow the order of the list
nl_classes_encoded = dict(enumerate([
    'Caroline_van_der_Plas',
    'Chris_Stoffer',
    'Dilan_Yesilgoz',
    'Edson_Olf',
    'Esther_Ouwehand',
    'Frans_Timmermans',
    'Geert_Wilders',
    'Henri_Bontenbal',
    'Joost_Eerdmans',
    'Laurens_Dassen',
    'Lilian_Marijnissen',
    'Mirjam_Bikker',
    'Pieter_Omtzigt',
    'Rob_Jetten',
    'Stephan_van_Baarle',
    'Thierry_Baudet',
    'Wybren_van_Haga',
]))


In [17]:
# Bare string literal, not executed logic: a pasted record of the output shown by
# earlier cells (the politician preview and the NL / UK classification summaries).
'''
['Adrian_Ramsay',
 'Adrian_Ramsay',
 'Adrian_Ramsay',
 'Adrian_Ramsay',
 'Adrian_Ramsay']

[NL | VGG-Face] summary:
Politician: 1437
Possible:   1085
Unknown:    21394
Top-1 prob (median/mean): 0.321 / 0.369
Top-1-Top-2 margin (median/mean): 0.139 / 0.213
Threshold range: 0.500 - 0.900
Mean threshold: 0.729

Accepted per class (id: count):
  Class 0: 49 (threshold: 0.550)
  Class 1: 14 (threshold: 0.775)
  Class 2: 63 (threshold: 0.675)
  Class 3: 250 (threshold: 0.600)
  Class 4: 38 (threshold: 0.900)
  Class 5: 114 (threshold: 0.600)
  Class 6: 212 (threshold: 0.550)
  Class 7: 182 (threshold: 0.650)
  Class 8: 63 (threshold: 0.800)
  Class 9: 42 (threshold: 0.750)
  Class 10: 31 (threshold: 0.500)
  Class 11: 42 (threshold: 0.775)
  Class 12: 37 (threshold: 0.900)
  Class 13: 42 (threshold: 0.650)
  Class 14: 117 (threshold: 0.775)
  Class 15: 59 (threshold: 0.650)
  Class 16: 82 (threshold: 0.700)

[UK | VGG-Face] summary:
Politician: 8498
Possible:   9410
Unknown:    107026
Top-1 prob (median/mean): 0.296 / 0.338
Top-1-Top-2 margin (median/mean): 0.121 / 0.186
Threshold range: 0.500 - 0.775
Mean threshold: 0.621

Accepted per class (id: count):
  Class 0: 396 (threshold: 0.525)
  Class 1: 651 (threshold: 0.775)
  Class 2: 220 (threshold: 0.625)
  Class 3: 59 (threshold: 0.675)
  Class 4: 373 (threshold: 0.575)
  Class 5: 130 (threshold: 0.525)
  Class 6: 114 (threshold: 0.575)
  Class 7: 381 (threshold: 0.750)
  Class 8: 307 (threshold: 0.525)
  Class 9: 492 (threshold: 0.575)
  Class 10: 183 (threshold: 0.550)
  Class 11: 288 (threshold: 0.500)
  Class 12: 105 (threshold: 0.600)
  Class 13: 431 (threshold: 0.525)
  Class 14: 114 (threshold: 0.700)
  Class 15: 222 (threshold: 0.500)
  Class 16: 4032 (threshold: 0.600)
'''

"\n['Adrian_Ramsay',\n 'Adrian_Ramsay',\n 'Adrian_Ramsay',\n 'Adrian_Ramsay',\n 'Adrian_Ramsay']\n\n[NL | VGG-Face] summary:\nPolitician: 1437\nPossible:   1085\nUnknown:    21394\nTop-1 prob (median/mean): 0.321 / 0.369\nTop-1-Top-2 margin (median/mean): 0.139 / 0.213\nThreshold range: 0.500 - 0.900\nMean threshold: 0.729\n\nAccepted per class (id: count):\n  Class 0: 49 (threshold: 0.550)\n  Class 1: 14 (threshold: 0.775)\n  Class 2: 63 (threshold: 0.675)\n  Class 3: 250 (threshold: 0.600)\n  Class 4: 38 (threshold: 0.900)\n  Class 5: 114 (threshold: 0.600)\n  Class 6: 212 (threshold: 0.550)\n  Class 7: 182 (threshold: 0.650)\n  Class 8: 63 (threshold: 0.800)\n  Class 9: 42 (threshold: 0.750)\n  Class 10: 31 (threshold: 0.500)\n  Class 11: 42 (threshold: 0.775)\n  Class 12: 37 (threshold: 0.900)\n  Class 13: 42 (threshold: 0.650)\n  Class 14: 117 (threshold: 0.775)\n  Class 15: 59 (threshold: 0.650)\n  Class 16: 82 (threshold: 0.700)\n\n[UK | VGG-Face] summary:\nPolitician: 8498\nPos

In [3]:
# Integer class id -> politician name for UK; ids follow the order of the list
uk_classes_encoded = dict(enumerate([
    'Adrian_Ramsay',
    'Carla_Denyer',
    'Colum_Eastwood',
    'Doug_Beattie',
    'Ed_Davey',
    'Gavin_Robinson',
    'George_Galloway',
    'Jim_Allister',
    'John_Swinney',
    'Keir_Starmer',
    'Lorna_Slater',
    'Mary_Lou_McDonald',
    'Naomi_Long',
    'Nigel_Farage',
    'Patrick_Harvie',
    'Rhun_ap_Iorwerth',
    'Rishi_Sunak',
]))

In [16]:
# Copy the classified news images into per-label folders, first for NL and then for UK
for country_code in ('NL', 'UK'):
    copy_classified_images_flex(
        country=country_code,
        embedding_model='VGG-Face',
        classified_embedding_dir='datasets/classified_embeddings',
        image_dir='datasets/news_images_classified',
    )

### Manual correction with drag/drop

In [18]:
# Recorded durations (hours / mins / secs) per country, split into the
# politician class folders and the "possible" class folder
time_taken_with_correction = {
    'NL': {
        'politician_classes': dict(hours=0, mins=18, secs=35),
        'possible_class': dict(hours=0, mins=5, secs=35),
    },
    'UK': {
        'politician_classes': dict(hours=1, mins=8, secs=16),
        'possible_class': dict(hours=0, mins=25, secs=49),
    },
}