In [39]:
import pandas as pd
import numpy as np
import re
import os
import io
import ast
import requests
import random
import time
from PIL import Image

from collections import defaultdict

Fetch datasets


In [40]:
countries = ['NL', 'UK']

In [41]:
# NL
nl_text_processed = pd.read_parquet('datasets/NL_articles_text_processed.parquet')
nl_classified_img = pd.read_parquet('../d__news-images-classification-and-correction/datasets/NL_corrected_classifications_VGG-Face_grouped_by_id.parquet')
nl_classified_img = nl_classified_img.rename(columns={'true_label': 'politician_in_img'})
nl_classified_img['politician_in_img'] = nl_classified_img['politician_in_img'].tolist()
nl_classified_img['politician_in_img'] = nl_classified_img['politician_in_img'].apply(lambda x: tuple(sorted(x)))
nl_classified_img['country'] = 'NL'

# UK
uk_text_processed = pd.read_parquet('datasets/UK_articles_text_processed.parquet')
uk_classified_img = pd.read_parquet('../d__news-images-classification-and-correction/datasets/UK_corrected_classifications_VGG-Face_grouped_by_id.parquet')
uk_classified_img = uk_classified_img.rename(columns={'true_label': 'politician_in_img'})
uk_classified_img['politician_in_img'] = uk_classified_img['politician_in_img'].tolist()
uk_classified_img['politician_in_img'] = uk_classified_img['politician_in_img'].apply(lambda x: tuple(sorted(x)))
uk_classified_img['outlet'] = uk_classified_img['outlet'].apply(lambda x: 'The Guardian' if x == 'TheGuardian' else x)
uk_classified_img['country'] = 'UK'

In [42]:
#uk_text_processed.to_excel('check_for_images_27aug.xlsx')

In [43]:
print(f'NL data summary:')
print(f'The data with processed text from the news articles is of shape {nl_text_processed.shape[0]}.')
print(f'The data with classified and corrected labels of politicians in news images is of shape {nl_classified_img.shape[0]}\nand has columns {nl_classified_img.columns.to_list()}.')
print('* * * * * ')
print(f'UK data summary:')
print(f'The data with processed text from the news articles is of shape {uk_text_processed.shape[0]}.')
print(f'The data with classified and corrected labels of politicians in news images is of shape {uk_classified_img.shape[0]}\nand has columns {uk_classified_img.columns.to_list()}.')

NL data summary:
The data with processed text from the news articles is of shape 11915.
The data with classified and corrected labels of politicians in news images is of shape 283
and has columns ['id_unique', 'id', 'outlet', 'politician_in_img', 'country'].
* * * * * 
UK data summary:
The data with processed text from the news articles is of shape 82777.
The data with classified and corrected labels of politicians in news images is of shape 1139
and has columns ['id_unique', 'id', 'outlet', 'politician_in_img', 'country'].


In [44]:
uk_classified_img['politician_in_img'].value_counts()

politician_in_img
(Keir_Starmer,)                                                                                       417
(Rishi_Sunak,)                                                                                        326
(Nigel_Farage,)                                                                                       110
(John_Swinney,)                                                                                        61
(Ed_Davey,)                                                                                            48
(Keir_Starmer, Rishi_Sunak)                                                                            35
(Carla_Denyer,)                                                                                        19
(Mary_Lou_McDonald,)                                                                                   13
(Naomi_Long,)                                                                                          13
(Adrian_Ramsay,)            

Helper functions

In [45]:
def sanitize_filename(s):
    """Remove/replace invalid filename characters."""
    return re.sub(r'[<>:"/\\|?*]', '_', s)

def download_img_from_url(folder, dataframe, id_column, clf_column, url_column):
    '''
    Function that takes the url and downloads the image with the classification as filename.
    :folder: folder to which image must be written
    :dataframe: dataframe in which the image urls are stored
    :id_column: column that holds the unique identifer of the item, to be added to the filename
    :clf_column: column that holds the lists with classifications, to be added to the filename for easy checking
    :url_column: column that holds the url information, which need to be opened and scraped
    '''
    if not os.path.exists(folder):
        os.makedirs(folder)
        
    for i, row in dataframe.iterrows():
        time.sleep(random.uniform(0.5, 1))

        url = row[url_column]

        clf_value = row[clf_column]
        if isinstance(clf_value, (list, tuple)):
            clf_str = " & ".join(map(str, clf_value))
        else:
            clf_str = str(clf_value)

        clf_str = sanitize_filename(clf_str)
        filename = f'{row[id_column]}__{clf_str}.jpg'
        file_path = os.path.join(folder, filename) 
    
        try:
            image_content = requests.get(url, timeout=60).content
            image_file = io.BytesIO(image_content)
            image = Image.open(image_file)

            with open(file_path, "wb") as f:
                image.save(f, "JPEG")

        except Exception as e:
            print(f"Error downloading image from {url}: {str(e)}")

Merge classifications with text_processed df

In [46]:
nl_articles_img = pd.merge(nl_text_processed, nl_classified_img, on=['id_unique', 'id', 'outlet', 'country'], how='left')
nl_articles_img.sort_values(by=['country', 'outlet', 'datetime'], ascending=[True, True, True], inplace=True)

uk_articles_img = pd.merge(uk_text_processed, uk_classified_img, on=['id_unique', 'id', 'outlet', 'country'], how='left')
uk_articles_img.sort_values(by=['country', 'outlet', 'datetime'], ascending=[True, True, True], inplace=True)

In [47]:
print(nl_articles_img.columns.to_list())

['country', 'outlet', 'id', 'url', 'images', 'datetime', 'category', 'title', 'paragraphs', 'alt_txt', 'id_unique', 'PVV__title', 'GL-PvdA__title', 'VVD__title', 'NSC__title', 'D66__title', 'BBB__title', 'CDA__title', 'SP__title', 'FVD__title', 'PvdD__title', 'CU__title', 'SGP__title', 'DENK__title', 'Volt__title', 'JA21__title', 'Bij1__title', 'BvNL__title', 'Positions__title', 'Politics__title', 'Issues__title', 'National__title', 'International__title', 'PVV__paragraphs', 'GL-PvdA__paragraphs', 'VVD__paragraphs', 'NSC__paragraphs', 'D66__paragraphs', 'BBB__paragraphs', 'CDA__paragraphs', 'SP__paragraphs', 'FVD__paragraphs', 'PvdD__paragraphs', 'CU__paragraphs', 'SGP__paragraphs', 'DENK__paragraphs', 'Volt__paragraphs', 'JA21__paragraphs', 'Bij1__paragraphs', 'BvNL__paragraphs', 'Positions__paragraphs', 'Politics__paragraphs', 'Issues__paragraphs', 'National__paragraphs', 'International__paragraphs', 'PVV__alt_txt', 'GL-PvdA__alt_txt', 'VVD__alt_txt', 'NSC__alt_txt', 'D66__alt_txt', 

Two step validation <br>
1. Exhaustiveness: do the labels cover all the politicians in the classified image?

In [48]:
# NL
nl_pol_only = nl_articles_img[~nl_articles_img['politician_in_img'].isna()].copy()
nl_clf_download_folder = 'datasets/image_validation/NL/classified_img_to_check__exhaustive'

#download_img_from_url(nl_clf_download_folder, nl_pol_only, 'id_unique', 'politician_in_img', 'images')

In [49]:
# UK
uk_pol_only = uk_articles_img[~uk_articles_img['politician_in_img'].isna()].copy()
uk_clf_download_folder = 'datasets/image_validation/UK/classified_img_to_check__exhaustive'

#download_img_from_url(uk_clf_download_folder, uk_pol_only, 'id_unique', 'politician_in_img', 'images')

2. Inclusiveness: were politicians missed for the articles with high likelihood of showing a politician in the image?

In [50]:
nl_columns_with_title_or_alt_txt = [
    col for col in nl_articles_img.columns
    if ('_title' in col or '_alt_txt' in col) and
       not ('Positions' in col or 'Politics' in col or 'Issues' in col or 'National' in col or 'International' in col)
]

uk_columns_with_title_or_alt_txt = [
    col for col in uk_articles_img.columns
    if ('_title' in col or '_alt_txt' in col) and
       not ('Positions' in col or 'Politics' in col or 'Issues' in col or 'National' in col or 'International' in col)
]

def is_non_empty_sequence(x):
    if isinstance(x, (list, np.ndarray)):
        return len(x) > 0
    return False

In [51]:
# NL
nl_no_clf_in_img = nl_articles_img[nl_articles_img['politician_in_img'].isna()].copy()
nl_no_clf_in_img = nl_no_clf_in_img[~nl_no_clf_in_img['images'].isna()]
nl_no_clf_in_img = nl_no_clf_in_img[nl_no_clf_in_img['images'] != '']

nl_rows_with_non_empty_lists = (
    nl_no_clf_in_img[nl_columns_with_title_or_alt_txt]
    .map(is_non_empty_sequence)
    .any(axis=1)
)

nl_no_clf_to_check = nl_no_clf_in_img[nl_rows_with_non_empty_lists].copy()
print(f'There are {nl_no_clf_to_check.shape[0]} articles where a politician or party is mentioned in the title or image alt_txt,\nbut for which no politician was detected.\nThese images need an additional check to see whether there is really no politician in the image.')

nl_no_clf_download_folder = 'datasets/image_validation/NL/no_clf_img_to_check__inclusive'

#download_img_from_url(nl_no_clf_download_folder, nl_no_clf_to_check, 'id_unique', 'politician_in_img', 'images')
print('* * * * * ')

There are 295 articles where a politician or party is mentioned in the title or image alt_txt,
but for which no politician was detected.
These images need an additional check to see whether there is really no politician in the image.
* * * * * 


In [52]:
# UK
uk_no_clf_in_img = uk_articles_img[uk_articles_img['politician_in_img'].isna()].copy()
uk_no_clf_in_img = uk_no_clf_in_img[
    (~uk_no_clf_in_img['images'].isna()) &  
    (uk_no_clf_in_img['images'] != 'NA')    
]
uk_rows_with_non_empty_lists = (
    uk_no_clf_in_img[uk_columns_with_title_or_alt_txt]
    .map(is_non_empty_sequence)
    .any(axis=1)
)

uk_no_clf_to_check = uk_no_clf_in_img[uk_rows_with_non_empty_lists].copy()
print(f'There are {uk_no_clf_to_check.shape[0]} articles where a politician or party is mentioned in the title or image alt_txt,\nbut for which no politician was detected.\nThese images need an additional check to see whether there is really no politician in the image.')

uk_no_clf_download_folder = 'datasets/image_validation/UK/no_clf_img_to_check__inclusive'

#download_img_from_url(uk_no_clf_download_folder, uk_no_clf_to_check, 'id_unique', 'politician_in_img', 'images')
print('* * * * * ')

There are 3877 articles where a politician or party is mentioned in the title or image alt_txt,
but for which no politician was detected.
These images need an additional check to see whether there is really no politician in the image.
* * * * * 


### Validation

In [None]:
time_taken_with_validation = {
    'NL': {
        'validation_exhaustive': {
            'hours': 0,
            'mins': 16,
            'secs': 59
        },
        'validation_inclusive':{
            'hours': 0,
            'mins': 11,
            'secs': 27
        }
    },
    'UK': {
        'validation_exhaustive': {
            'hours': 0,
            'mins': 8,
            'secs': 27
        },
        'validation_inclusive':{
            'hours': 0,
            'mins': 33,  
            'secs': 10 
        }
    }
}

#### NL

In [54]:
# NL
nl_labels_to_add_exhaustive = [
    {
        'id_unique': 'id00848',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id00874',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id01137',
        'politician_to_add': ['Pieter_Omtzigt', 'Caroline_van_der_Plas']
    },
    {
        'id_unique': 'id01948',
        'politician_to_add': ['Pieter_Omtzigt', 'Caroline_van_der_Plas']
    },
    {
        'id_unique': 'id02038',
        'politician_to_add': ['Frans_Timmermans', 'Rob_Jetten', 'Geert_Wilders']
    },
    {
        'id_unique': 'id02055',
        'politician_to_add': ['Rob_Jetten', 'Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id02147',
        'politician_to_add': ['Henri_Bontenbal', 'Geert_Wilders']
    },
    {
        'id_unique': 'id02197',
        'politician_to_add': ['Frans_Timmermans']
    },
    {
        'id_unique': 'id027868',
        'politician_to_add': ['Mirjam_Bikker', 'Wybren_van_Haga', 'Caroline_van_der_Plas']
    },
    {
        'id_unique': 'id06489',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id06667',
        'politician_to_add': ['Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id06772',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id06914',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id07243',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id07753',
        'politician_to_add': ['Lilian_Marijnissen']
    },
    {
        'id_unique': 'id09857',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id10086',
        'politician_to_add': ['Pieter_Omtzigt', 'Caroline_van_der_Plas']
    },
    {
        'id_unique': 'id10284',
        'politician_to_add': ['Caroline_van_der_Plas']
    },
    {
        'id_unique': 'id11191',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id11247',
        'politician_to_add': ['Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id11328',
        'politician_to_add': ['Geert_Wilders']
    },
    {
        'id_unique': 'id11376',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id11689',
        'politician_to_add': ['Lilian_Marijnissen', 'Caroline_van_der_Plas', 'Rob_Jetten']
    }
]

nl_labels_to_add_inclusive = [
    {
        'id_unique': 'id00557',
        'politician_to_add': ['Thierry_Baudet']
    },
    {
        'id_unique': 'id00583',
        'politician_to_add': ['Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id01665',
        'politician_to_add': ['Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id02606',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id03448',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id03685',
        'politician_to_add': ['Rob_Jetten']
    },
    {
        'id_unique': 'id03729',
        'politician_to_add': ['Geert_Wilders', 'Pieter_Omtzigt']
    },
    {
        'id_unique': 'id04457',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id05000',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id05173',
        'politician_to_add': ['Esther_Ouwehand']
    },
    {
        'id_unique': 'id05176',
        'politician_to_add': ['Esther_Ouwehand']
    },
    {
        'id_unique': 'id05889',
        'politician_to_add': ['Rob_Jetten']
    },
    {
        'id_unique': 'id06299',
        'politician_to_add': ['Laurens_Dassen']
    },
    {
        'id_unique': 'id06461',
        'politician_to_add': ['Joost_Eerdmans']
    },
    {
        'id_unique': 'id06602',
        'politician_to_add': ['Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id07050',
        'politician_to_add': ['Thierry_Baudet']
    },
    {
        'id_unique': 'id07088',
        'politician_to_add': ['Thierry_Baudet']
    },
    {
        'id_unique': 'id09057',
        'politician_to_add': ['Esther_Ouwehand']
    },
    {
        'id_unique': 'id10312',
        'politician_to_add': ['Caroline_van_der_Plas', 'Pieter_Omtzigt', 'Dilan_Yesilgoz']
    },
    {
        'id_unique': 'id10634',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id10789',
        'politician_to_add': ['Pieter_Omtzigt']
    },
    {
        'id_unique': 'id10926',
        'politician_to_add': ['Geert_Wilders']
    },
    {
        'id_unique': 'id11257',
        'politician_to_add': ['Geert_Wilders']
    }
]

In [55]:
nl_exhaustive_df = pd.DataFrame([{'validation': 'validation1_exhaustive', **item} for item in nl_labels_to_add_exhaustive])
nl_inclusive_df = pd.DataFrame([{'validation': 'validation2_inclusive', **item} for item in nl_labels_to_add_inclusive])

nl_validation_labels_to_add = pd.concat([nl_exhaustive_df, nl_inclusive_df], ignore_index=True)

print(f'For the validation of NL, {nl_validation_labels_to_add["politician_to_add"].apply(len).sum()} labels were found that need to be added to the final dataframe.')
print(f'The first validation step, concerning exhaustiveness: {len(nl_exhaustive_df)} articles with in total {nl_exhaustive_df["politician_to_add"].apply(len).sum()} labels.')
print(f'The second validation step, concerning inclusiveness: {len(nl_inclusive_df)} articles with in total {nl_inclusive_df["politician_to_add"].apply(len).sum()} labels.')
print(f'The resulting df is of shape {nl_validation_labels_to_add.shape} and has columns: {nl_validation_labels_to_add.columns.to_list()}.')

For the validation of NL, 60 labels were found that need to be added to the final dataframe.
The first validation step, concerning exhaustiveness: 23 articles with in total 34 labels.
The second validation step, concerning inclusiveness: 23 articles with in total 26 labels.
The resulting df is of shape (46, 3) and has columns: ['validation', 'id_unique', 'politician_to_add'].


In [56]:
nl_validation_labels_to_add.to_parquet('datasets/image_validation/NL/NL_validation_incl_exh_labels_to_add.parquet')

### UK

In [57]:
uk_labels_to_add_exhaustive = [
    {
        'id_unique': 'id33997',
        'politician_to_add': ['Keir_Starmer']
    },
    {
        'id_unique': 'id39937',
        'politician_to_add': ['Naomi_Long', 'Gavin_Robinson']
    },
    {
        'id_unique': 'id41702',
        'politician_to_add': ['John_Swinney']
    },
    {
        'id_unique': 'id42135',
        'politician_to_add': ['Rishi_Sunak']
    },
    {
        'id_unique': 'id42340',
        'politician_to_add': ['Rishi_Sunak']
    },
    {
        'id_unique': 'id43176',
        'politician_to_add': ['Keir_Starmer']
    },
    {
        'id_unique': 'id43305',
        'politician_to_add': ['Gavin_Robinson']
    }
]

In [58]:
uk_labels_to_add_inclusive = defaultdict(set)
check_dir = "datasets/image_validation/UK/temp_inclusive_check"

for politician_name in os.listdir(check_dir):
    politician_path = os.path.join(check_dir, politician_name)

    if os.path.isdir(politician_path):
        for file_name in os.listdir(politician_path):
            if file_name.endswith(".jpg"):
                id_unique = file_name.split("__")[0]

                uk_labels_to_add_inclusive[id_unique].add(politician_name)

# Convert back to your desired list-of-dicts format
uk_labels_to_add_inclusive = [
    {"id_unique": id_unique, "politician_to_add": list(politicians)}
    for id_unique, politicians in uk_labels_to_add_inclusive.items()
]

In [None]:
uk_exhaustive_df = pd.DataFrame([{'validation': 'validation1_exhaustive', **item} for item in uk_labels_to_add_exhaustive])
uk_inclusive_df = pd.DataFrame([{'validation': 'validation2_inclusive', **item} for item in uk_labels_to_add_inclusive])

uk_validation_labels_to_add = pd.concat([uk_exhaustive_df, uk_inclusive_df], ignore_index=True)

print(f'For the validation of UK, {uk_validation_labels_to_add["politician_to_add"].apply(len).sum()} labels were found that need to be added to the final dataframe.')
print(f'The first validation step, concerning exhaustiveness: {len(uk_exhaustive_df)} articles with in total {uk_exhaustive_df["politician_to_add"].apply(len).sum()} labels.')
print(f'The second validation step, concerning inclusiveness: {len(uk_inclusive_df)} articles with in total {uk_inclusive_df["politician_to_add"].apply(len).sum()} labels.')
print(f'The resulting df is of shape {uk_validation_labels_to_add.shape} and has columns: {uk_validation_labels_to_add.columns.to_list()}.')

For the validation of NL, 135 labels were found that need to be added to the final dataframe.
The first validation step, concerning exhaustiveness: 7 articles with in total 8 labels.
The second validation step, concerning inclusiveness: 119 articles with in total 127 labels.
The resulting df is of shape (126, 3) and has columns: ['validation', 'id_unique', 'politician_to_add'].


In [60]:
uk_validation_labels_to_add.to_parquet('datasets/image_validation/UK/UK_validation_incl_exh_labels_to_add.parquet')

### Finalize data for both countries

In [61]:
nl_validation_labels_to_add['politician_to_add'] = nl_validation_labels_to_add['politician_to_add'].apply(lambda x: tuple(sorted(x)))
uk_validation_labels_to_add['politician_to_add'] = uk_validation_labels_to_add['politician_to_add'].apply(lambda x: tuple(sorted(x)))

In [62]:
country_data_dict = {
    'NL': {
        'classified_df': nl_articles_img,
        'validated_labels_to_add': nl_validation_labels_to_add
    },
    'UK': {
        'classified_df': uk_articles_img,
        'validated_labels_to_add': uk_validation_labels_to_add
    }
}

In [63]:
def add_tuples_if_empty(x, y):
    if pd.notna(x) and isinstance(x, tuple):
        if pd.notna(y) and isinstance(y, tuple):
            return x + y
        else:
            return x
    else:
        if pd.notna(y) and isinstance(y, tuple):
            return y
        else:
            return x
        
def finalize_validated_df(data_dict, country):
    classified_df = data_dict[country]['classified_df']
    validated_labels_to_add = data_dict[country]['validated_labels_to_add']
    final_df = classified_df.merge(validated_labels_to_add, on='id_unique', how='left')
    final_df['politicians_depicted'] = final_df.apply(lambda row: add_tuples_if_empty(row['politician_in_img'], row['politician_to_add']), axis=1)
    
    return final_df

In [64]:
nl_final_parquet = finalize_validated_df(country_data_dict, 'NL')
nl_final_parquet.to_parquet('datasets/NL_final_df_multi_modal.parquet')
uk_final_parquet = finalize_validated_df(country_data_dict, 'UK')
uk_final_parquet.to_parquet('datasets/UK_final_df_multi_modal.parquet')

### Inspect

In [65]:
# Quick inspection 
nl_new_count = nl_final_parquet['politicians_depicted'].value_counts()
nl_old_count = nl_articles_img['politician_in_img'].value_counts()
nl_compare = pd.concat([nl_new_count, nl_old_count], axis=1)
print(nl_compare)

# Quick inspection 
pd.set_option("display.max_rows", None)
uk_new_count = uk_final_parquet['politicians_depicted'].value_counts()
uk_old_count = uk_articles_img['politician_in_img'].value_counts()
uk_compare = pd.concat([uk_new_count, uk_old_count], axis=1)
print(uk_compare)

                                                    count  count
(Geert_Wilders,)                                     44.0   46.0
(Dilan_Yesilgoz,)                                    34.0   33.0
(Esther_Ouwehand,)                                   30.0   27.0
(Pieter_Omtzigt,)                                    29.0   25.0
(Thierry_Baudet,)                                    25.0   22.0
(Caroline_van_der_Plas,)                             16.0   18.0
(Lilian_Marijnissen,)                                16.0   16.0
(Rob_Jetten,)                                        16.0   15.0
(Frans_Timmermans,)                                   9.0   10.0
(Henri_Bontenbal,)                                    7.0    7.0
(Geert_Wilders, Pieter_Omtzigt)                       7.0    3.0
(Stephan_van_Baarle,)                                 5.0    5.0
(Mirjam_Bikker,)                                      3.0    3.0
(Laurens_Dassen,)                                     3.0    2.0
(Edson_Olf,)             

Manual validation [BACKUP UNKNOWN AS POSITIVE]

In [16]:
# NL
nl_labels_to_add_exhaustive = [
    {
        'id_unique': 'id00874',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id01137',
        'politician_to_add': ['Caroline van der Plas', 'Pieter Omtzigt']
    },
    {
        'id_unique': 'id01948',
        'politician_to_add': ['Caroline van der Plas', 'Pieter Omtzigt']
    },
    {
        'id_unique': 'id02038',
        'politician_to_add': ['Frans Timmermans', 'Rob Jetten', 'Geert Wilders']
    },
    {
        'id_unique': 'id02055',
        'politician_to_add': ['Dilan Yesilgoz']
    },
    {
        'id_unique': 'id02101',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id02147',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id02197',
        'politician_to_add': ['Frans Timmermans']
    },
    {
        'id_unique': 'id02768',
        'politician_to_add': ['Wybren van Haga', 'Caroline van der Plas']
    },
    {
        'id_unique': 'id02876',
        'politician_to_add': ['Pieter Omtzigt']
    },
    {
        'id_unique': 'id03729',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id11403',
        'politician_to_add': ['Dilan Yesilgoz']
    },
    {
        'id_unique': 'id10312',
        'politician_to_add': ['Caroline van der Plas', 'Dilan Yesilgoz']
    },
    {
        'id_unique': 'id06930',
        'politician_to_add': ['Laurens Dassen', 'Joost Eerdmans']
    },
    {
        'id_unique': 'id10312',
        'politician_to_add': ['Caroline van der Plas', 'Dilan Yesilgoz']
    },
    {
        'id_unique': 'id06914',
        'politician_to_add': ['Pieter Omtzigt']
    }
]

nl_labels_to_add_inclusive = [
    {
        'id_unique': 'id02369',
        'politician_to_add': ['Caroline van der Plas']
    },
    {
        'id_unique': 'id05736',
        'politician_to_add': ['Thierry Baudet']
    },
    {
        'id_unique': 'id06299',
        'politician_to_add': ['Laurens Dassen']
    },
    {
        'id_unique': 'id06461',
        'politician_to_add': ['Joost Eerdmans']
    },
    {
        'id_unique': 'id06602',
        'politician_to_add': ['Dilan Yesilgoz']
    },
    {
        'id_unique': 'id10789',
        'politician_to_add': ['Pieter Omtzigt']
    },
    {
        'id_unique': 'id11364',
        'politician_to_add': ['Thierry Baudet']
    },
    {
        'id_unique': 'id11364',
        'politician_to_add': ['Thierry Baudet']
    }
]

In [17]:
nl_exhaustive_labels_to_add = [{'validation': 'validation1_exhaustive', **item} for item in nl_labels_to_add_exhaustive]
nl_inclusive_labels_to_add = [{'validation': 'validation2_inclusive', **item} for item in nl_labels_to_add_inclusive]

nl_validated = nl_exhaustive_labels_to_add + nl_inclusive_labels_to_add
nl_validation_labels_to_add = pd.DataFrame(nl_validated)
nl_validation_labels_to_add['politician_to_add'] = nl_validation_labels_to_add['politician_to_add'].apply(lambda x: [name.replace(' ', '_') for name in x])

In [18]:
print(f'For the validation of NL, {nl_validation_labels_to_add.shape[0]} labels were found that need to be added to the final dataframe.')
print(f'The first validation step, concerning exhaustiveness: {len(nl_exhaustive_labels_to_add)}.')
print(f'The second validation step, concerning inclusiveness: {len(nl_inclusive_labels_to_add)}.')
print(f'The resulting df is of shape {nl_validation_labels_to_add.shape[0]} and has columns: {nl_validation_labels_to_add.columns.to_list()}.')

For the validation of NL, 24 labels were found that need to be added to the final dataframe.
The first validation step, concerning exhaustiveness: 16.
The second validation step, concerning inclusiveness: 8.
The resulting df is of shape 24 and has columns: ['validation', 'id_unique', 'politician_to_add'].


In [19]:
nl_validation_labels_to_add.to_parquet('datasets/image_validation/NL/NL_validation_incl_exh_labels_to_add.parquet')

In [73]:
print(nl_validation_labels_to_add.groupby(by=['validation', 'politician_to_add']).count())

                                                                                id_unique
validation             politician_to_add                                                 
validation1_exhaustive (Caroline_van_der_Plas,)                                         1
                       (Caroline_van_der_Plas, Lilian_Marijnissen, Rob_Jetten)          1
                       (Caroline_van_der_Plas, Mirjam_Bikker, Wybren_van_Haga)          1
                       (Caroline_van_der_Plas, Pieter_Omtzigt)                          3
                       (Dilan_Yesilgoz,)                                                2
                       (Dilan_Yesilgoz, Rob_Jetten)                                     1
                       (Frans_Timmermans,)                                              1
                       (Frans_Timmermans, Geert_Wilders, Rob_Jetten)                    1
                       (Geert_Wilders,)                                                 1
          

In [20]:
# UK
uk_labels_to_add_exhaustive = [
    {
        'id_unique': 'id19858',
        'politician_to_add': ['Patrick Harvie']
    },
    {
        'id_unique': 'id39937',
        'politician_to_add': ['Colum Eastwood', 'Gavin Robinson']
    },
    {
        'id_unique': 'id42135',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id42340',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id43176',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id74779',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id75904',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76384',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id76521',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76597',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76720',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77372',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id77834',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79075',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79757',
        'politician_to_add': ['Adrian Ramsay']
    },
    {
        'id_unique': 'id80655',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id81180',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id81352',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83362',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id88083',
        'politician_to_add': ['Keir Starmer']
    }
]

# UK
uk_labels_to_add_inclusive = [
    {
        'id_unique': 'id13023',
        'politician_to_add': ['Gavin Robinson']
    },
    {
        'id_unique': 'id17122',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id17389',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id20538',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id23559',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id26189',
        'politician_to_add': ['Mary Lou McDonald']
    },
    {
        'id_unique': 'id28862',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id29836',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id29875',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id33075',
        'politician_to_add': ['John Swinney']
    },
    {
        'id_unique': 'id33799',
        'politician_to_add': ['Keir Starmer', 'Rishi Sunak']
    },
    {
        'id_unique': 'id36487',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id36610',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id39128',
        'politician_to_add': ['Naomi Long']
    },
    {
        'id_unique': 'id44211',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id44546',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id45134',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id45899',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id46507',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id59144',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id63477',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id63983',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id64186',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id67241',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id70683',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id71254',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id71618',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id72059',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id72097',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id72481',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id73714',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id74357',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id74889',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id74957',
        'politician_to_add': ['Mary Lou McDonald']
    },
    {
        'id_unique': 'id75081',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id75225',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id75606',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id75827',
        'politician_to_add': ['John Swinney']
    },
    {
        'id_unique': 'id76039',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id76216',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76346',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76719',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id77046',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77139',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id77191',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77550',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77549',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id77953',
        'politician_to_add': ['Adrian Ramsay']
    },
    {
        'id_unique': 'id78786',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79025',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79264',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79699',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79824',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79847',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79949',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80176',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id80478',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80711',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80762',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80855',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id81118',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id816277',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id82379',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83271',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83354',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id83535',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83536',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id83687',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id84581',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id84646',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id85744',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id86038',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id87084',
        'politician_to_add': ['George Galloway']
    },
    {
        'id_unique': 'id87345',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id87751',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id88002',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id88454',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id89545',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id89560',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id89595',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id89923',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id90052',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id90271',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id91370',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id91461',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id91528',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id92083',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id92088',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id92712',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id93586',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id93812',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id94040',
        'politician_to_add': ['Carla Denyer', 'Adrian Ramsay']
    }
]

In [21]:
uk_exhaustive_labels_to_add = [{'validation': 'validation1_exhaustive', **item} for item in uk_labels_to_add_exhaustive]
uk_inclusive_labels_to_add = [{'validation': 'validation2_inclusive', **item} for item in uk_labels_to_add_inclusive]

uk_validated = uk_exhaustive_labels_to_add + uk_inclusive_labels_to_add
uk_validation_labels_to_add = pd.DataFrame(uk_validated)
uk_validation_labels_to_add['politician_to_add'] = uk_validation_labels_to_add['politician_to_add'].apply(lambda x: [name.replace(' ', '_') for name in x])

In [22]:
print(f'For the validation of UK, {uk_validation_labels_to_add.shape[0]} labels were found that need to be added to the final dataframe.')
print(f'The first validation step, concerning exhaustiveness: {len(uk_exhaustive_labels_to_add)}.')
print(f'The second validation step, concerning inclusiveness: {len(uk_inclusive_labels_to_add)}.')
print(f'The resulting df is of shape {uk_validation_labels_to_add.shape[0]} and has columns: {uk_validation_labels_to_add.columns.to_list()}.')

For the validation of UK, 112 labels were found that need to be added to the final dataframe.
The first validation step, concerning exhaustiveness: 20.
The second validation step, concerning inclusiveness: 92.
The resulting df is of shape 112 and has columns: ['validation', 'id_unique', 'politician_to_add'].


In [23]:
uk_validation_labels_to_add.to_parquet('datasets/image_validation/UK/UK_validation_incl_exh_labels_to_add.parquet')

In [74]:
print(uk_validation_labels_to_add.groupby(by=['validation', 'politician_to_add']).count())

                                                      id_unique
validation             politician_to_add                       
validation1_exhaustive (Gavin_Robinson,)                      1
                       (Gavin_Robinson, Naomi_Long)           1
                       (John_Swinney,)                        1
                       (Keir_Starmer,)                        2
                       (Rishi_Sunak,)                         2
validation2_inclusive  (Adrian_Ramsay,)                       1
                       (Adrian_Ramsay, Carla_Denyer)          1
                       (Carla_Denyer,)                        1
                       (Carla_Denyer, Nigel_Farage)           4
                       (Doug_Beattie,)                        1
                       (Ed_Davey,)                            5
                       (George_Galloway,)                     1
                       (Jim_Allister,)                        1
                       (John_Swinney,)  

Finalize data for both countries

In [24]:
nl_validation_labels_to_add['politician_to_add'] = nl_validation_labels_to_add['politician_to_add'].apply(lambda x: tuple(sorted(x)))
uk_validation_labels_to_add['politician_to_add'] = uk_validation_labels_to_add['politician_to_add'].apply(lambda x: tuple(sorted(x)))

In [25]:
country_data_dict = {
    'NL': {
        'classified_df': nl_articles_img,
        'validated_labels_to_add': nl_validation_labels_to_add
    },
    'UK': {
        'classified_df': uk_articles_img,
        'validated_labels_to_add': uk_validation_labels_to_add
    }
}

In [26]:
def add_tuples_if_empty(x, y):
    if pd.notna(x) and isinstance(x, tuple):
        if pd.notna(y) and isinstance(y, tuple):
            return x + y
        else:
            return x
    else:
        if pd.notna(y) and isinstance(y, tuple):
            return y
        else:
            return x

In [27]:
def finalize_validated_df(data_dict, country):
    classified_df = data_dict[country]['classified_df']
    validated_labels_to_add = data_dict[country]['validated_labels_to_add']
    final_df = classified_df.merge(validated_labels_to_add, on='id_unique', how='left')
    final_df['politicians_depicted'] = final_df.apply(lambda row: add_tuples_if_empty(row['politician_in_img'], row['politician_to_add']), axis=1)
    
    return final_df

In [28]:
nl_final_parquet = finalize_validated_df(country_data_dict, 'NL')
nl_final_parquet.to_parquet('datasets/NL_final_df_multi_modal.parquet')
uk_final_parquet = finalize_validated_df(country_data_dict, 'UK')
uk_final_parquet.to_parquet('datasets/UK_final_df_multi_modal.parquet')


In [29]:
''' # Quick inspection 
nl_new_count = nl_final_parquet['politicians_depicted'].value_counts()
nl_old_count = nl_articles_img['politician_in_img'].value_counts()
nl_compare = pd.concat([nl_new_count, nl_old_count], axis=1)
print(nl_compare)'''

" # Quick inspection \nnl_new_count = nl_final_parquet['politicians_depicted'].value_counts()\nnl_old_count = nl_articles_img['politician_in_img'].value_counts()\nnl_compare = pd.concat([nl_new_count, nl_old_count], axis=1)\nprint(nl_compare)"

In [30]:
''' # Quick inspection 
pd.set_option("display.max_rows", None)
uk_new_count = uk_final_parquet['politicians_depicted'].value_counts()
uk_old_count = uk_articles_img['politician_in_img'].value_counts()
uk_compare = pd.concat([uk_new_count, uk_old_count], axis=1)
print(uk_compare)'''

' # Quick inspection \npd.set_option("display.max_rows", None)\nuk_new_count = uk_final_parquet[\'politicians_depicted\'].value_counts()\nuk_old_count = uk_articles_img[\'politician_in_img\'].value_counts()\nuk_compare = pd.concat([uk_new_count, uk_old_count], axis=1)\nprint(uk_compare)'