In [1]:
import pandas as pd
import numpy as np
import re
import os
import io
import ast
import requests
import random
import time
from PIL import Image

Fetch datasets


In [2]:
# NL
nl_text_processed = pd.read_parquet('datasets/NL_articles_text_processed.parquet')
nl_classified_img = pd.read_parquet('../d__nos-nu-news-images-classification/datasets/NL_corrected_classifications_FN512_grouped_by_id.parquet')
nl_classified_img = nl_classified_img.rename(columns={'true_label': 'politician_in_img'})
nl_classified_img['politician_in_img'] = nl_classified_img['politician_in_img'].tolist()
nl_classified_img['politician_in_img'] = nl_classified_img['politician_in_img'].apply(lambda x: tuple(sorted(x)))

# UK
uk_text_processed = pd.read_parquet('datasets/UK_articles_text_processed.parquet')
uk_classified_img = pd.read_parquet('../d__nos-nu-news-images-classification/datasets/UK_corrected_classifications_FN512_grouped_by_id.parquet')
uk_classified_img = uk_classified_img.rename(columns={'true_label': 'politician_in_img'})
uk_classified_img['politician_in_img'] = uk_classified_img['politician_in_img'].tolist()
uk_classified_img['politician_in_img'] = uk_classified_img['politician_in_img'].apply(lambda x: tuple(sorted(x)))

In [3]:
print(f'NL data summary:')
print(f'The data with processed text from the news articles is of shape {nl_text_processed.shape[0]}.')
print(f'The data with classified and corrected labels of politicians in news images is of shape {nl_classified_img.shape[0]}\nand has columns {nl_classified_img.columns.to_list()}.')
print('* * * * * ')
print(f'UK data summary:')
print(f'The data with processed text from the news articles is of shape {uk_text_processed.shape[0]}.')
print(f'The data with classified and corrected labels of politicians in news images is of shape {uk_classified_img.shape[0]}\nand has columns {uk_classified_img.columns.to_list()}.')

NL data summary:
The data with processed text from the news articles is of shape 11915.
The data with classified and corrected labels of politicians in news images is of shape 300
and has columns ['id_unique', 'id', 'outlet', 'country', 'politician_in_img'].
* * * * * 
UK data summary:
The data with processed text from the news articles is of shape 82777.
The data with classified and corrected labels of politicians in news images is of shape 1146
and has columns ['id_unique', 'id', 'outlet', 'country', 'politician_in_img'].


In [4]:
nl_classified_img['politician_in_img'].dtype

dtype('O')

Helper functions

In [5]:
def download_img_from_url(folder, dataframe, id_column, clf_column, url_column):
    '''
    Function that takes the url and downloads the image with the classification as filename.
    :folder: folder to which image must be written
    :dataframe: dataframe in which the image urls are stored
    :id_column: column that holds the unique identifer of the item, to be added to the filename
    :clf_column: column that holds the lists with classifications, to be added to the filename for easy checking
    :url_column: column that holds the url information, which need to be opened and scraped
    '''
    if not os.path.exists(folder):
        os.makedirs(folder)
        
    for i, row in dataframe.iterrows():
        time.sleep(random.uniform(0.5, 1))
        url = row[url_column]
        filename = f'{row[id_column]}__{row[clf_column]}.jpg'
        file_path = os.path.join(folder, filename) 
    
        try:
            image_content = requests.get(url, timeout=60).content
            image_file = io.BytesIO(image_content)
            image = Image.open(image_file)

            with open(file_path, "wb") as f:
                image.save(f, "JPEG")

        except Exception as e:
            print(f"Error downloading image from {url}: {str(e)}")

Merge classifications with text_processed df

In [6]:
nl_articles_img = pd.merge(nl_text_processed, nl_classified_img, on=['id_unique', 'id', 'outlet', 'country'], how='left')
nl_articles_img.sort_values(by=['country', 'outlet', 'datetime'], ascending=[True, True, True], inplace=True)

uk_articles_img = pd.merge(uk_text_processed, uk_classified_img, on=['id_unique', 'id', 'outlet', 'country'], how='left')
uk_articles_img.sort_values(by=['country', 'outlet', 'datetime'], ascending=[True, True, True], inplace=True)

In [7]:
print(nl_articles_img.columns.to_list())

['country', 'outlet', 'id', 'url', 'images', 'datetime', 'category', 'title', 'paragraphs', 'alt_txt', 'id_unique', 'PVV__title', 'GL-PvdA__title', 'VVD__title', 'NSC__title', 'D66__title', 'BBB__title', 'CDA__title', 'SP__title', 'FVD__title', 'PvdD__title', 'CU__title', 'SGP__title', 'DENK__title', 'Volt__title', 'JA21__title', 'Bij1__title', 'BvNL__title', 'Positions__title', 'Politics__title', 'Issues__title', 'National__title', 'International__title', 'PVV__paragraphs', 'GL-PvdA__paragraphs', 'VVD__paragraphs', 'NSC__paragraphs', 'D66__paragraphs', 'BBB__paragraphs', 'CDA__paragraphs', 'SP__paragraphs', 'FVD__paragraphs', 'PvdD__paragraphs', 'CU__paragraphs', 'SGP__paragraphs', 'DENK__paragraphs', 'Volt__paragraphs', 'JA21__paragraphs', 'Bij1__paragraphs', 'BvNL__paragraphs', 'Positions__paragraphs', 'Politics__paragraphs', 'Issues__paragraphs', 'National__paragraphs', 'International__paragraphs', 'PVV__alt_txt', 'GL-PvdA__alt_txt', 'VVD__alt_txt', 'NSC__alt_txt', 'D66__alt_txt', 

Two step validation <br>
1. Exhaustiveness: do the labels cover all the politicians in the classified image?

In [8]:
# NL
nl_pol_only = nl_articles_img[~nl_articles_img['politician_in_img'].isna()].copy()
nl_clf_download_folder = 'datasets/image_validation/NL/classified_img_to_check__exhaustive'

#download_img_from_url(nl_clf_download_folder, nl_pol_only, 'id_unique', 'politician_in_img', 'images')

In [9]:
# UK
uk_pol_only = uk_articles_img[~uk_articles_img['politician_in_img'].isna()].copy()
uk_clf_download_folder = 'datasets/image_validation/UK/classified_img_to_check__exhaustive'

#download_img_from_url(uk_clf_download_folder, uk_pol_only, 'id_unique', 'politician_in_img', 'images')

2. Inclusiveness: were politicians missed for the articles with high likelihood of showing a politician in the image?

In [10]:
nl_columns_with_title_or_alt_txt = [
    col for col in nl_articles_img.columns
    if ('_title' in col or '_alt_txt' in col) and
       not ('Positions' in col or 'Politics' in col or 'Issues' in col or 'National' in col or 'International' in col)
]

uk_columns_with_title_or_alt_txt = [
    col for col in uk_articles_img.columns
    if ('_title' in col or '_alt_txt' in col) and
       not ('Positions' in col or 'Politics' in col or 'Issues' in col or 'National' in col or 'International' in col)
]

def is_non_empty_sequence(x):
    if isinstance(x, (list, np.ndarray)):
        return len(x) > 0
    return False

In [11]:
# NL
nl_no_clf_in_img = nl_articles_img[nl_articles_img['politician_in_img'].isna()].copy()
nl_no_clf_in_img = nl_no_clf_in_img[~nl_no_clf_in_img['images'].isna()]

nl_rows_with_non_empty_lists = (
    nl_no_clf_in_img[nl_columns_with_title_or_alt_txt]
    .map(is_non_empty_sequence)
    .any(axis=1)
)

nl_no_clf_to_check = nl_no_clf_in_img[nl_rows_with_non_empty_lists].copy()
print(f'There are {nl_no_clf_to_check.shape[0]} articles where a politician or party is mentioned in the title or image alt_txt,\nbut for which no politician was detected.\nThese images need an additional check to see whether there is really no politician in the image.')

There are 337 articles where a politician or party is mentioned in the title or image alt_txt,
but for which no politician was detected.
These images need an additional check to see whether there is really no politician in the image.


In [12]:
nl_no_clf_download_folder = 'datasets/image_validation/NL/no_clf_img_to_check__inclusive'

#download_img_from_url(nl_no_clf_download_folder, nl_no_clf_to_check, 'id_unique', 'politician_in_img', 'images')
print('* * * * * ')
print(f'Succesfully downloaded the news images that need a final check to {nl_no_clf_download_folder}.')

* * * * * 
Succesfully downloaded the news images that need a final check to datasets/image_validation/NL/no_clf_img_to_check__inclusive.


In [13]:
# UK
uk_no_clf_in_img = uk_articles_img[uk_articles_img['politician_in_img'].isna()].copy()
uk_no_clf_in_img = uk_no_clf_in_img[
    (~uk_no_clf_in_img['images'].isna()) &  
    (uk_no_clf_in_img['images'] != 'NA')    
]
uk_rows_with_non_empty_lists = (
    uk_no_clf_in_img[uk_columns_with_title_or_alt_txt]
    .map(is_non_empty_sequence)
    .any(axis=1)
)

uk_no_clf_to_check = uk_no_clf_in_img[uk_rows_with_non_empty_lists].copy()
print(f'There are {uk_no_clf_to_check.shape[0]} articles where a politician or party is mentioned in the title or image alt_txt,\nbut for which no politician was detected.\nThese images need an additional check to see whether there is really no politician in the image.')

There are 3875 articles where a politician or party is mentioned in the title or image alt_txt,
but for which no politician was detected.
These images need an additional check to see whether there is really no politician in the image.


In [14]:
uk_no_clf_download_folder = 'datasets/image_validation/UK/no_clf_img_to_check__inclusive'

#download_img_from_url(uk_no_clf_download_folder, uk_no_clf_to_check, 'id_unique', 'politician_in_img', 'images')
print('* * * * * ')
print(f'Succesfully downloaded the news images that need a final check to {uk_no_clf_download_folder}.')

* * * * * 
Succesfully downloaded the news images that need a final check to datasets/image_validation/UK/no_clf_img_to_check__inclusive.


Manual validation

In [15]:
# NL
nl_labels_to_add_exhaustive = [
    {
        'id_unique': 'id00874',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id01137',
        'politician_to_add': ['Caroline van der Plas', 'Pieter Omtzigt']
    },
    {
        'id_unique': 'id01948',
        'politician_to_add': ['Caroline van der Plas', 'Pieter Omtzigt']
    },
    {
        'id_unique': 'id02038',
        'politician_to_add': ['Frans Timmermans', 'Rob Jetten', 'Geert Wilders']
    },
    {
        'id_unique': 'id02055',
        'politician_to_add': ['Dilan Yesilgoz']
    },
    {
        'id_unique': 'id02101',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id02147',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id02197',
        'politician_to_add': ['Frans Timmermans']
    },
    {
        'id_unique': 'id02768',
        'politician_to_add': ['Wybren van Haga', 'Caroline van der Plas']
    },
    {
        'id_unique': 'id02876',
        'politician_to_add': ['Pieter Omtzigt']
    },
    {
        'id_unique': 'id03729',
        'politician_to_add': ['Geert Wilders']
    },
    {
        'id_unique': 'id11403',
        'politician_to_add': ['Dilan Yesilgoz']
    },
    {
        'id_unique': 'id10312',
        'politician_to_add': ['Caroline van der Plas', 'Dilan Yesilgoz']
    },
    {
        'id_unique': 'id06930',
        'politician_to_add': ['Laurens Dassen', 'Joost Eerdmans']
    },
    {
        'id_unique': 'id10312',
        'politician_to_add': ['Caroline van der Plas', 'Dilan Yesilgoz']
    },
    {
        'id_unique': 'id06914',
        'politician_to_add': ['Pieter Omtzigt']
    }
]

nl_labels_to_add_inclusive = [
    {
        'id_unique': 'id02369',
        'politician_to_add': ['Caroline van der Plas']
    },
    {
        'id_unique': 'id05736',
        'politician_to_add': ['Thierry Baudet']
    },
    {
        'id_unique': 'id06299',
        'politician_to_add': ['Laurens Dassen']
    },
    {
        'id_unique': 'id06461',
        'politician_to_add': ['Joost Eerdmans']
    },
    {
        'id_unique': 'id06602',
        'politician_to_add': ['Dilan Yesilgoz']
    },
    {
        'id_unique': 'id10789',
        'politician_to_add': ['Pieter Omtzigt']
    },
    {
        'id_unique': 'id11364',
        'politician_to_add': ['Thierry Baudet']
    },
    {
        'id_unique': 'id11364',
        'politician_to_add': ['Thierry Baudet']
    }
]

In [16]:
nl_exhaustive_labels_to_add = [{'validation': 'validation1_exhaustive', **item} for item in nl_labels_to_add_exhaustive]
nl_inclusive_labels_to_add = [{'validation': 'validation2_inclusive', **item} for item in nl_labels_to_add_inclusive]

nl_validated = nl_exhaustive_labels_to_add + nl_inclusive_labels_to_add
nl_validation_labels_to_add = pd.DataFrame(nl_validated)
nl_validation_labels_to_add['politician_to_add'] = nl_validation_labels_to_add['politician_to_add'].apply(lambda x: [name.replace(' ', '_') for name in x])

In [17]:
print(f'For the validation of NL, {nl_validation_labels_to_add.shape[0]} labels were found that need to be added to the final dataframe.')
print(f'The first validation step, concerning exhaustiveness: {len(nl_exhaustive_labels_to_add)}.')
print(f'The second validation step, concerning inclusiveness: {len(nl_inclusive_labels_to_add)}.')
print(f'The resulting df is of shape {nl_validation_labels_to_add.shape[0]} and has columns: {nl_validation_labels_to_add.columns.to_list()}.')

For the validation of NL, 24 labels were found that need to be added to the final dataframe.
The first validation step, concerning exhaustiveness: 16.
The second validation step, concerning inclusiveness: 8.
The resulting df is of shape 24 and has columns: ['validation', 'id_unique', 'politician_to_add'].


In [18]:
nl_validation_labels_to_add.to_parquet('datasets/image_validation/NL/NL_validation_incl_exh_labels_to_add.parquet')

In [19]:
# UK
uk_labels_to_add_exhaustive = [
    {
        'id_unique': 'id19858',
        'politician_to_add': ['Patrick Harvie']
    },
    {
        'id_unique': 'id39937',
        'politician_to_add': ['Colum Eastwood', 'Gavin Robinson']
    },
    {
        'id_unique': 'id42135',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id42340',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id43176',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id74779',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id75904',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76384',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id76521',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76597',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76720',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77372',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id77834',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79075',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79757',
        'politician_to_add': ['Adrian Ramsay']
    },
    {
        'id_unique': 'id80655',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id81180',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id81352',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83362',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id88083',
        'politician_to_add': ['Keir Starmer']
    }
]

# UK
uk_labels_to_add_inclusive = [
    {
        'id_unique': 'id13023',
        'politician_to_add': ['Gavin Robinson']
    },
    {
        'id_unique': 'id17122',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id17389',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id20538',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id23559',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id26189',
        'politician_to_add': ['Mary Lou McDonald']
    },
    {
        'id_unique': 'id28862',
        'politician_to_add': ['Kei Starmer']
    },
    {
        'id_unique': 'id29836',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id29875',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id33075',
        'politician_to_add': ['John Swinney']
    },
    {
        'id_unique': 'id33799',
        'politician_to_add': ['Keir Starmer', 'Rishi Sunak']
    },
    {
        'id_unique': 'id36487',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id36610',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id39128',
        'politician_to_add': ['Naomi Long']
    },
    {
        'id_unique': 'id44211',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id44546',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id45134',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id45899',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id46507',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id59144',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id63477',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id63983',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id64186',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id67241',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id70683',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id71254',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id71618',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id72059',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id72097',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id72481',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id73714',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id74357',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id74889',
        'politician_to_add': ['Nigel Farage']
    },
    {
        'id_unique': 'id74957',
        'politician_to_add': ['Mary Lou McDonald']
    },
    {
        'id_unique': 'id75081',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id75225',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id75606',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id75827',
        'politician_to_add': ['John Swinney']
    },
    {
        'id_unique': 'id76039',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id76216',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76346',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id76719',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id77046',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77139',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id77191',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77550',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id77549',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id77953',
        'politician_to_add': ['Adrian Ramsay']
    },
    {
        'id_unique': 'id78786',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79025',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79264',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79699',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id79824',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79847',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id79949',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80176',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id80478',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80711',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80762',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id80855',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id81118',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id816277',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id82379',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83271',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83354',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id83535',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id83536',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id83687',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id84581',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id84646',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id85744',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id86038',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id87084',
        'politician_to_add': ['George Galloway']
    },
    {
        'id_unique': 'id87345',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id87751',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id88002',
        'politician_to_add': ['Ed Davey']
    },
    {
        'id_unique': 'id88454',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id89545',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id89560',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id89595',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id89923',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id90052',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id90271',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id91370',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id91461',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id91528',
        'politician_to_add': ['Keir Starmer']
    },
    {
        'id_unique': 'id92083',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id92088',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id92712',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id93586',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id93812',
        'politician_to_add': ['Rishi Sunak']
    },
    {
        'id_unique': 'id94040',
        'politician_to_add': ['Carla Denyer', 'Adrian Ramsay']
    }
]

In [20]:
uk_exhaustive_labels_to_add = [{'validation': 'validation1_exhaustive', **item} for item in uk_labels_to_add_exhaustive]
uk_inclusive_labels_to_add = [{'validation': 'validation2_inclusive', **item} for item in uk_labels_to_add_inclusive]

uk_validated = uk_exhaustive_labels_to_add + uk_inclusive_labels_to_add
uk_validation_labels_to_add = pd.DataFrame(uk_validated)
uk_validation_labels_to_add['politician_to_add'] = uk_validation_labels_to_add['politician_to_add'].apply(lambda x: [name.replace(' ', '_') for name in x])

In [21]:
print(f'For the validation of UK, {uk_validation_labels_to_add.shape[0]} labels were found that need to be added to the final dataframe.')
print(f'The first validation step, concerning exhaustiveness: {len(uk_exhaustive_labels_to_add)}.')
print(f'The second validation step, concerning inclusiveness: {len(uk_inclusive_labels_to_add)}.')
print(f'The resulting df is of shape {uk_validation_labels_to_add.shape[0]} and has columns: {uk_validation_labels_to_add.columns.to_list()}.')

For the validation of UK, 112 labels were found that need to be added to the final dataframe.
The first validation step, concerning exhaustiveness: 20.
The second validation step, concerning inclusiveness: 92.
The resulting df is of shape 112 and has columns: ['validation', 'id_unique', 'politician_to_add'].


In [22]:
uk_validation_labels_to_add.to_parquet('datasets/image_validation/UK/UK_validation_incl_exh_labels_to_add.parquet')

Finalize data for both countries

In [23]:
nl_validation_labels_to_add['politician_to_add'] = nl_validation_labels_to_add['politician_to_add'].apply(lambda x: tuple(sorted(x)))

In [24]:
nl_final_df = nl_articles_img.merge(nl_validation_labels_to_add, on='id_unique', how='left')
nl_final_df['politicians_depicted'] = nl_final_df['politician_in_img'] + nl_final_df['politician_to_add']

In [25]:
nl_final_df[['politicians_depicted', 'politician_in_img', 'politician_to_add']].value_counts()

politicians_depicted                                                                                                      politician_in_img                                                                 politician_to_add                            
(Dilan_Yesilgoz, Geert_Wilders, Caroline_van_der_Plas, Pieter_Omtzigt)                                                    (Dilan_Yesilgoz, Geert_Wilders)                                                   (Caroline_van_der_Plas, Pieter_Omtzigt)          2
(Pieter_Omtzigt, Caroline_van_der_Plas, Dilan_Yesilgoz)                                                                   (Pieter_Omtzigt,)                                                                 (Caroline_van_der_Plas, Dilan_Yesilgoz)          2
(Pieter_Omtzigt, Geert_Wilders)                                                                                           (Pieter_Omtzigt,)                                                                 (Geert_Wilders,)                    

In [26]:
nl_final_df.to_parquet('datasets/NL_final_df_multi_modal.parquet')

In [27]:
uk_validation_labels_to_add['politician_to_add'] = uk_validation_labels_to_add['politician_to_add'].apply(lambda x: tuple(sorted(x)))
uk_final_df = uk_articles_img.merge(uk_validation_labels_to_add, on='id_unique', how='left')
uk_final_df['politicians_depicted'] = uk_final_df['politician_in_img'] + uk_final_df['politician_to_add']

In [None]:
uk_final_df[['politicians_depicted', 'politician_in_img', 'politician_to_add']].value_counts()

politicians_depicted                                                       politician_in_img                                            politician_to_add               
(Keir_Starmer, Rishi_Sunak)                                                (Keir_Starmer,)                                              (Rishi_Sunak,)                      5
(Rishi_Sunak, Keir_Starmer)                                                (Rishi_Sunak,)                                               (Keir_Starmer,)                     5
(Carla_Denyer, Adrian_Ramsay)                                              (Carla_Denyer,)                                              (Adrian_Ramsay,)                    1
(Carla_Denyer, Keir_Starmer, Rhun_ap_Iorwerth, Rishi_Sunak, Nigel_Farage)  (Carla_Denyer, Keir_Starmer, Rhun_ap_Iorwerth, Rishi_Sunak)  (Nigel_Farage,)                     1
(Ed_Davey, John_Swinney, Rishi_Sunak, Keir_Starmer)                        (Ed_Davey, John_Swinney, Rishi_Sunak)                       

In [29]:
uk_final_df.to_parquet('datasets/UK_final_df_multi_modal.parquet')

Get report for both countries