In [2]:
from methods_preprocessing import *

Fetch dataframes with NOS article data, NU article data, and corrected image classification data

In [3]:
# Load the three input tables in one go: NOS articles, NU articles, and the
# corrected image classification data.
nos_articles, nu_articles, id_pol_clf = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'datasets/input/nos_articles_election_period_no_sports.parquet',
        'datasets/input/nu_articles_election_period.parquet',
        'datasets/input/nos_nu_id_politician_grouped-by-article-id.parquet',
    )
)

In [4]:
# Report shape and column names of each input table.
# The two article tables share one message layout; the classification table
# prints its columns on the same line, so it keeps its own statement.
for label, frame in (('NOS', nos_articles), ('NU', nu_articles)):
    print(f'{label} data is of shape {frame.shape} and contains the columns: \n{frame.columns.tolist()}\n\n')
print(f'Classification data is of shape {id_pol_clf.shape} and contains the columns: {id_pol_clf.columns.tolist()}\n')

NOS data is of shape (4707, 12) and contains the columns: 
['id', 'title', 'datetime', 'owner', 'type', 'url', 'date', 'time', 'category', 'images', 'alt_txt', 'paragraphs']


NU data is of shape (5350, 14) and contains the columns: 
['id', 'title', 'datetime', 'url', 'date', 'category', 'tags', 'main_category', 'img_link', 'paragraphs', 'body_paragraphs', 'first_paragraph', 'comments_count', 'media_dict']


Classification data is of shape (273, 3) and contains the columns: ['id', 'dataset', 'true_label']



In [5]:
# Show the column dtypes of every input table, one table after the other.
for label, frame in (('NOS', nos_articles), ('NU', nu_articles), ('CLF', id_pol_clf)):
    print(f'{label} dataframe datatypes: \n{frame.dtypes}\n')

NOS dataframe datatypes: 
id                          int64
title                      object
datetime      datetime64[ns, UTC]
owner                      object
type                       object
url                        object
date                       object
time                       object
category                   object
images                     object
alt_txt                    object
paragraphs                 object
dtype: object

NU dataframe datatypes: 
id                               int64
title                           object
datetime           datetime64[ns, UTC]
url                             object
date                            object
category                        object
tags                            object
main_category                   object
img_link                        object
paragraphs                      object
body_paragraphs                 object
first_paragraph                 object
comments_count                   int64
media_dict         

### Prepare and merge df's for preprocessing

In [6]:
# Bring the NOS and NU tables onto one shared set of columns so they can be
# stacked, and prepare the classification table for the merge on (id, dataset).

# NOS: tag the source and add the columns that only the NU table provides.
nos_articles['dataset'] = 'NOS'
# One fresh empty list per row (a single shared list object would be aliased across rows).
nos_articles['tags'] = [[] for _ in range(len(nos_articles))]
nos_articles['comments_count'] = 0
nos_articles = nos_articles.rename(columns={'images': 'img_link'})

# NU: tag the source and add the column that only the NOS table provides.
nu_articles['dataset'] = 'NU'
nu_articles['alt_txt'] = ''
# Wrap each image link in a list. Values that already are lists are left as they
# are, so re-running this cell does not nest them one level deeper each time.
# NOTE(review): presumably done to match the NOS `img_link` format - confirm.
nu_articles['img_link'] = nu_articles['img_link'].apply(
    lambda x: x if isinstance(x, list) else [x]
)

# CLF: integer ids so the merge keys line up with the article tables, and a
# more descriptive name for the label column.
id_pol_clf['id'] = id_pol_clf['id'].astype(int)
id_pol_clf = id_pol_clf.rename(columns={'true_label': 'politician_in_img'})

In [7]:
# Keep the same columns, in the same order, from both outlets so the frames can be stacked.
shared_columns = [
    'dataset', 'id', 'url', 'img_link', 'alt_txt', 'datetime', 'date',
    'category', 'tags', 'comments_count', 'title', 'paragraphs',
]
nos_df = nos_articles[shared_columns]
nu_df = nu_articles[shared_columns]

In [8]:
# Stack both outlets into one frame with a fresh 0..n-1 index.
nos_nu_df = pd.concat((nos_df, nu_df), ignore_index=True)
# Attach the image labels per (id, dataset); the left join keeps every article,
# leaving the label empty where no classification row matches.
nos_nu_clf = nos_nu_df.merge(id_pol_clf, how='left', on=['id', 'dataset'])

In [9]:
# Order rows by outlet first, then by article id (both ascending, the default).
nos_nu_clf = nos_nu_clf.sort_values(['dataset', 'id'])

## Pre-processing

In [10]:
# Number the sorted rows 0..n-1 and use that running number as the index, named 'idx'.
nos_nu_clf = (
    nos_nu_clf
    .assign(idx=range(len(nos_nu_clf)))
    .set_index('idx')
)

In [11]:
from wordlists_preprocessing import political_words
from methods_preprocessing import list_occurrences_triple_column

# Text columns that are searched for the terms in `political_words`.
columns_of_interest = ['title', 'paragraphs', 'alt_txt']
# NOTE(review): later cells read columns such as 'DENK_title' or 'SP_paragraphs'
# from this result, so the helper presumably adds one '<group>_<text column>'
# column per word group - confirm in methods_preprocessing.
processed_articles = list_occurrences_triple_column(nos_nu_clf, columns_of_interest, political_words)

### Customized dataset correction for Dutch media dataset-specific analysis purposes
1. Remove mistaken political terms <br>
        - Remove 'dijk' (lowercase) if occurring in ['SP_title', 'SP_paragraphs', 'SP_alt_txt'] / Politician: Jimmy Dijk <br>
        - Remove 'denk' (lowercase) if occurring in ['DENK_title', 'DENK_paragraphs', 'DENK_alt_txt'] / Political party: DENK <br>
        - Remove 'volt' (lowercase) if occurring in ['Volt_title', 'Volt_paragraphs', 'Volt_alt_txt'] / Political party: Volt <br>
        - Remove 'forum' (lowercase) if occurring in ['FVD_title', 'FVD_paragraphs', 'FVD_alt_txt'] / Political party: FVD <br>
2. Create consistency in GL-PvdA references
3. Remove 'first_name last_name' politician multistrings, leave only 'last_name' 

In [12]:
# Work on a copy so the corrections in the following cells do not modify `processed_articles`.
df = processed_articles.copy()

In [13]:
# 1 Remove mistaken political terms (case-sensitive)
from methods_preprocessing import remove_words

political_words_to_remove = {'denk', 'volt', 'dijk', 'forum'}

# Occurrence columns of the parties whose matches can contain the terms above.
ambiguous_term_columns = [
    'DENK_title', 'DENK_paragraphs', 'DENK_alt_txt',
    'Volt_title', 'Volt_paragraphs', 'Volt_alt_txt',
    'SP_title', 'SP_paragraphs', 'SP_alt_txt',
    'FVD_title', 'FVD_paragraphs', 'FVD_alt_txt',
]


def _drop_mistaken_terms(words):
    """Pass one cell value through `remove_words` with the set of mistaken terms."""
    return remove_words(words, political_words_to_remove)


# The full set of terms is applied to every listed column.
for col in ambiguous_term_columns:
    df[col] = df[col].apply(_drop_mistaken_terms)

In [14]:
# 2 Create consistency in GL-PvdA references
from methods_preprocessing import conditional_remove_and_map_words
from wordlists_preprocessing import glpvda_search_list, glpvda_mapping


def _harmonise_glpvda(words):
    """Pass one cell value through `conditional_remove_and_map_words` with the GL-PvdA lists."""
    return conditional_remove_and_map_words(words, glpvda_search_list, glpvda_mapping)


glpvda_columns = ['GL-PvdA_title', 'GL-PvdA_paragraphs', 'GL-PvdA_alt_txt']
for col in glpvda_columns:
    df[col] = df[col].apply(_harmonise_glpvda)

In [15]:
# 3 Remove 'first_name lastname' multistrings, leave only last names
from methods_preprocessing import remove_words_from_colnames
from wordlists_preprocessing import political_party_colnames, politician_full_names_to_remove

# The helper returns two frames. `full_df` is the one used by all later cells;
# `full_names_only_df` is not used again in the visible part of the notebook.
# NOTE(review): presumably `full_df` is `df` with the full-name entries removed
# from the party columns - confirm in methods_preprocessing.
full_df, full_names_only_df = remove_words_from_colnames(df, political_party_colnames, politician_full_names_to_remove)

### Export pre-processed dataframe to Excel for last check

In [23]:
# Convert all timezone-aware datetime columns to timezone-naive ones to
# facilitate manual inspection in Excel (pandas refuses to write tz-aware
# datetimes to Excel). 'datetimetz' selects every tz-aware column, whatever its
# zone or resolution, instead of only the exact dtype 'datetime64[ns, UTC]'.
for col in full_df.select_dtypes(include=['datetimetz']).columns:
    # tz_localize(None) drops the timezone and keeps the wall-clock values.
    full_df[col] = full_df[col].dt.tz_localize(None)

# Now write the DataFrame to Excel - not yet corrected!
#full_df.to_excel('datasets/nos_nu_clf_preprocessed.xlsx')

### News images that need an additional check:
1. News images that were assigned politician classification(s) must be checked on whether the classifications are inclusive and exhaustive, i.e. all (!) politicians in the image are in the classification list
2. For all news images where *no* politicians were detected and matched in the image, but the political party and/or party leader appears in the title or alt_txt, an additional check is needed. There is a plausible chance that a politician was missed by the face detectors, but still appears in the news image

In [24]:
# Turn each image-link value into plain text: take its string form, trim the
# surrounding square brackets, then drop every single-quote character.
full_df['img_link'] = (
    full_df['img_link']
    .astype(str)
    .str.strip("[]")
    .str.replace("'", "")
)

In [25]:
# 1 Subset the articles with politicians in the assigned classifications
has_politician_label = full_df['politician_in_img'].notna()
clf_pol_only = full_df[has_politician_label].copy()
# The image download below is disabled: it is kept as a string literal, so it
# is not executed and the cell merely echoes the text.
'''
clf_download_folder = 'datasets/images/clf_images_to_check'
download_img_from_url(clf_download_folder, clf_pol_only, 'id', 'politician_in_img', 'img_link')'''

"\nclf_download_folder = 'datasets/images/clf_images_to_check'\ndownload_img_from_url(clf_download_folder, clf_pol_only, 'id', 'politician_in_img', 'img_link')"

In [26]:
# 2 Get articles with no classified politicians in image, but appearing in title or alt_txt

In [35]:
# Occurrence columns derived from the title or the alt-text, leaving out the
# topic groups whose column names contain one of the markers below.
# NOTE(review): the names are taken from `df`, while the list is later applied
# to a subset of `full_df` - confirm both frames carry the same columns.
wanted_suffixes = ('_title', '_alt_txt')
excluded_group_markers = ('Positions', 'Politics', 'Issues', 'Countries')
columns_with_title_or_alt_txt = [
    col for col in df.columns
    if any(suffix in col for suffix in wanted_suffixes)
    and not any(marker in col for marker in excluded_group_markers)
]

In [36]:
from methods_preprocessing import is_non_empty_list

# Start from the articles whose main image has no politician label.
no_clf = full_df.loc[full_df['politician_in_img'].isna()]

# Apply `is_non_empty_list` to every title/alt-text occurrence cell, then flag
# the rows where at least one of those cells passes the check.
non_empty_lists_condition = no_clf.loc[:, columns_with_title_or_alt_txt].map(is_non_empty_list)
rows_with_non_empty_lists = non_empty_lists_condition.any(axis='columns')

# Keep only the flagged rows.
no_clf = no_clf.loc[rows_with_non_empty_lists]

In [38]:
# Report how many unlabelled main images still need a manual look.
print(f'{len(no_clf)} main images that do not (yet) have a politician labelled, but do contain a party leader or party reference in the title or alt-text, need an extra check.')

313 main images that do not (yet) have a politician labelled, but do contain a party leader or party reference in the title or alt-text, need an extra check.


In [39]:
# Disabled download step: the code is kept as a string literal, so it is not
# executed and the cell only echoes the text. Remove the triple quotes to
# download the images of the `no_clf` rows for manual checking.
# NOTE(review): `download_img_from_url` is not imported explicitly in the
# visible cells - presumably it comes from the star import of
# methods_preprocessing; confirm before enabling.
'''
no_clf_download_folder = 'datasets/images/no_clf_images_to_check'
download_img_from_url(no_clf_download_folder, no_clf, 'id', 'politician_in_img', 'img_link')'''

"\nno_clf_download_folder = 'datasets/images/no_clf_images_to_check'\ndownload_img_from_url(no_clf_download_folder, no_clf, 'id', 'politician_in_img', 'img_link')"

### Now follows a last check by scrolling through the /images/ folders to see whether
1. For all images in the *clf_images_to_check* folder, the assigned classifications are inclusive and exhaustive
2. In the *no_clf_images_to_check* folder, there are images that contain one or more politicians whose faces were not detected and who therefore did not appear in the classification set