#### Necessary imports

In [1]:
from methods_data_collection import *

In [2]:
# Register tqdm's pandas integration (adds `progress_apply` and friends to
# pandas objects so row-wise operations can show a progress bar).
# NOTE(review): `tqdm` is never imported explicitly in this notebook; it
# presumably arrives through the star import in the first cell — confirm.
tqdm.pandas()

# Visual data collection
This notebook contains the data collection for the following datasets:
1. Train/test data required for building a facial recognition model for politicians
2. News images data: <br>
   a. NOS.nl <br>
   b. NU.nl <br>

## 1. Train/test data required for building a facial recognition system
##### Set up data infrastructure
Load the image link references obtained in script #2 of this directory and create a test set.

In [3]:
# Read the collected training set image links (one spreadsheet per search engine)
bing_img = pd.read_excel('datasets/bing_img_links.xlsx')
ddg_img = pd.read_excel('datasets/ddg_img_links.xlsx')
ecosia_img = pd.read_excel('datasets/ecosia_img_links.xlsx')

# Ensure that the 'restricted access' error arising with some ecosia links is prevented.
# The vectorised, literal (regex=False) replacement leaves a missing link as NaN
# instead of raising AttributeError, which the per-row `x.replace(...)` did as
# soon as one cell was empty.
ecosia_img['img_link'] = ecosia_img['img_link'].str.replace('explicit', 'mm', regex=False)

# Concatenate all training set image links into one frame.
# The row labels of the individual sheets are kept as-is, so they repeat
# across the three sources.
img_links_bing_ddg = pd.concat([bing_img, ddg_img])
img_links = pd.concat([img_links_bing_ddg, ecosia_img])

# Keep only the two columns that are used further on
img_links = img_links[['politician', 'img_link']]

In [4]:
# Persist the combined link table. With pandas' default `index=True` the row
# labels are written as an extra first column of the sheet.
img_links.to_excel('datasets/all_train_img_links.xlsx')

In [5]:
# Quick sanity check: dimensions first, then the column labels
print(img_links.shape, img_links.columns, sep='\n')

(6375, 2)
Index(['politician', 'img_link'], dtype='object')


In [6]:
# Draw 50 rows for a quick trial run of the face-isolation step; the fixed
# `random_state` makes the draw repeatable.
test_set = img_links.sample(n = 50, random_state = 0)

In [7]:
# Store the sampled rows for later reference (row labels are written too,
# which is pandas' default for `to_excel`).
test_set.to_excel('datasets/test_set_img_links.xlsx')

### Open images online, apply face detection and store isolated faces to file

#### Test if methods work as expected, including print statements

In [8]:
# NOTE(review): explicit import on top of the star import in the first cell —
# the name is presumably already in scope; confirm and import it once at the top.
from methods_data_collection import detect_cut_save_faces_test

# Create the output directory
output_dir_traintest_test = 'datasets/images/isolated_train_faces_scraped_test'
os.makedirs(output_dir_traintest_test, exist_ok=True)

# Trial run on the 50-row sample. Arguments as passed: the frame, the column
# holding the image URL, the column holding the label (politician name) and
# the output directory. The saved output shows that, per image, the helper
# prints the HAAR / FR / COMBINED / UNIQUE face counts and boxes.
# Presumably it also writes the cropped faces into the output directory —
# verify in methods_data_collection.
detect_cut_save_faces_test(test_set, 'img_link', 'politician', output_dir_traintest_test)

Test scraping at...:   0%|          | 0/50 [00:00<?, ?it/s]

Laurens Dassen with image url: https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse3.mm.bing.net%2Fth%3Fid%3DOIP.0OMU0vd2yrf-r_zxLsQ3SQHaFB%26pid%3DApi&f=1&ipt=2011936aaf234ece1724886e8828f5e80ebb8b7fc9acd9664579d44a7a1b7c5a&ipo=images
#HAAR: 1 at: [[250  70 126 126]]
#FR: 1 at: [(259, 92, 107, 107)]
#COMBINED: 2
#UNIQUE: 1 at: [array([250,  70, 126, 126], dtype=int32)]

Lilian Marijnissen with image url: https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse1.mm.bing.net%2Fth%3Fid%3DOIP.Yo4LgZO6FffHRksLETWEGQHaE7%26pid%3DApi&f=1&ipt=246a912344e1bdb873b06ee2e3e054b88ff0e33cdc42feb9ec4b58309d456795&ipo=images
#HAAR: 1 at: [[257  67 119 119]]
#FR: 1 at: [(259, 80, 107, 107)]
#COMBINED: 2
#UNIQUE: 1 at: [array([257,  67, 119, 119], dtype=int32)]

Pieter Omtzigt with image url: https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse4.mm.bing.net%2Fth%3Fid%3DOIP.J4r3CPGdywulbxlqwYlekAHaE7%26pid%3DApi&f=1&ipt=29275d7c034d8dc47acfa0c82f686a304f5c484316b2fa9b3985b84

### Continue to full dataset after inspection

In [9]:
# NOTE(review): explicit import on top of the star import in the first cell —
# the name is presumably already in scope; confirm and import it once at the top.
from methods_data_collection import detect_cut_save_faces

# Create the output directory
output_dir_traintest = 'datasets/images/isolated_train_faces_scraped_uncorrected'
os.makedirs(output_dir_traintest, exist_ok=True)

# Full run over every collected training link (6375 rows according to the
# progress bar in the saved output). Same argument order as the trial run:
# frame, URL column, label column, output directory.
# Presumably the non-verbose counterpart of `detect_cut_save_faces_test` —
# verify in methods_data_collection.
detect_cut_save_faces(img_links, 'img_link', 'politician', output_dir_traintest)

Scraping at...:   0%|          | 0/6375 [00:00<?, ?it/s]

## 2. News images data
### a. NOS.nl

##### Set up data infrastructure
Load the articles obtained in script #1 of this directory (excluding those that are exclusively about sports) and create a test set.

In [3]:
# Load the NOS articles and derive a plain `img_link` string per article:
# the `images` value is cast to text, after which any leading/trailing
# `[`, `]` and `'` characters are stripped.
# NOTE(review): an entry holding several URLs would keep its inner quotes and
# separators — confirm that each article carries at most one image.
nos_articles = (
    pd.read_parquet('datasets/nos_articles_election_period_no_sports.parquet')
    .assign(img_link=lambda frame: frame['images'].astype(str).str.strip("['']"))
)

In [4]:
# Quick sanity check: dimensions first, then the column labels
print(nos_articles.shape, nos_articles.columns, sep='\n')

(4707, 13)
Index(['id', 'title', 'datetime', 'owner', 'type', 'url', 'date', 'time',
       'category', 'images', 'alt_txt', 'paragraphs', 'img_link'],
      dtype='object')


In [6]:
# Draw 50 NOS articles for a quick trial run; the fixed `random_state`
# makes the draw repeatable.
nos_articles_test = nos_articles.sample(n = 50, random_state = 0)

#### Test if methods work as expected, including print statements

In [7]:
# NOTE(review): explicit import on top of the star import in the first cell —
# the name is presumably already in scope; confirm and import it once at the top.
from methods_data_collection import detect_cut_save_faces_test

# Create the output directory
output_dir_news_nos_test = 'datasets/images/isolated_news_faces_nos_test'
os.makedirs(output_dir_news_nos_test, exist_ok=True)

# Trial run on the 50 sampled NOS articles. Arguments as passed: the frame,
# the URL column, the label column (here the article `id`) and the output
# directory. The saved output lists, per article id, the HAAR / FR /
# COMBINED / UNIQUE face counts and boxes.
detect_cut_save_faces_test(nos_articles_test, 'img_link', 'id', output_dir_news_nos_test)

Test scraping at...:   0%|          | 0/50 [00:00<?, ?it/s]

2496413 with image url: https://cdn.nos.nl/image/2023/11/03/1022523/1024x576a.jpg
#HAAR: 0 at: ()
#FR: 0 at: []
#COMBINED: 0
#UNIQUE: 0 at: []

2491905 with image url: https://cdn.nos.nl/image/2023/09/26/1011627/1024x576a.jpg
#HAAR: 2 at: [[806 265  53  53]
 [376  81 379 379]]
#FR: 1 at: [(419, 134, 321, 321)]
#COMBINED: 3
#UNIQUE: 2 at: [array([806, 265,  53,  53], dtype=int32), array([376,  81, 379, 379], dtype=int32)]

2493815 with image url: https://cdn.nos.nl/image/2023/10/12/1016208/1024x576a.jpg
#HAAR: 1 at: [[904  54  53  53]]
#FR: 0 at: []
#COMBINED: 1
#UNIQUE: 1 at: [array([904,  54,  53,  53], dtype=int32)]

2496602 with image url: https://cdn.nos.nl/image/2023/11/04/1022975/1024x576a.jpg
#HAAR: 2 at: [[621 191  38  38]
 [868 266  54  54]]
#FR: 3 at: [(366, 98, 185, 186), (621, 194, 36, 36), (871, 271, 43, 43)]
#COMBINED: 5
#UNIQUE: 3 at: [array([621, 191,  38,  38], dtype=int32), array([868, 266,  54,  54], dtype=int32), (366, 98, 185, 186)]

2496801 with image url: https:/

### Continue to full dataset after inspection

In [5]:
# NOTE(review): explicit import on top of the star import in the first cell —
# the name is presumably already in scope; confirm and import it once at the top.
from methods_data_collection import detect_cut_save_faces

# Create the output directory
output_dir_news_nos = 'datasets/images/isolated_news_faces_nos'
os.makedirs(output_dir_news_nos, exist_ok=True)

# Full run over the NOS articles: frame, URL column, label column (article
# `id`), output directory.
# NOTE(review): the saved progress bar of this cell reports 3592 items while
# the shape printed earlier in this section is 4707 rows, and this cell's
# execution count precedes that of the trial-run cell. The saved output seems
# to stem from a different kernel state — restart and run top to bottom to confirm.
detect_cut_save_faces(nos_articles, 'img_link', 'id', output_dir_news_nos)

Scraping at...:   0%|          | 0/3592 [00:00<?, ?it/s]

Image at https://cdn.nos.nl/image/2023/11/30/1030850/1024x576a.jpg does not have 3 channels.
Image at https://cdn.nos.nl/image/2023/11/18/1027161/1024x576a.jpg does not have 3 channels.
Image at https://cdn.nos.nl/image/2023/11/15/1026198/1024x576a.jpg does not have 3 channels.
Image at https://cdn.nos.nl/image/2023/10/09/1015417/1024x576a.jpg does not have 3 channels.
Image at https://cdn.nos.nl/image/2023/10/04/1013835/1024x576a.jpg does not have 3 channels.


### b. NU.nl

##### Set up data infrastructure
Load the articles obtained in script #1 of this directory (excluding those that are exclusively about sports) and create a test set.

In [8]:
# Load the NU.nl articles. Unlike the NOS frame, no `img_link` column is
# derived here: the face-detection cells further down read `img_link`
# straight from this frame.
nu_articles = pd.read_parquet('datasets/nu_articles_election_period.parquet')

In [9]:
# Quick sanity check: dimensions first, then the column labels
print(nu_articles.shape, nu_articles.columns, sep='\n')

(5350, 13)
Index(['id', 'title', 'datetime', 'url', 'date', 'category', 'tags',
       'main_category', 'img_link', 'paragraphs', 'first_paragraph',
       'comments_count', 'media_dict'],
      dtype='object')


In [10]:
# Draw 50 NU.nl articles for a quick trial run; the fixed `random_state`
# makes the draw repeatable.
nu_articles_test = nu_articles.sample(n = 50, random_state = 0)

#### Test if methods work as expected, including print statements

In [11]:
# NOTE(review): explicit import on top of the star import in the first cell —
# the name is presumably already in scope; confirm and import it once at the top.
from methods_data_collection import detect_cut_save_faces_test

# Create the output directory
output_dir_news_nu_test = 'datasets/images/isolated_news_faces_nu_test'
os.makedirs(output_dir_news_nu_test, exist_ok=True)

# Trial run on the 50 sampled NU.nl articles. Arguments as passed: the frame,
# the URL column, the label column (here the article `id`) and the output
# directory. The saved output lists, per article id, the HAAR / FR /
# COMBINED / UNIQUE face counts and boxes.
detect_cut_save_faces_test(nu_articles_test, 'img_link', 'id', output_dir_news_nu_test)

Test scraping at...:   0%|          | 0/50 [00:00<?, ?it/s]

6287268 with image url: https://media.nu.nl/m/44axqdaabi9p_wd854
#HAAR: 1 at: [[380  41 114 114]]
#FR: 1 at: [(394, 56, 90, 90)]
#COMBINED: 2
#UNIQUE: 1 at: [array([380,  41, 114, 114], dtype=int32)]

6290441 with image url: https://media.nu.nl/m/gzgxq6caxxfx_wd854
#HAAR: 2 at: [[279 369  57  57]
 [526 151 168 168]]
#FR: 1 at: [(510, 139, 186, 186)]
#COMBINED: 3
#UNIQUE: 2 at: [array([279, 369,  57,  57], dtype=int32), array([526, 151, 168, 168], dtype=int32)]

6292477 with image url: https://media.nu.nl/m/crvxyuma7ibd_wd854
#HAAR: 0 at: ()
#FR: 0 at: []
#COMBINED: 0
#UNIQUE: 0 at: []

6279734 with image url: https://media.nu.nl/m/omixlhna3zd0_wd854
#HAAR: 1 at: [[271  52 207 207]]
#FR: 1 at: [(283, 77, 186, 186)]
#COMBINED: 2
#UNIQUE: 1 at: [array([271,  52, 207, 207], dtype=int32)]

6285152 with image url: https://media.nu.nl/m/t44xubxaya8d_wd854
#HAAR: 0 at: ()
#FR: 0 at: []
#COMBINED: 0
#UNIQUE: 0 at: []

6284739 with image url: https://media.nu.nl/m/9mrxcvtapcyb_wd854
#HAAR: 0 at:

### Continue to full dataset after inspection

In [12]:
# NOTE(review): explicit import on top of the star import in the first cell —
# the name is presumably already in scope; confirm and import it once at the top.
from methods_data_collection import detect_cut_save_faces

# Create the output directory
output_dir_news_nu = 'datasets/images/isolated_news_faces_nu'
os.makedirs(output_dir_news_nu, exist_ok=True)

# Full run over all NU.nl articles (5350 rows according to the progress bar
# in the saved output): frame, URL column, label column (article `id`),
# output directory. The saved output shows the helper reporting images that
# do not have 3 channels.
detect_cut_save_faces(nu_articles, 'img_link', 'id', output_dir_news_nu)

Scraping at...:   0%|          | 0/5350 [00:00<?, ?it/s]

Image at https://media.nu.nl/m/434xfyia0vjl_wd854 does not have 3 channels.
Image at https://media.nu.nl/m/vgsxgvgau0zu_wd854 does not have 3 channels.
Image at https://media.nu.nl/m/er7x411ajrw9_wd854 does not have 3 channels.
Image at https://media.nu.nl/m/vgsxgvgau0zu_wd854 does not have 3 channels.
Image at https://media.nu.nl/m/o9bxn7pahiak_wd854 does not have 3 channels.
Image at https://media.nu.nl/m/h4vxdwaasvdf_wd854 does not have 3 channels.
