# Imports

In [10]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import concurrent.futures
import numpy as np
import threading

# Get Links

In [12]:
# Load the CSV file
file_path = './data/combined_tabular_data.csv'
checkpoint_path = './data/checkpoint_data.csv'

# Guards the read-modify-write of the checkpoint CSV performed by the workers.
write_lock = threading.Lock()

# Resume from the checkpoint when one exists; otherwise start from the raw data.
resuming = os.path.exists(checkpoint_path)

if resuming:
    file = pd.read_csv(checkpoint_path, low_memory=False)
    # A checkpoint saved before any URL was recorded may lack the column.
    if 'image_url' not in file.columns:
        file['image_url'] = None
else:
    file = pd.read_csv(file_path, low_memory=False)
    # Add the result column BEFORE writing the checkpoint so that a resumed
    # run always finds it.
    file['image_url'] = None

# An all-empty column is parsed from CSV as float64; keep it as object so
# URL strings can be stored in it without a dtype-upcast warning.
file['image_url'] = file['image_url'].astype(object)

if not resuming:
    # index=False keeps the row index out of the file; otherwise it would
    # reappear as an 'Unnamed: 0' column on the next load.
    file.to_csv(checkpoint_path, index=False)

def scrape_page(data_chunk):
    """Fill in ``image_url`` for every row of *data_chunk* that lacks one.

    For each pending row the page at ``row['link']`` is opened in a headless
    Chrome session and the ``src`` of the first ``img.leaflet-litter-img``
    element is stored in the chunk. Each URL found is also persisted to the
    CSV at ``checkpoint_path`` (matched on the ``id`` column) while holding
    ``write_lock``.

    Args:
        data_chunk: DataFrame with ``link``, ``id`` and ``image_url`` columns.
            It is updated in place.

    Returns:
        The same ``data_chunk`` object, with the URLs that were found.

    Failures on a single row (page load, missing image, checkpoint write) are
    printed and the row is skipped; they do not abort the chunk.
    """
    pending = data_chunk['image_url'].isna()
    if not pending.any():
        # Nothing left to scrape: avoid the cost of launching a browser.
        return data_chunk

    options = Options()
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        for index, row in data_chunk[pending].iterrows():
            page_url = row['link']

            try:
                # Inside the try so that one unreachable or malformed URL
                # does not take down the whole worker.
                driver.get(page_url)
                image_tag = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'img.leaflet-litter-img'))
                )
                image_url = image_tag.get_attribute('src')
                data_chunk.at[index, 'image_url'] = image_url
                print(f'Found image URL: {image_url}')

                with write_lock:
                    temp_file = pd.read_csv(checkpoint_path, low_memory=False)
                    if 'image_url' not in temp_file.columns:
                        temp_file['image_url'] = None
                    # An all-empty column is parsed as float64; make it
                    # object so a string can be assigned cleanly.
                    temp_file['image_url'] = temp_file['image_url'].astype(object)
                    temp_file.loc[temp_file['id'] == row['id'], 'image_url'] = image_url
                    # Write to a sibling file and swap it in, so an
                    # interrupted write cannot truncate the checkpoint.
                    tmp_path = f'{checkpoint_path}.tmp'
                    temp_file.to_csv(tmp_path, index=False)
                    os.replace(tmp_path, checkpoint_path)

            except Exception as e:
                # Deliberately broad: scraping is best-effort per row.
                print(f'No image found on page {page_url} or failed to access the page due to: {e}')
    finally:
        # Always release the browser process, even on an unexpected error.
        driver.quit()

    return data_chunk

# Split DataFrame into chunks
num_chunks = 3  # Adjust this number based on your system's capabilities

# Split the row positions rather than the DataFrame itself: passing a
# DataFrame straight to np.array_split triggers a deprecation warning on
# recent pandas versions. The resulting chunk sizes are the same.
position_groups = np.array_split(np.arange(len(file)), num_chunks)

# Copy each chunk so the workers update independent frames rather than
# slices of `file`.
chunks = [file.iloc[positions].copy() for positions in position_groups]

# Process chunks in parallel, one worker thread (and one browser) per chunk
with concurrent.futures.ThreadPoolExecutor(max_workers=num_chunks) as executor:
    results = list(executor.map(scrape_page, chunks))

# Combine results into a single DataFrame
file = pd.concat(results)

# Save the final DataFrame to a file
file.to_excel('./final_output_with_images.xlsx', index=False)


Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2020/12/11/42320550-0C8E-4C27-9EA7-60119399C1EA.jpg
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/06/24/IMG_2014.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/03/20/IMG_5925.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/01/09/IMG_0679.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/06/24/IMG_2013.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/03/20/IMG_5923.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/04/17/IMG_0780.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/06/24/IMG_2012.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/03/20/IMG_5920.HEIC
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/04/24/785C55E0-228A-4E79-85CC-D616DEBE8771.jpg
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/06/24/IMG_2011.HEIC
Found image URL: https