In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
from tqdm.notebook import tqdm
import requests
import os

def get_specific_assets(url, xpath_image, xpath_details):
    options = Options()
    options.headless = True
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Initialize the Chrome driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)
    time.sleep(2)  # Wait for the page to load

    assets = {}

    # Fetch the image element using XPath
    try:
        image_element = driver.find_element(By.XPATH, xpath_image)
        original_image_url = image_element.get_attribute('data-zoom')  # Extract the 'data-zoom' attribute
        assets['image_urls'] = [original_image_url]

        # Check for additional image URLs
        base_url, number = original_image_url.rsplit('_', 1)
        number = number.split('-')[0]
        i = 2
        while True:
            new_url = f"{base_url}_{i}" + original_image_url[original_image_url.rfind('-'):]
            print(new_url)
            if check_url_validity(new_url):
                assets['image_urls'].append(new_url)
                i += 1
            else:
                break
    except Exception as e:
        print(f"Error finding image element: {e}")
        assets['image_urls'] = []
    # Fetch the details element using XPath
    try:
        details_element = driver.find_element(By.XPATH, xpath_details)
        dt_elements = details_element.find_elements(By.TAG_NAME, 'dt')
        dd_elements = details_element.find_elements(By.TAG_NAME, 'dd')

        for dt, dd in zip(dt_elements, dd_elements):
            key = dt.text.strip()
            value = dd.text.strip()
            assets[key] = value
    except Exception as e:
        print(f"Error finding details element: {e}")

    driver.quit()
    return assets

# Function to save DataFrame to CSV
def save_to_csv(df, filename):
    df.to_csv(filename, index=False)


def check_url_validity(url):
    """Check if a URL is valid and accessible."""
    try:
        response = requests.head(url)
        return response.status_code == 200
    except requests.RequestException:
        return False

def download_image(image_url, folder_path, max_retries=3, timeout=20):
    """Download an image from a URL and save it to a specified folder with retry mechanism."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    image_name = image_url.split('/')[-1]
    image_path = os.path.join(folder_path, image_name)

    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(image_url)
            if response.status_code == 200:
                with open(image_path, 'wb') as file:
                    file.write(response.content)
                return
        except requests.RequestException as e:
            print(f"Error downloading image {image_url}: {e}")
            time.sleep(timeout)
            retries += 1

    print(f"Failed to download image after {max_retries} retries: {image_url}")


In [2]:
# Iterate over the URLs in the DataFrame
# Initialize an empty DataFrame or read existing data
try:
    leaflet_data = pd.read_csv('leaflet_data.csv')
except FileNotFoundError:
    leaflet_data = pd.DataFrame(columns=['URL', 'image_urls', 'local_images', 'Election:', 'Party:', 'Constituency:', 'Mentions:', 'Issues Covered:'])

# Read the CSV file with URLs
df = pd.read_csv('valid_urls.csv')

# XPath of the specific elements
xpath_image = '/html/body/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[2]/div'
xpath_details = '/html/body/div[3]/div[2]/div/div[1]/div[2]/dl'


for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    url = row['URL']
    leaflet_number = row['Number']

    if url not in leaflet_data['URL'].values:
        specific_assets = get_specific_assets(url, xpath_image, xpath_details)
        specific_assets['URL'] = url

        # Download images and update image_urls with local paths
        image_folder = f"leaflet_images/{leaflet_number}"
        local_image_urls = []
        for image_url in specific_assets.get('image_urls', []):
            download_image(image_url, image_folder)
            local_image_name = image_url.split('/')[-1]
            local_image_urls.append(os.path.join(image_folder, local_image_name))
        specific_assets['local_images'] = local_image_urls

        # Append the new row to the DataFrame
        new_row_df = pd.DataFrame([specific_assets])
        leaflet_data = pd.concat([leaflet_data, new_row_df], ignore_index=True)
        
        # Save every 100 URLs
        if index % 100 == 0:
            save_to_csv(leaflet_data, 'leaflet_data.csv')

# Save the final DataFrame
save_to_csv(leaflet_data, 'leaflet_data.csv')

  0%|          | 0/9614 [00:00<?, ?it/s]

https://cdn.openelections.co.uk/uploads/2021/10/18627_2-726x1024.png
https://cdn.openelections.co.uk/uploads/2021/10/18627_3-726x1024.png
https://cdn.openelections.co.uk/uploads/2021/10/18628_2-726x1024.png
https://cdn.openelections.co.uk/uploads/2021/10/18629_2-726x1024.png
https://cdn.openelections.co.uk/uploads/2021/10/18630_2-716x1024.png
https://cdn.openelections.co.uk/uploads/2021/10/18630_3-716x1024.png
