In [2]:
import requests
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

def check_url(base_url, code):
    url = f"{base_url}{code}/"
    try:
        response = requests.get(url, allow_redirects=True)
        if response.status_code == 404:
            return np.nan  # Return NaN for 404 errors
        elif response.url != url:
            new_code = response.url.split('/')[-2]  # Extract the code from the redirected URL
            return int(new_code)  # Return the new code if redirected
        else:
            return code  # Return the original code if found and not redirected
    except requests.RequestException as e:
        print(f"Error checking URL {url}: {e}")
        return None  # Return None in case of a request exception

In [3]:
# Define the base URL and the range of codes
base_url = "https://www.openelections.co.uk/leaflets/"
start_range = 2770
end_range = 40000

# Load existing data if available
try:
    existing_df = pd.read_csv('valid_urls.csv')
    last_checked = existing_df['Number'].max()
except (FileNotFoundError, pd.errors.EmptyDataError):
    existing_df = pd.DataFrame(columns=['Number', 'URL'])
    last_checked = start_range - 1
    
# Iterate over the range and collect valid URLs
valid_urls = existing_df.values.tolist()
valid_count = len(valid_urls)
backup_interval = 100

# Create a tqdm iterator and store it in a variable
pbar = tqdm(range(last_checked + 1, end_range + 1), desc="Checking URLs", total=end_range - last_checked)

for code in pbar:
    result = check_url(base_url, code)
    if result is not np.nan and result is not None:
        valid_url = f"{base_url}{result}/"
        valid_urls.append([result, valid_url])
        valid_count += 1

    # Backup data every 100 URLs
    if code % backup_interval == 0:
        df_backup = pd.DataFrame(valid_urls, columns=['Number', 'URL'])
        df_backup.to_csv('valid_urls.csv', index=False)

    # Update tqdm bar with the number of valid URLs found
    pbar.set_description(f"Checking URLs (Valid: {valid_count})")

# Save the final results
df_final = pd.DataFrame(valid_urls, columns=['Number', 'URL'])
df_final.to_csv('valid_urls.csv', index=False)

# Display the first few rows of the DataFrame
df_final.head()

Checking URLs:   0%|          | 0/900 [00:00<?, ?it/s]

Unnamed: 0,Number,URL
0,2770,https://www.openelections.co.uk/leaflets/2770/
1,27711,https://www.openelections.co.uk/leaflets/27711/
2,27720,https://www.openelections.co.uk/leaflets/27720/
3,27731,https://www.openelections.co.uk/leaflets/27731/
4,2774,https://www.openelections.co.uk/leaflets/2774/


In [4]:
len(df_final)

9614