### Get HIBP data for everypol (Including metadata on breaches) + Merge

In [None]:
import json
import logging
import os
import re
import time
from urllib.parse import quote

import pandas as pd
import requests

In [None]:
# Route log records of level INFO and above to "hibp_errors.log" (relative to
# the working directory), each prefixed with a timestamp and its level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="hibp_errors.log",
)

In [None]:
# Load the table of accounts to look up; the download loop below reads the
# 'email' column of every row.
df = pd.read_csv("../data/everypol_unique_emails.csv")
df.head()  # notebook preview of the first rows

In [None]:
# Read the HIBP API key from a local file so the secret never appears in the
# notebook itself; strip() drops the trailing newline most editors add.
with open("hibp_key", "r", encoding="utf-8") as key_file:
    hibp_api_key = key_file.read().strip()

# Template of the per-account endpoint; the download loop builds the real URL
# for each email.
url = "https://haveibeenpwned.com/api/v3/breachedaccount/<account>"
payload = {}
headers = {
    # Authenticates the request against the HIBP v3 API.
    'hibp-api-key': hibp_api_key,
    # NOTE(review): 'format', 'timeout' and 'HIBP' are not among the headers
    # documented for the HIBP v3 API and are presumably ignored by the server
    # -- confirm before removing. In particular, a 'timeout' *header* does not
    # make the HTTP client time out.
    'format': 'application/json',
    'timeout': '2.5',
    'HIBP': hibp_api_key,
    # HIBP asks every client to identify itself with a user agent; use the
    # same value as the /breaches request further down.
    "user-agent": "PythonScript",
}

In [None]:
# Rate limit: minimum number of seconds the download loop leaves between two
# consecutive HIBP requests.
rate_limit_interval = 6
# Timestamp the loop measures elapsed time against. Starting the clock here
# means the very first request is also delayed unless a full interval has
# already passed since this cell ran.
last_request_time = time.time()

In [None]:
# Folder where the download loop stores one "<email>.json" file per account
# whose lookup returned a breach list.
output_folder = "../data/everypol_hibp/"
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists

In [None]:
# Download the HIBP breach list for every email in `df`, one JSON file per
# email. Emails whose output file already exists are skipped, so this cell can
# simply be re-run to resume an interrupted download.

# Seconds to wait for HIBP before giving up on a request. Without a timeout a
# single stalled connection would hang the whole run.
request_timeout = 30

for index, row in df.iterrows():
    email = row['email']

    # A missing value arrives from pandas as NaN (a float); querying it would
    # look up the literal text "nan".
    if not isinstance(email, str) or not email.strip():
        logging.warning(f"Skipping row {index}: missing or empty email")
        continue

    # HIBP requires the account to be URL encoded; safe='' also encodes '/'
    # and '+', which would otherwise change the meaning of the path.
    url = f"https://haveibeenpwned.com/api/v3/breachedaccount/{quote(email, safe='')}"
    file_path = os.path.join(output_folder, f"{email}.json")

    if os.path.exists(file_path):
        logging.info(f"Skipping {email} (already processed)")
        continue

    # Retry loop: only a 429 (rate limited) response goes round again; every
    # other outcome ends the loop for this email.
    while True:
        # Wait out whatever is left of the minimum interval between requests.
        elapsed_time = time.time() - last_request_time
        if elapsed_time < rate_limit_interval:
            time.sleep(rate_limit_interval - elapsed_time)

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            last_request_time = time.time()

            # HIBP answers 404 when the account appears in no breach; nothing
            # is written in that case.
            if response.status_code == 404:
                logging.info(f"No breach found for {email} (404).")
                break

            if response.status_code == 429:
                retry_after = int(response.headers.get("Retry-After", 1))
                logging.warning(f"Rate limit hit. Retrying after {retry_after} seconds.")
                # Small safety margin on top of what the server asked for.
                time.sleep(retry_after + 2)
                continue

            # Any other 4xx/5xx becomes an HTTPError handled below.
            response.raise_for_status()

            with open(file_path, "wb") as f:
                f.write(response.content)
            logging.info(f"Processed {email} (index {index})")
            break

        except requests.exceptions.RequestException as e:
            # Take the status from the exception itself: a `response` variable
            # left over from a previous iteration would report the wrong
            # request. `is not None` is required because a Response object is
            # falsy for error statuses.
            status_code = e.response.status_code if e.response is not None else 'N/A'
            logging.error(
                f"Error for {email} (index {index}): {e}, Status Code: {status_code}"
            )
            break

### Get all the breaches

In [None]:
# Fetch the metadata of every breach known to HIBP in a single request.
url = "https://haveibeenpwned.com/api/v3/breaches"
payload = {}
headers = {
    'hibp-api-key': hibp_api_key,
    # NOTE(review): 'format', 'timeout' and 'HIBP' are not among the headers
    # documented for the HIBP v3 API and are presumably ignored by the server
    # -- confirm before removing.
    'format': 'application/json',
    'timeout': '2.5',
    'HIBP': hibp_api_key,
    "user-agent": "PythonScript"
}

# The timeout keeps a stalled connection from hanging the notebook.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as
# the breach list.
response.raise_for_status()
breaches_data = response.json()

In [None]:
# Tabulate the JSON returned by the /breaches request.
# Presumably a list of breach objects, giving one row per breach -- verify
# against the API response.
breaches_df = pd.DataFrame(breaches_data)

breaches_df.head()  # notebook preview of the first rows

In [None]:
# Save the breach metadata as CSV; index=False leaves out the DataFrame's row index.
# NOTE(review): the file name spells "hipb" rather than "hibp"; left as-is
# because other code may already read this exact path.
breaches_df.to_csv("../data/hipb_01_2025_breaches_data.csv", index=False)