### Get HIBP Data 

1. For all everypol emails
2. For all breaches
3. For all Bihar politicians

In [1]:
import os
import json
import re
import requests
import time
import logging
import pandas as pd

In [2]:
# Route log records of level INFO and above to hibp_errors.log; each record is
# written as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="hibp_errors.log",
)

In [3]:
# Read the HIBP API key from a local file so it is not hard-coded in the notebook.
with open("hibp_key", "r", encoding="utf-8") as key_file:
    hibp_api_key = key_file.read().strip()

# Template of the per-account endpoint, kept for reference; the request URL is
# built per e-mail inside process_breached_accounts.
url = "https://haveibeenpwned.com/api/v3/breachedaccount/<account>"
payload = {}

# Only real HTTP headers belong here. The previous 'timeout' and 'format'
# entries were sent as header fields and had no effect (a request timeout must
# be passed to requests.get(timeout=...)), and the key was transmitted a second
# time under a non-standard 'HIBP' header.
headers = {
    'hibp-api-key': hibp_api_key,
    'user-agent': 'PythonScript',
}

In [4]:
# Rate limit: minimum number of seconds between two API requests, plus a
# module-level timestamp taken when this cell runs. process_breached_accounts
# keeps its own local `last_request_time`, so it does not read this one.
rate_limit_interval, last_request_time = 6, time.time()

In [5]:
def _parse_retry_after(value, default=1):
    """Return a Retry-After header value as whole seconds, or *default* if missing/unparseable."""
    try:
        return max(int(value), 0)
    except (TypeError, ValueError):
        return default


def process_breached_accounts(df, output_folder, headers, payload, rate_limit_interval,
                              timeout=30, max_retries=5):
    """
    Process emails from a DataFrame by querying the HIBP API for breached accounts.

    For every row, the 'email' value is looked up at the HIBP
    ``breachedaccount`` endpoint and a successful response body is written to
    ``<output_folder>/<email>.json``. Emails whose file already exists are
    skipped, so an interrupted run can be resumed. Failures are logged and the
    loop moves on to the next email; nothing is raised for a single bad row.

    Args:
        df (pd.DataFrame): DataFrame containing an 'email' column.
        output_folder (str): Path to the (existing) folder for saving JSON results.
        headers (dict): Headers for the API request.
        payload (dict): Additional payload for the API request.
        rate_limit_interval (float): Minimum time interval between API requests (in seconds).
        timeout (float): Seconds to wait for the server before a request is
            abandoned and logged as an error.
        max_retries (int): Maximum number of retries after HTTP 429 responses
            for a single email before giving up on it.

    Returns:
        None
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from urllib.parse import quote

    last_request_time = time.time() - rate_limit_interval  # Allow the first request immediately

    for index, row in df.iterrows():
        email = row['email']

        # Missing values (None/NaN) or blank strings would otherwise be queried
        # as the literal account "nan"/"None" and saved under that file name.
        if not isinstance(email, str) or not email.strip():
            logging.warning(f"Skipping row {index}: missing or blank email")
            continue
        email = email.strip()

        # The account is part of the URL path, so it must be percent-encoded.
        url = f"https://haveibeenpwned.com/api/v3/breachedaccount/{quote(email, safe='')}"
        file_path = os.path.join(output_folder, f"{email}.json")

        if os.path.exists(file_path):
            logging.info(f"Skipping {email} (already processed)")
            continue

        rate_limited_count = 0
        while True:
            wait = rate_limit_interval - (time.time() - last_request_time)
            if wait > 0:
                time.sleep(wait)

            # Reset per attempt so the error log never reports the status code
            # of an earlier request when requests.get itself fails.
            response = None
            try:
                response = requests.get(url, headers=headers, data=payload, timeout=timeout)
                last_request_time = time.time()

                if response.status_code == 404:
                    # NOTE: nothing is written for a 404, so a re-run queries
                    # this email again.
                    logging.info(f"No breach found for {email} (404).")
                    break

                if response.status_code == 429:
                    rate_limited_count += 1
                    if rate_limited_count > max_retries:
                        logging.error(
                            f"Giving up on {email} (index {index}): "
                            f"still rate limited after {max_retries} retries."
                        )
                        break
                    retry_after = _parse_retry_after(response.headers.get("Retry-After"))
                    logging.warning(f"Rate limit hit. Retrying after {retry_after} seconds.")
                    time.sleep(retry_after + 2)
                    continue

                response.raise_for_status()

            except requests.exceptions.RequestException as e:
                logging.error(
                    f"Error for {email} (index {index}): {e}, "
                    f"Status Code: {response.status_code if response is not None else 'N/A'}"
                )
                break

            try:
                with open(file_path, "wb") as f:
                    f.write(response.content)
            except OSError as e:
                # e.g. an email containing a path separator; log and move on
                # instead of aborting the whole run.
                logging.error(f"Could not write {file_path} for {email} (index {index}): {e}")
                break

            logging.info(f"Processed {email} (index {index})")
            break

In [None]:
# Load the everypol e-mail CSV; its 'email' column is what
# process_breached_accounts iterates over. The last line previews the frame.
df = pd.read_csv(filepath_or_buffer="../data/everypol_unique_emails.csv")
df.head()

In [None]:
# Query HIBP for every everypol e-mail; one JSON file per breached account is
# written into the output folder, which is created first if it is missing.
output_folder = "../data/everypol_hibp/"
os.makedirs(output_folder, exist_ok=True)
process_breached_accounts(
    df=df,
    output_folder=output_folder,
    headers=headers,
    payload=payload,
    rate_limit_interval=rate_limit_interval,
)

### Get all the breaches

In [None]:
# Download the list of breaches from the HIBP "breaches" endpoint.
breach_url = "https://haveibeenpwned.com/api/v3/breaches"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an error response (e.g. 403/429/5xx) from being
# parsed and saved as if it were breach data.
response = requests.get(breach_url, headers=headers, timeout=30)
response.raise_for_status()
breaches_data = response.json()

In [None]:
# Turn the decoded JSON response into a DataFrame and preview the first rows.
breaches_df = pd.DataFrame(data=breaches_data)
breaches_df.head()

In [None]:
# Persist the breach table without the DataFrame index column.
breaches_df.to_csv(path_or_buf="../data/hipb_01_2025_breaches_data.csv", index=False)

### Get data for Bihar

From: https://vidhansabha.bih.nic.in/KnowyourMLA%20in%20Hindi.html

In [6]:
# Parse the tab-separated Bihar text file: each line becomes one list of fields.
with open('../data/bihar/bihar.txt', 'r', encoding='utf-8') as file:
    lines = [raw_line.split('\t') for raw_line in file.read().splitlines()]

# Name the eight fields, then lower-case the labels so the address column is
# called 'email', the name process_breached_accounts looks up.
bihar_columns = ['Sr. No.', 'Photo', 'Constituency', 'Name', 'Gender', 'Party', 'Contact', 'Email']
bihar_df = pd.DataFrame(lines, columns=bihar_columns)
bihar_df.columns = bihar_df.columns.str.lower()

# Preview only the e-mail column.
bihar_df[['email']]

Unnamed: 0,email
0,mla-vnagar-bih@nic.in
1,mla-ramnagar-bih@nic.in
2,mla-nktganj-bih@nic.in
3,mla-bagaha-bih@nic.in
4,mla-lauriya-bih@nic.in
...,...
238,mla-wrsaliganj-bih@nic.in
239,mla-sikandara-bih@nic.in
240,mla-jamui-bih@nic.in
241,mla-jhajha-bih@nic.in


In [None]:
# Query HIBP for every Bihar e-mail; results go to a separate folder, which is
# created first if it is missing.
output_folder = "../data/bihar_hibp/"
os.makedirs(output_folder, exist_ok=True)
process_breached_accounts(
    df=bihar_df[['email']],
    output_folder=output_folder,
    headers=headers,
    payload=payload,
    rate_limit_interval=rate_limit_interval,
)