### Get Have I Been Pwned (HIBP) breach data

In [1]:
import logging
import os
import time
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Load the list of emails to look up. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype inference.
df = pd.read_csv(
    "data/fl_valid_emails.csv",
    low_memory=False,
)

### Remove already-processed emails

In [3]:
def extract_email(log_message):
    """Pull the email address out of one line of the HIBP run log.

    Two kinds of line are recognised:

    * ``... INFO - Saved <email>``: the lookup succeeded and was saved.
    * ``... Error processing <email>: 404 ...``: the lookup returned 404.

    Args:
        log_message: A single log line, with or without a trailing newline.

    Returns:
        The email with surrounding whitespace removed (possibly an empty
        string), or ``None`` when the line matches neither pattern.
    """
    saved_marker = "INFO - Saved"
    error_marker = "Error processing"
    not_found_marker = ": 404"

    if saved_marker in log_message:
        # Take the text after the marker up to the end of the line. Splitting
        # on "\n" (rather than slicing up to find("\n")) keeps the full email
        # when the line has no trailing newline, e.g. the last line of a file.
        after_marker = log_message.split(saved_marker, 1)[1]
        email = after_marker.split("\n", 1)[0]
    elif error_marker in log_message and not_found_marker in log_message:
        # The email sits between "Error processing" and the ": 404" status.
        after_marker = log_message.split(error_marker, 1)[1]
        email = after_marker.split(not_found_marker, 1)[0]
    else:
        return None

    return email.strip()

In [4]:
# Collect the emails that a previous run already handled so they can be skipped.
# On a first run the log file does not exist yet, so treat that as "nothing
# processed so far" instead of failing with FileNotFoundError.
log_path = 'pwned_hibp_logs.txt'
if os.path.exists(log_path):
    with open(log_path, 'r') as file:
        log_lines = file.readlines()
else:
    log_lines = []

# extract_email returns None for lines it does not recognise and may return an
# empty string; drop both before lower-casing so .lower() is never called on None.
emails = [email for email in map(extract_email, log_lines) if email]
emails_lower = [email.lower() for email in emails]
filtered_list = [item for item in emails_lower if item != '']

In [5]:
# Drop rows whose email was already handled in a previous run. The comparison
# strips as well as lower-cases: process_row strips the email before logging
# it, so the logged form never carries surrounding whitespace.
already_processed = set(filtered_list)
normalized_emails = df['email'].str.strip().str.lower()
df = df[~normalized_emails.isin(already_processed)]
df.shape

(0, 8)

In [6]:
# Read the HIBP API key from a local file so it is not hardcoded in the notebook.
# Strip surrounding whitespace: a trailing newline in the key file would
# otherwise become part of the request header value built from this key.
with open('hipb_key.txt', 'r') as file:
    hibp_api_key = file.read().strip()

In [7]:
# Request configuration for the HIBP v3 "breachedaccount" endpoint.
# `url` is the endpoint template; `<account>` stands for the email being looked up.
url = "https://haveibeenpwned.com/api/v3/breachedaccount/<account>"
payload = {}  # the lookups are GET requests and carry no body

# Only genuine HTTP headers belong here: the API key and a user agent that
# identifies this client. Request options such as a timeout are arguments to
# requests, not headers, and the key must not be repeated under other names.
headers = {
    'hibp-api-key': str(hibp_api_key).strip(),
    'user-agent': 'hibp-breach-lookup-notebook',
}

In [8]:
# Route INFO-and-above records to the run log. This is the same file that is
# parsed at the top of the notebook to find already-processed emails, so the
# "<time> - <level> - <message>" layout must stay in sync with that parser.
logging.basicConfig(
    level=logging.INFO,
    filename='pwned_hibp_logs.txt',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [9]:
def process_row(row, timeout=10):
    """Look up one email on HIBP and cache the raw response on disk.

    The response body is written to ``pwned/<email>.json``. If that file
    already exists, no request is made. Success is logged as
    ``Saved <email>``; any request failure (HTTP error status, connection
    problem, timeout) is logged as ``Error processing <email>: <error>``.

    Args:
        row: Mapping-like object with an ``'email'`` entry (a DataFrame row
            in the driving loop).
        timeout: Seconds to wait for the HIBP server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        None. Results are communicated through the cache file and the log.
    """
    raw_email = row['email']
    if not isinstance(raw_email, str) or not raw_email.strip():
        # Missing values are not strings (pandas uses NaN) and blank cells
        # have nothing to look up; skip them rather than crash or query "".
        logging.warning("Skipping row without a usable email")
        return
    email = raw_email.strip()

    # Skip the API call entirely when a cached response already exists.
    file_path = f"pwned/{email}.json"
    if os.path.exists(file_path):
        return

    # Percent-encode the account so characters such as '/', '?', '#' or '%'
    # in the address cannot alter the request path.
    encoded_email = quote(email, safe='')
    url = f"https://haveibeenpwned.com/api/v3/breachedaccount/{encoded_email}"

    time.sleep(1.5)  # pause before each request; presumably rate-limit pacing
    try:
        response = requests.request(
            "GET", url, headers=headers, data=payload, timeout=timeout
        )
        response.raise_for_status()  # turn 4xx/5xx statuses into exceptions
        # Make sure the cache directory exists so a fresh checkout does not
        # fail on the write after the API call has already been spent.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "wb") as f:
            f.write(response.content)
        logging.info(f"Saved {email}")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error processing {email}: {e}")

In [10]:
# Process every remaining row sequentially, with a progress bar sized to the
# number of rows still to be looked up.
for index, row in tqdm(
    df.iterrows(),
    desc='Processing emails',
    total=len(df),
):
    process_row(row)

Processing emails: 0it [00:00, ?it/s]
