### Concat all HIBP and Analyze

In [1]:
import glob
import os
import re
import logging
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

In [2]:
# Directory path containing JSON files
directory = 'pwned/'

logging.basicConfig(filename='error_log_read_json.log', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Get list of JSON file paths in the directory
json_files = glob.glob(directory + '/*.json')

# Define a function for processing a JSON file
def process_json_file(file):
    try:
        df = pd.read_json(file)
        file_name = os.path.basename(file).rstrip('.json')
        df['file_name'] = file_name
        return df
    except ValueError as e:
        logging.error(f"Error reading file: {file}. Error message: {e}")
        return None

# Create a pool of worker processes
pool = Pool()

# Create a progress bar with tqdm
progress_bar = tqdm(total=len(json_files), desc="Processing JSON files", unit="file")

# Process JSON files in parallel
results = []
for df in pool.imap_unordered(process_json_file, json_files):
    results.append(df)
    progress_bar.update(1)

# Close the pool of worker processes
pool.close()
pool.join()

# Close the progress bar
progress_bar.close()

Processing JSON files: 100%|█████████████████████████████████████████████████████████████████████████████████████| 303658/303658 [03:24<00:00, 1485.20file/s]


In [3]:
# Concatenate the list of DataFrames into a single DataFrame
combined_df = pd.concat(results, ignore_index=True)

In [4]:
combined_df.rename(columns={'file_name': 'email'}, inplace = True)

In [5]:
breaches = pd.read_json('breaches.json')

In [6]:
breached_email_df = pd.merge(combined_df, breaches, on='Name')
breached_email_df.shape

(2290840, 17)

### Let's create an email level dataset for which we have breaches

#### Non-fab
https://haveibeenpwned.com/FAQs#FabricatedBreach

In [7]:
breached_email_df['non_fab'] = ~breached_email_df['IsFabricated'].fillna(1)

#### Serious Dataclasses Breached 
Either data that will help you hack your other accounts, e.g., Mother's Maiden Name, or sensitive personal information

In [8]:
serious_dataclasses = ["Audio recordings",
"Auth tokens",
"Bank account numbers",
"Biometric data",
"Browsing histories",
"Chat logs",
"Credit card CVV",
"Credit cards",
"Credit status information",
"Drinking habits",
"Driver's licenses",
"Drug habits",
"Email messages",
"Encrypted keys",
"Government issued IDs",
"Health insurance information",
"Historical passwords", #account cracking
"HIV statuses",
#"Mothers maiden names", #account cracking
"Partial credit card data",
"Passport numbers",
"Password hints",
"Passwords",
"Personal health data",
"Photos",
"PINs",
"Places of birth", #account cracking
"Private messages",
"Security questions and answers",
"Sexual fetishes",
"Sexual orientations",
"SMS messages",
"Social security numbers",
"Taxation records"]

In [9]:
breached_email_df['serious'] = breached_email_df['DataClasses'].apply(lambda x: any(string in x for string in serious_dataclasses))

In [10]:
br_email_df = breached_email_df.assign(count=1).\
    groupby('email').\
    agg(total_breaches = ('count', 'sum'),
    serious_breaches   = ('serious', 'sum'),
    non_fab_breaches   = ('non_fab', 'sum'),
    non_null_uniques   = ('Name', 'nunique')
).reset_index()
br_email_df

Unnamed: 0,email,total_breaches,serious_breaches,non_fab_breaches,non_null_uniques
0,000JWA@GMAIL.COM,2,1,2,2
1,000uly@gmail.com,9,7,9,9
2,007@nauticalventures.com,5,2,5,5
3,007WTM@GMAIL.COM,4,2,4,4
4,00FLAB@GMAIL.COM,16,9,16,16
...,...,...,...,...,...
303652,zzgg123@aol.com,8,4,8,8
303653,zzotter@hotmail.com,26,18,26,26
303654,zzouiten@hotmail.com,10,7,10,10
303655,zzzack53@gmail.com,3,0,3,3


### Join to Currently Processed Valid Email List

In [11]:
# Get list of currently processed emails from the logs

def extract_email(log_message):
    if "INFO - Saved" in log_message:
        start_index = log_message.find("Saved") + 6  # Length of "Saved" + space
        end_index = log_message.find("\n")
    elif "Error processing" in log_message and ": 404" in log_message:
        start_index = log_message.find("Error processing") + 18  # Length of "Error processing" + space
        end_index = log_message.find(": 404") # Length of ": 404" + space
    else:
        return None

    email = log_message[start_index:end_index].strip()
    return email

# Read the log file into a list of lines
with open('pwned.log', 'r') as file:
    log_lines = file.readlines()

# Extract emails from log messages and create a list of email addresses
emails = [extract_email(log_line) for log_line in log_lines if "Saved" in log_line or ("Error processing" in log_line and ": 404" in log_line)]

In [12]:
# Subset to non empty uniques
print(len(emails))
print(len(emails[emails == '']))

unique_not_empty_emails = [e for e in set(emails) if e.strip()]
parsed_emails = pd.DataFrame({"email": unique_not_empty_emails})
parsed_emails.shape

381144
23


(363287, 1)

In [13]:
# Merge parsed to valid emails (assuming our population is valid emails)
voter_df = pd.read_csv("fl_emails_valid_or_not.csv")

parsed_voter = parsed_emails.merge(voter_df, how = 'left')
parsed_valid = parsed_voter[parsed_voter['is_valid_email'] == True]
parsed_valid.shape

(308558, 5)

In [14]:
# Merge breaches to parsed
fin_df = pd.merge(parsed_valid, br_email_df, how = 'left', on = 'email')

In [15]:
fin_df.fillna(0).describe()

Unnamed: 0,total_breaches,serious_breaches,non_fab_breaches,non_null_uniques
count,308558.0,308558.0,308558.0,308558.0
mean,7.554077,4.51987,7.554055,7.554077
std,5.8778,4.123446,5.877667,5.8778
min,0.0,0.0,0.0,0.0
25%,3.0,2.0,3.0,3.0
50%,6.0,4.0,6.0,6.0
75%,11.0,7.0,11.0,11.0
max,344.0,298.0,344.0,344.0


In [16]:
# Focus on DNS validated emails
parsed_valid = parsed_voter[parsed_voter['is_valid_email_dns'] == True]
parsed_valid.shape

(307454, 5)

In [17]:
fin_df = pd.merge(parsed_valid, br_email_df, how = 'left', on = 'email')

In [18]:
fin_df.fillna(0).describe()

Unnamed: 0,total_breaches,serious_breaches,non_fab_breaches,non_null_uniques
count,307454.0,307454.0,307454.0,307454.0
mean,7.558064,4.523818,7.558041,7.558064
std,5.87849,4.123963,5.878357,5.87849
min,0.0,0.0,0.0,0.0
25%,3.0,2.0,3.0,3.0
50%,6.0,4.0,6.0,6.0
75%,11.0,7.0,11.0,11.0
max,344.0,298.0,344.0,344.0
