### Concatenate All HIBP Results and Analyze

In [3]:
import glob
import os
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

In [5]:
# Folder that holds the JSON files to load
directory = 'pwned/'

# Collect the path of every *.json file directly inside that folder
json_files = glob.glob(f"{directory}/*.json")

# Define a function for processing a JSON file
def process_json_file(file):
    """Load one JSON file into a DataFrame and tag each row with its source.

    Parameters
    ----------
    file : str
        Path to a JSON file readable by ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        The parsed file with an extra ``file_name`` column holding the file's
        base name without its ``.json`` extension.
    """
    df = pd.read_json(file)

    file_name = os.path.basename(file)
    # Remove the exact ".json" suffix. str.rstrip('.json') would instead strip
    # any trailing run of the characters '.', 'j', 's', 'o', 'n', so a name
    # such as "user@example.io.json" would be truncated to "user@example.i".
    suffix = '.json'
    if file_name.endswith(suffix):
        file_name = file_name[:-len(suffix)]

    df['file_name'] = file_name
    return df

# Process the JSON files in parallel, collecting one DataFrame per file.
# The worker pool and the progress bar are both context-managed so they are
# released even if a worker raises while parsing a file.
results = []
with Pool() as pool, tqdm(total=len(json_files), desc="Processing JSON files", unit="file") as progress_bar:
    # imap_unordered yields each result as soon as a worker finishes it, so
    # the order of `results` does not follow the order of `json_files`.
    for df in pool.imap_unordered(process_json_file, json_files):
        results.append(df)
        progress_bar.update(1)

    # Every result has been received: stop accepting work and wait for the
    # workers to exit before the `with` block tears the pool down.
    pool.close()
    pool.join()

Processing JSON files: 100%|██████████████████████████████████████████████████████████████████████████████████████| 114169/114169 [03:07<00:00, 607.68file/s]


In [9]:
# Stack the per-file DataFrames into one frame, renumbering rows from 0
combined_df = pd.concat(objs=results, ignore_index=True)

In [10]:
# The 'file_name' column is used as the 'email' key from here on.
# Rebinding (rather than inplace=True) keeps the step chainable.
combined_df = combined_df.rename(columns={'file_name': 'email'})

In [11]:
# Load the breach table that gets joined onto the combined frame by 'Name'
breaches = pd.read_json(path_or_buf='breaches.json')

In [12]:
# Inner join on the shared 'Name' column: rows whose 'Name' has no match in
# `breaches` are dropped.
fin_df = combined_df.merge(breaches, how='inner', on='Name')
fin_df.shape

(857753, 17)

In [13]:
# Number of rows in fin_df for each email
fin_df.groupby(by='email').size()

email
000JWA@GMAIL.COM            2
000uly@gmail.com            9
007WTM@GMAIL.COM            4
00NMGP@GMAIL.COM            4
00briangoss00@gmail.com     1
                           ..
zykovia@gmail.com          15
zymoniedwards@yahoo.com     8
zyuranger@yahoo.com        23
zz840_90@msn.com           15
zzzack53@gmail.com          3
Length: 114169, dtype: int64

In [14]:
# Distribution of the number of distinct 'Name' values per email
fin_df.groupby('email')['Name'].nunique().describe()

count    114169.000000
mean          7.513011
std           5.773500
min           1.000000
25%           3.000000
50%           6.000000
75%          11.000000
max         344.000000
Name: Name, dtype: float64

In [15]:
# Fraction of emails that have at least one distinct 'Name'
fin_df.groupby('email')['Name'].nunique().gt(0).mean()

1.0

In [16]:
# Count of rows whose 'Name' is missing
fin_df['Name'].isnull().sum()

0

#### How many non-fabricated breaches?

https://haveibeenpwned.com/FAQs#FabricatedBreach

In [17]:
# Flag rows that are NOT marked as fabricated; missing flags count as not
# fabricated. The column is coerced to a real boolean mask before negating:
# `~` on an object-dtype column (which is what a column with NaN filled by 0
# becomes) is a per-element integer invert (~True == -2, ~0 == -1), not a
# logical NOT, and would corrupt the sums below.
fin_df['non_fab'] = ~fin_df['IsFabricated'].astype('boolean').fillna(False).astype(bool)

# Distribution of the number of non-fabricated rows per email
fin_df.groupby('email')['non_fab'].sum().describe()

count    114169.000000
mean          7.512976
std           5.773375
min           1.000000
25%           3.000000
50%           6.000000
75%          11.000000
max         344.000000
Name: non_fab, dtype: float64

#### Serious Data Classes Breached
Data classes counted as serious: either data that could help an attacker compromise your other accounts (e.g., mother's maiden name) or sensitive personal information.

In [18]:
# Data-class labels treated as "serious" when flagging rows of fin_df.
# Entries tagged "account cracking" were annotated that way by the author.
serious_dataclasses = [
    "Audio recordings",
    "Auth tokens",
    "Bank account numbers",
    "Biometric data",
    "Browsing histories",
    "Chat logs",
    "Credit card CVV",
    "Credit cards",
    "Credit status information",
    "Drinking habits",
    "Driver's licenses",
    "Drug habits",
    "Email messages",
    "Encrypted keys",
    "Government issued IDs",
    "Health insurance information",
    "Historical passwords",  # account cracking
    "HIV statuses",
    # "Mothers maiden names",  # account cracking -- currently excluded
    "Partial credit card data",
    "Passport numbers",
    "Password hints",
    "Passwords",
    "Personal health data",
    "Photos",
    "PINs",
    "Places of birth",  # account cracking
    "Private messages",
    "Security questions and answers",
    "Sexual fetishes",
    "Sexual orientations",
    "SMS messages",
    "Social security numbers",
    "Taxation records",
]

In [19]:
# Mark a row as serious when at least one label from serious_dataclasses is
# found in its 'DataClasses' value.
# NOTE(review): `in` is a membership test if the cell is a list and a
# substring test if it is a string -- presumably a list of labels; confirm.
fin_df['serious'] = fin_df['DataClasses'].apply(
    lambda classes: any(label in classes for label in serious_dataclasses)
)

In [20]:
# Distribution of the number of serious rows per email
fin_df.groupby(by='email')['serious'].sum().describe()

count    114169.000000
mean          4.495826
std           4.028408
min           0.000000
25%           2.000000
50%           3.000000
75%           6.000000
max         298.000000
Name: serious, dtype: float64