### Concatenate all HIBP results and analyze

In [1]:
import glob
import os
import pandas as pd

In [2]:
# Directory containing the per-account JSON files; each file's stem is used
# as the account identifier (it is relabelled 'email' further down).
directory = 'pwned/'

# os.path.join avoids the doubled separator that `directory + '/*.json'`
# produced. glob returns paths in arbitrary order, so sort them to make the
# row order of combined_df reproducible across runs.
json_files = sorted(glob.glob(os.path.join(directory, '*.json')))

if not json_files:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No JSON files found in {directory!r}")

# One DataFrame per file, each tagged with the file it was read from
dataframes = []

for file in json_files:
    df = pd.read_json(file)
    # Remove only the extension. str.rstrip('.json') treats its argument as a
    # set of characters and also eats trailing '.', 'j', 's', 'o', 'n' from the
    # stem itself (e.g. 'a@b.io.json' -> 'a@b.i').
    file_name = os.path.splitext(os.path.basename(file))[0]
    df['file_name'] = file_name

    dataframes.append(df)

# Stack everything into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

In [5]:
# The file stem identifies the queried account, so relabel the column.
combined_df = combined_df.rename(columns={'file_name': 'email'})

In [6]:
# Per-breach records, keyed by 'Name' (presumably the HIBP breach catalogue)
breaches_path = 'breaches.json'
breaches = pd.read_json(breaches_path)

In [7]:
# Attach the breach record to every row of combined_df. The default join is
# inner, so rows whose 'Name' has no match in `breaches` are dropped.
fin_df = combined_df.merge(breaches, on='Name')
fin_df.shape

(187794, 17)

In [9]:
# Number of merged rows per email address.
# NOTE: the rendered output lists raw email addresses; clear it before sharing.
rows_per_email = fin_df.groupby('email').size()
rows_per_email

email
000JWA@GMAIL.COM             2
00NMGP@GMAIL.COM             4
0712045SETH@GMAIL.COM        8
0TYLERROLLINS0@GMAIL.COM     1
1006ESP@GMAIL.COM            1
                            ..
zulenny88@yahoo.com          9
zwhitcomb12@gmail.com        4
zydecotrooper@gmail.com     16
zyuranger@yahoo.com         23
zz840_90@msn.com            15
Length: 25844, dtype: int64

In [10]:
# Summary statistics for the number of distinct breach names per email
breaches_per_email = fin_df.groupby('email')['Name'].nunique()
breaches_per_email.describe()

count    25844.000000
mean         7.266445
std          5.929493
min          1.000000
25%          3.000000
50%          6.000000
75%         10.000000
max        344.000000
Name: Name, dtype: float64

In [11]:
# Share of emails with at least one distinct breach name. fin_df comes from
# an inner merge, so only emails with a matched breach are present here.
distinct_breaches = fin_df.groupby('email')['Name'].nunique()
distinct_breaches.gt(0).mean()

1.0

In [12]:
# Count rows with a missing breach name
missing_names = fin_df['Name'].isnull()
missing_names.sum()

0

#### How many non-fabricated breaches?

https://haveibeenpwned.com/FAQs#FabricatedBreach

In [18]:
# Flag rows whose breach is not marked as fabricated; a missing flag counts as
# "not fabricated". The data is coerced to bool before negating because `~` is
# a logical NOT only on boolean data. On the int/object values that fillna(0)
# can produce it is a bitwise NOT (~0 == -1, ~1 == -2), which would silently
# corrupt the per-email sum below.
fin_df['non_fab'] = ~fin_df['IsFabricated'].fillna(False).astype(bool)

# Summary statistics for the number of non-fabricated breach rows per email
fin_df.groupby('email')['non_fab'].sum().describe()

count    25844.000000
mean         7.266445
std          5.929493
min          1.000000
25%          3.000000
50%          6.000000
75%         10.000000
max        344.000000
Name: non_fab, dtype: float64

#### Serious Dataclasses Breached 
These are data classes that either help an attacker compromise your other accounts (e.g., mother's maiden name) or expose sensitive personal information.

In [19]:
# Data-class labels treated as "serious" in this analysis (alphabetical order).
serious_dataclasses = [
    "Audio recordings",
    "Auth tokens",
    "Bank account numbers",
    "Biometric data",
    "Browser user agent details",
    "Browsing histories",
    "Chat logs",
    "Credit card CVV",
    "Credit cards",
    "Credit status information",
    "Drinking habits",
    "Driver's licenses",
    "Drug habits",
    "Email messages",
    "Encrypted keys",
    "Government issued IDs",
    "Health insurance information",
    "Historical passwords",
    "HIV statuses",
    "Login histories",
    "MAC addresses",
    "Mothers maiden names",
    "Nationalities",
    "Partial credit card data",
    "Partial dates of birth",
    "Passport numbers",
    "Password hints",
    "Passwords",
    "Personal health data",
    "Photos",
    "PINs",
    "Places of birth",
    "Private messages",
    "Security questions and answers",
    "Sexual fetishes",
    "Sexual orientations",
    "SMS messages",
    "Social security numbers",
    "Taxation records",
]

In [20]:
def _has_serious_class(data_classes):
    """Return True if any label in serious_dataclasses is contained in data_classes."""
    for label in serious_dataclasses:
        if label in data_classes:
            return True
    return False


# Mark each row whose breach exposed at least one serious data class
fin_df['serious'] = fin_df['DataClasses'].apply(_has_serious_class)

In [21]:
# Summary statistics for the number of serious-breach rows per email
serious_per_email = fin_df.groupby('email')['serious'].sum()
serious_per_email.describe()

count    25844.000000
mean         4.321003
std          4.246301
min          0.000000
25%          1.000000
50%          3.000000
75%          6.000000
max        303.000000
Name: serious, dtype: float64

In [None]:
# Number of distinct emails, as a 1-tuple shape. The column is grouped as
# 'email' because 'file_name' was renamed before the merge; grouping by the
# old name raises KeyError.
fin_df.groupby('email').size().shape