### Concat all HIBP and Analyze

In [1]:
import glob
import os
import pandas as pd

In [2]:
# Directory path containing JSON files
directory = 'pwned/'

# Get list of JSON file paths in the directory
json_files = glob.glob(directory + '/*.json')

# List to store individual DataFrames
dataframes = []

# Iterate over the JSON files
for file in json_files:
    # Read JSON file into a DataFrame
    df = pd.read_json(file)
    # Append DataFrame to the list
    file_name = os.path.basename(file).rstrip('.json')
    df['file_name'] = file_name

    dataframes.append(df)

# Concatenate the list of DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

In [3]:
combined_df

Unnamed: 0,Name,file_name
0,PDL,coxcats1@hotmail.com
1,NotAcxiom,coxcats1@hotmail.com
2,Exactis,Nealfamily5@aol.com
3,LeadHunter,Nealfamily5@aol.com
4,Luxottica,Nealfamily5@aol.com
...,...,...
121640,Scentbird,KHARRISON51@GMAIL.COM
121641,RiverCityMedia,DEVARNEYKENNETH@GMAIL.COM
121642,Houzz,bjdanford@live.com
121643,Cit0day,3.MUNOZ@GMAIL.COM


In [4]:
breaches = pd.read_json('breaches.json')

In [5]:
fin_df = pd.merge(combined_df, breaches, on='Name')
fin_df.shape

(121645, 17)

In [6]:
fin_df.groupby('file_name')['Name'].nunique().describe()

count    16727.000000
mean         7.272374
std          5.573963
min          1.000000
25%          3.000000
50%          6.000000
75%         10.000000
max         59.000000
Name: Name, dtype: float64

#### Let's filter out where breach is supposedly 'fabricated'
https://haveibeenpwned.com/FAQs#FabricatedBreach

In [13]:
fin_df[fin_df['IsFabricated']==False].groupby('file_name')['Name'].nunique().describe()

count    16727.000000
mean         7.272374
std          5.573963
min          1.000000
25%          3.000000
50%          6.000000
75%         10.000000
max         59.000000
Name: Name, dtype: float64

#### Serious Dataclasses Breached 
Either data that will help you hack your other accounts, e.g., Mother's Maiden Name, or sensitive personal information

In [9]:
serious_dataclasses = ["Audio recordings",
"Auth tokens",
"Bank account numbers",
"Biometric data",
"Browser user agent details",
"Browsing histories",
"Chat logs",
"Credit card CVV",
"Credit cards",
"Credit status information",
"Drinking habits",
"Driver's licenses",
"Drug habits",
"Email messages",
"Encrypted keys",
"Government issued IDs",
"Health insurance information",
"Historical passwords",
"HIV statuses",
"Login histories",
"MAC addresses",
"Mothers maiden names",
"Nationalities",
"Partial credit card data",
"Partial dates of birth",
"Passport numbers",
"Password hints",
"Passwords",
"Personal health data",
"Photos",
"PINs",
"Places of birth",
"Private messages",
"Security questions and answers",
"Sexual fetishes",
"Sexual orientations",
"SMS messages",
"Social security numbers",
"Taxation records"]

In [10]:
fin_df['serious'] = fin_df['DataClasses'].apply(lambda x: any(string in x for string in serious_dataclasses))

In [14]:
fin_df[fin_df['serious']==True].groupby('file_name')['Name'].nunique().describe()

count    15277.000000
mean         4.709302
std          3.761341
min          1.000000
25%          2.000000
50%          4.000000
75%          7.000000
max         53.000000
Name: Name, dtype: float64