### Analysis of Everypol HIBP data (after converting the JSON files to a pandas DataFrame)

In [2]:
import os
import pandas as pd
import json

In [4]:
# Folder containing the JSON files; each file holds a list of entries that
# carry a "Name" key.
json_folder = "../data/everypol_hibp"
json_suffix = ".json"

# Read every JSON file exactly once and remember which names it lists.
# Keys are the filenames with only the trailing ".json" removed, so a
# ".json" occurring elsewhere in the filename is left intact.
names_by_file = {}
for filename in os.listdir(json_folder):
    if not filename.endswith(json_suffix):
        continue
    file_path = os.path.join(json_folder, filename)
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    names_by_file[filename[: -len(json_suffix)]] = {entry["Name"] for entry in data}

# Sorted list of every unique name seen in any file; these become the columns.
all_names = sorted(set().union(*names_by_file.values()))

# Build all rows first and construct the DataFrame in a single call.
# Growing a DataFrame with pd.concat inside a loop copies the whole frame on
# every iteration (quadratic) and triggers a FutureWarning when the first
# operand is an empty frame.
rows = [
    {"Filename": stem, **{name: name in present_names for name in all_names}}
    for stem, present_names in names_by_file.items()
]

# Passing `columns` keeps the expected header even when no JSON file is found.
df = pd.DataFrame(rows, columns=["Filename"] + all_names)

In [5]:
# Preview the first five rows of the wide table
df.head(n=5)

Unnamed: 0,Filename,123RF,2844Breaches,8fit,ABFRL,AKP,ActMobile,Acuity,Adapt,Adobe,...,Win7Vista,Yam,Yatra,YouveBeenScraped,Zomato,Zynga,bigbasket,db8151dd,iMesh,ixigo
0,sadhvisavitrib.foole@sansad.nic.in,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,helen.hayes.mp@parliament.uk,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,rmsingh@sansad.nic.in,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,solaadeyeye@yahoo.com,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,marcus.jones.mp@parliament.uk,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [8]:
# Reshape the wide table to long format: one row per (Filename, Breach) pair,
# with the original boolean cell value carried in the "Present" column.
long_df = df.melt(
    id_vars=["Filename"],
    var_name="Breach",
    value_name="Present",
)
long_df.shape

(127693, 3)

In [9]:
# Persist the wide table as CSV; the row index is not written.
csv_path = "../data/everypol_hibp.csv"
df.to_csv(csv_path, index=False)