### Analysis of Everypol (after JSON to pd)

In [1]:
import os
import pandas as pd
import json

In [3]:
json_folder = "../data/everypol/everypol_hibp"

all_names = set()

for filename in os.listdir(json_folder):
    if filename.endswith(".json"):
        file_path = os.path.join(json_folder, filename)
        with open(file_path, "r") as file:
            data = json.load(file)
            all_names.update(entry["Name"] for entry in data)

all_names = sorted(all_names)

df = pd.DataFrame(columns=["Filename"] + all_names)

for filename in os.listdir(json_folder):
    if filename.endswith(".json"):
        file_path = os.path.join(json_folder, filename)
        with open(file_path, "r") as file:
            data = json.load(file)
            present_names = {entry["Name"] for entry in data}
        
        row = {"Filename": filename.replace(".json", "")}
        row.update({name: name in present_names for name in all_names})
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

In [4]:
df.head()

Unnamed: 0,Filename,000webhost,123RF,2844Breaches,500px,8fit,ABFRL,AKP,ActMobile,Acuity,...,Zacks,Zomato,Zynga,bigbasket,db8151dd,digiDirect,eThekwiniMunicipality,iMesh,ixigo,piZap
0,wouter.raskin@dekamer.be,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,lcoffice@dphk.org,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
long_df = pd.melt(
    df,
    id_vars=['Filename'],
    var_name='Breach',
    value_name='Present'
)
long_df.shape

(1912288, 3)

In [6]:
long_df.head()

Unnamed: 0,Filename,Breach,Present
0,wouter.raskin@dekamer.be,000webhost,False
1,k.verhoeven@tweedekamer.nl,000webhost,False
2,lcoffice@dphk.org,000webhost,False
3,pk.sreemathi@sansad.nic.in,000webhost,False
4,edurubio@parlamento.gub.uy,000webhost,False


In [7]:
long_df.to_csv("../data/everypol_hibp.csv", index=False)

In [8]:
breaches = pd.read_csv("../data/hipb_01_2025_breaches_data.csv")
breaches.head()

Unnamed: 0,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
1,123RF,123RF,123rf.com,2020-03-22,2020-11-15T00:59:50Z,2020-11-15T01:07:10Z,8661578,"In March 2020, the stock photo site <a href=""h...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
2,126,126,126.com,2012-01-01,2016-10-08T07:46:05Z,2016-10-08T07:46:05Z,6414191,"In approximately 2012, it's alleged that the C...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords']",False,False,False,False,False,False,False
3,17Media,17,17app.co,2016-04-19,2016-07-08T01:55:03Z,2016-07-08T01:55:03Z,4009640,"In April 2016, customer data obtained from the...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Device information', 'Email addresses', 'IP ...",True,False,False,False,False,False,False
4,17173,17173,17173.com,2011-12-28,2018-04-28T04:53:15Z,2018-04-28T04:53:15Z,7485802,"In late 2011, <a href=""https://news.softpedia....",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords', 'Usernames']",False,False,False,False,False,False,False


In [9]:
merged_df = long_df.merge(breaches, left_on="Breach", right_on="Name", how="left")
merged_df.head()

Unnamed: 0,Filename,Breach,Present,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,wouter.raskin@dekamer.be,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
2,lcoffice@dphk.org,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False


In [10]:
boolean_columns = merged_df.select_dtypes(include=['bool'])
grouped_bool_counts = boolean_columns.groupby(df["Filename"]).sum().reset_index()
grouped_bool_counts.describe()

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
count,8537.0,8537.0,8537.0,8537.0,8537.0,8537.0,8537.0
mean,1.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
grouped_bool_counts.head()

Unnamed: 0,Filename,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,(i)joice.george@sansad.nic.in,1,0,0,0,0,0,0
1,-,1,0,0,0,0,0,0
2,1.office@bjpanda.org,1,0,0,0,0,0,0
3,101bbb@naver.com,1,0,0,0,0,0,0
4,2016kimkj@gmail.com,1,0,0,0,0,0,0


In [13]:
pol_dat = pd.read_csv("../data/everypol/everypol_combined_legislature_data.csv", low_memory=False)
pol_dat.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019


In [18]:
pol_dat.shape

(25087, 35)

In [14]:
pol_hibp = pol_dat.merge(grouped_bool_counts, left_on="email", right_on="Filename", how="left")
pol_hibp.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,pop2024,lastmod_year,Filename,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,1441720000.0,2019,shivajirao@sansad.nic.in,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,1441720000.0,2019,adhikari.deepak@sansad.nic.in,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,1441720000.0,2019,sisiradhikari76@yahoo.com,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,1441720000.0,2019,adhikari.suvendu@sansad.nic.in,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,1441720000.0,2019,yogi.adityanath@sansad.nic.in,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
pol_hibp.shape

(25087, 43)

In [16]:
pol_hibp.describe().round(3)

Unnamed: 0,row_id,leg_start_year,row_count,n_unique_emails,person_count_legistype,lastmod,pop2024,lastmod_year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
count,25087.0,25087.0,25087.0,25087.0,25087.0,25087.0,24356.0,25087.0,13074.0,13074.0,13074.0,13074.0,13074.0,13074.0,13074.0
mean,260.045,2011.919,328.801,172.195,750.701,1556683000.0,67864390.0,2018.997,1.0,0.0,0.0,0.0,0.0,0.0,0.0
std,164.707,4.832,191.351,149.959,536.158,1595232.0,210613100.0,0.054,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1997.0,31.0,31.0,31.0,1542919000.0,80341.0,2018.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,119.0,2010.0,171.0,66.0,290.0,1555978000.0,6378654.0,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,229.0,2013.0,299.0,121.0,554.0,1557278000.0,26889130.0,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,385.0,2015.0,432.0,229.0,1437.0,1557812000.0,68556800.0,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,749.0,2019.0,686.0,646.0,1783.0,1557904000.0,1441720000.0,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
columns_to_fill = ['IsVerified', 'IsFabricated', 'IsSensitive', 
                   'IsRetired', 'IsSpamList', 'IsMalware', 'IsSubscriptionFree']
pol_hibp[columns_to_fill] = pol_hibp[columns_to_fill].fillna(0)
list(pol_hibp)

['id',
 'name',
 'sort_name',
 'email',
 'twitter',
 'facebook',
 'group',
 'group_id',
 'area_id',
 'area',
 'chamber',
 'term_x',
 'start_date',
 'end_date',
 'image',
 'gender',
 'wikidata',
 'wikidata_group',
 'wikidata_area',
 'row_id',
 'country',
 'legislature',
 'term_y',
 'leg_start_year',
 'row_count',
 'n_unique_emails',
 'person_count_legistype',
 'ltype',
 'url',
 'cc',
 'leg_start_date',
 'lastmod',
 'cc3',
 'pop2024',
 'lastmod_year',
 'Filename',
 'IsVerified',
 'IsFabricated',
 'IsSensitive',
 'IsRetired',
 'IsSpamList',
 'IsMalware',
 'IsSubscriptionFree']

In [20]:
pol_hibp.groupby(['country', 'legislature', 'leg_start_year']).mean(numeric_only=True).reset_index().describe().round(3)

Unnamed: 0,leg_start_year,row_id,row_count,n_unique_emails,person_count_legistype,lastmod,pop2024,lastmod_year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
count,114.0,114.0,114.0,114.0,114.0,114.0,105.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,2012.754,348.467,214.199,116.232,478.281,1556414000.0,38874960.0,2018.982,0.564,0.0,0.0,0.0,0.0,0.0,0.0
std,4.076,187.531,157.149,109.497,442.561,2479202.0,143128200.0,0.132,0.304,0.0,0.0,0.0,0.0,0.0,0.0
min,1997.0,1.0,31.0,31.0,31.0,1542919000.0,80341.0,2018.0,0.027,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,196.5,111.0,51.25,171.75,1555979000.0,5262079.0,2019.0,0.299,0.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,365.5,170.0,73.0,292.0,1557280000.0,10319100.0,2019.0,0.527,0.0,0.0,0.0,0.0,0.0,0.0
75%,2016.0,488.75,297.0,142.25,606.0,1557718000.0,40409180.0,2019.0,0.855,0.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,749.0,686.0,646.0,1783.0,1557904000.0,1441720000.0,2019.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
