In [2]:
import pandas as pd
import janitor
import numpy as np
from IPython.display import display
from utilities import clean_email_column_no_dedupe, pandas_to_tex

pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_colwidth", None)

import sys

sys.path.append("/home/lsys/pwned_pols//venv/lib/python3.10/site-packages")
import warnings

warnings.filterwarnings("ignore")

## Prep scraped data

In [3]:
df_scraped_emails = (
    pd.read_csv("../data/scraped_pol_combined_legislature_data.csv")
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            #             "ltype",
            "legislature",
            #             "chamber",
            "year",
            "nemail_cc3",
        ]
    )
)
assert (df_scraped_emails["nemail_cc3"] >= 30).all()
df_scraped_emails.head()

Unnamed: 0,email,cc3,country,legislature,year,nemail_cc3,name,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,gender,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,sr_no,photo,contact,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate
0,anja.ninasdotter.abusland@stortinget.no,NOR,Norway,National Legislature,2025,174,"Abusland, Anja Ninasdotter",Senterpartiet,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,grunde.kreken.almeland@stortinget.no,NOR,Norway,National Legislature,2025,174,"Almeland, Grunde",Venstre,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,per-willy.amundsen@stortinget.no,NOR,Norway,National Legislature,2025,174,"Amundsen, Per-Willy",Fremskrittspartiet,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,marit.arnstad@stortinget.no,NOR,Norway,National Legislature,2025,174,"Arnstad, Marit",Senterpartiet,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,henrik.asheim@stortinget.no,NOR,Norway,National Legislature,2025,174,"Asheim, Henrik",Høyre,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Basic numbers

In [4]:
df_scraped_emails["nemail_cc3"].describe()

count    3699.000000
mean     2878.553933
std      1006.081574
min        86.000000
25%      3251.000000
50%      3251.000000
75%      3251.000000
max      3251.000000
Name: nemail_cc3, dtype: float64

In [5]:
df_scraped_emails["legislature"].unique()

array(['National Legislature', 'Bihar Legislature',
       'Tamil Nadu State Legislature', 'UP State Legislature',
       'HP Legislature', 'Delhi Legislature'], dtype=object)

In [6]:
df_scraped_emails["email"].nunique()

3697

In [7]:
df_scraped_emails["cc3"].nunique()

4

In [8]:
df_scraped_emails.groupby(["cc3", "legislature"]).ngroups

9

## Prep EP data

In [9]:
df_ep_emails = (
    pd.read_csv(
        "../data/everypol/everypol_combined_legislature_data.csv", low_memory=False
    )
    .sort_values(["cc3", "leg_start_year", "email"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    #     .dropna(subset=["email"])
    #     .query("email!='-'")
    #     .query("email!='No tiene'")
    # ================================================================
    # Fix missing cc3 for Wales/Scotland
    # Wales, Scotland = GBR
    .assign(
        cc3=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", "GBR", df_["cc3"]),
        )
    )
    # ================================================================
    # Fix missing pop for Wales/Scotland
    .assign(
        pop2024=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", 68556800, df_["pop2024"]),
        )
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    # Perc female
    .assign(
        female_count_cc3=lambda df_: df_.groupby("cc3")["gender"].transform(
            lambda g: (g == "female").sum()
        ),
        female_prop_cc3=lambda df_: df_["female_count_cc3"] / df_["nemail_cc3"],
    )
    # ================================================================
    # Get indicator for years and chambers
    .assign(
        years=lambda df_: df_.groupby("cc3")["leg_start_year"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
        chambers=lambda df_: df_.groupby("cc3")["ltype"]
        .transform(lambda x: ", ".join(map(str, sorted(x.unique()))))
        .replace("legislature", "", regex=True)
        .replace("house", "", regex=True)
        .str.strip()
        .str.title(),
        legislatures=lambda df_: df_.groupby("cc3")["legislature"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
    )
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
            "gender",
        ]
    )
)

assert (df_ep_emails["female_count_cc3"] <= df_ep_emails["nemail_cc3"]).all()
assert (df_ep_emails["female_prop_cc3"] <= 1).all()
assert (df_ep_emails["nemail_cc3"] >= 30).all()
display(df_ep_emails.head(3))
df_ep_emails.info()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year,female_count_cc3,female_prop_cc3,years,chambers,legislatures
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019,12,0.085714,"2009, 2013, 2017",Unicameral,Kuvendi
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019,12,0.085714,"2009, 2013, 2017",Unicameral,Kuvendi
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019,12,0.085714,"2009, 2013, 2017",Unicameral,Kuvendi


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8512 entries, 0 to 8511
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   email                   8512 non-null   object 
 1   cc3                     8512 non-null   object 
 2   country                 8512 non-null   object 
 3   ltype                   8512 non-null   object 
 4   legislature             8512 non-null   object 
 5   chamber                 8512 non-null   object 
 6   leg_start_year          8512 non-null   int64  
 7   nemail_cc3              8512 non-null   int64  
 8   gender                  7300 non-null   object 
 9   id                      8512 non-null   object 
 10  name                    8512 non-null   object 
 11  sort_name               8512 non-null   object 
 12  twitter                 2393 non-null   object 
 13  facebook                1597 non-null   object 
 14  group                   8510 non-null   

In [10]:
# countries with incomplete gender coverage
for c in df_ep_emails["cc3"].unique().tolist():
    _df = df_ep_emails.query(f"cc3=='{c}'")
    n_gender = len(_df.dropna(subset=["gender"]))
    n_country = len(_df)
    if n_gender < n_country:
        name = _df.reset_index().loc[0, "country"]
        print(f"{name} ({c}):\n {n_gender}, {n_country}")

Albania (ALB):
 61, 140
Armenia (ARM):
 32, 119
Bulgaria (BGR):
 186, 206
Cameroon (CMR):
 81, 109
Colombia (COL):
 55, 169
Estonia (EST):
 95, 101
Georgia (GEO):
 49, 145
Guernsey (GGY):
 17, 39
Guatemala (GTM):
 143, 152
Hungary (HUN):
 171, 184
Iran (IRN):
 51, 138
Italy (ITA):
 284, 314
South-Korea (KOR):
 238, 253
Moldova (MDA):
 43, 44
Macedonia (MKD):
 95, 101
Namibia (NAM):
 12, 72
Nigeria (NGA):
 107, 140
Nicaragua (NIC):
 2, 85
Nepal (NPL):
 47, 268
Papua-New-Guinea (PNG):
 55, 56
Rwanda (RWA):
 1, 76
Slovakia (SVK):
 116, 164
Seychelles (SYC):
 0, 32
Tanzania (TZA):
 384, 405
Uruguay (URY):
 108, 119
South-Africa (ZAF):
 371, 385


In [11]:
set_edomain = set()
for _, row in df_ep_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

481

### Basic numbers

In [12]:
df_ep_emails["nemail_cc3"].describe()

count    8512.000000
mean      284.731203
std       222.100304
min        31.000000
25%       140.000000
50%       208.000000
75%       405.000000
max       835.000000
Name: nemail_cc3, dtype: float64

In [13]:
df_ep_emails["ltype"].unique()

array(['unicameral legislature', 'lower house', 'upper house'],
      dtype=object)

In [14]:
df_ep_emails["legislature"].unique()

array(['Kuvendi', 'Consell General', 'National Assembly',
       'House of Representatives', 'Senate', 'Chamber of Representatives',
       'Parliament', 'House of Commons', 'Assemblée Nationale',
       'Cámara de Representantes', 'Folketing', 'Riigikogu', 'Eduskunta',
       'Parliament of Georgia', 'States', 'Hellenic Parliament',
       'Inatsisartut', 'Congress', 'Legislative Council', 'Országgyűlés',
       'Lok Sabha', 'Majles', 'Chamber of Deputies', 'Parlament',
       'Sobranie', 'National Council', 'Tweede Kamer',
       'Constituent Assembly', 'New Zealand Parliament',
       'National Parliament', 'Assembly', 'House of Assembly',
       'Scottish Parliament', 'National Assembly for Wales'], dtype=object)

In [15]:
df_ep_emails["legislature"].nunique()

34

In [16]:
df_ep_emails["id"].nunique()

8512

In [17]:
df_ep_emails["email"].nunique()

8512

In [18]:
df_ep_emails["cc3"].nunique()

55

In [19]:
df_ep_emails.groupby(["cc3", "legislature"]).ngroups

61

In [20]:
df_ep_emails["female_prop_cc3"].describe()

count    8512.000000
mean        0.228148
std         0.127742
min         0.000000
25%         0.117647
50%         0.250000
75%         0.337725
max         0.543860
Name: female_prop_cc3, dtype: float64

In [169]:
# total
df_scraped_emails["email"].nunique() + df_ep_emails["email"].nunique()

12209

In [172]:
# minus denmark and IND LS common
(
    df_ep_emails.groupby(["cc3", "legislature"]).ngroups
    + df_scraped_emails.groupby(["cc3", "legislature"]).ngroups
    - 2
)

68

In [174]:
df_ep_emails["cc3"].nunique() + df_scraped_emails["cc3"].nunique() - 2

57

## Table1

In [21]:
df_tab1 = (
    df_ep_emails.drop_duplicates(["cc3"], ignore_index=True)
    #     .filter(["cc3", "country", "nemail_cc3", "female_prop_cc3", "years", "chambers", "legislatures", "pop2024"])
    #     .assign(female_prop_cc3=lambda df_: (100 * df_["female_prop_cc3"]).round(1).astype(str) + "\\%")
    .filter(
        ["cc3", "country", "nemail_cc3", "years", "chambers", "legislatures", "pop2024"]
    )
    .assign(
        pop2024=lambda df_: (df_["pop2024"].astype(float) / 1_000_000)
        .round(1)
        .apply(str)
    )
    .replace("Bosnia-and-Herzegovina", "Bosnia")
    .replace(
        "House of Commons, National Assembly for Wales, Scottish Parliament",
        "Commons, Senedd, Scottish Parliament",
    )
    .replace("1997, 2001, 2005, 2007, 2010, 2011, 2015, 2016, 2017", "1997--2017")
    .replace("2004, 2007, 2010, 2013, 2016", "2004--2016")
    .replace("2001, 2005, 2007, 2011, 2015", "2001--2015")
    .replace("2004, 2007, 2009, 2012, 2015", "2004--2015")
    .replace("Lower ", "Lower", regex=True)
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_tab1

Unnamed: 0,ix,cc3,country,nemail_cc3,years,chambers,legislatures,pop2024
0,1,ALB,Albania,140,"2009, 2013, 2017",Unicameral,Kuvendi,2.7
1,2,AND,Andorra,31,2015,Unicameral,Consell General,0.1
2,3,ARM,Armenia,119,2019,Unicameral,National Assembly,2.8
3,4,AUS,Australia,177,2004--2016,"Lower, Upper","House of Representatives, Senate",26.9
4,5,BEL,Belgium,149,2014,Lower,Chamber of Representatives,11.9
5,6,BGR,Bulgaria,206,"2013, 2014, 2017",Unicameral,National Assembly,6.4
6,7,BIH,Bosnia,42,2014,Lower,House of Representatives,3.2
7,8,BLR,Belarus,59,2016,Unicameral,House of Representatives,9.1
8,9,BMU,Bermuda,33,2017,Lower,Parliament,
9,10,BTN,Bhutan,48,2013,Lower,National Assembly,0.8


In [22]:
df_tab1["pop2024"].astype(float).sum()

np.float64(2608.4)

In [23]:
# % coverage of global pop
100 * (df_tab1["pop2024"].astype(float).sum()) / 8000

np.float64(32.605)

In [24]:
pandas_to_tex(
    df_tab1, "../tables/hipb_ep_emailcoverage_summary.tex", index=False, na_rep="---"
)

In [25]:
!cat ../tables/hipb_ep_emailcoverage_summary.tex

\midrule
1 & ALB & Albania & 140 & 2009, 2013, 2017 & Unicameral & Kuvendi & 2.7 \\
2 & AND & Andorra & 31 & 2015 & Unicameral & Consell General & 0.1 \\
3 & ARM & Armenia & 119 & 2019 & Unicameral & National Assembly & 2.8 \\
4 & AUS & Australia & 177 & 2004--2016 & Lower, Upper & House of Representatives, Senate & 26.9 \\
5 & BEL & Belgium & 149 & 2014 & Lower & Chamber of Representatives & 11.9 \\
6 & BGR & Bulgaria & 206 & 2013, 2014, 2017 & Unicameral & National Assembly & 6.4 \\
7 & BIH & Bosnia & 42 & 2014 & Lower & House of Representatives & 3.2 \\
8 & BLR & Belarus & 59 & 2016 & Unicameral & House of Representatives & 9.1 \\
9 & BMU & Bermuda & 33 & 2017 & Lower & Parliament & nan \\
10 & BTN & Bhutan & 48 & 2013 & Lower & National Assembly & 0.8 \\
11 & CAN & Canada & 432 & 2011, 2015 & Lower, Upper & House of Commons, Senate & 40.4 \\
12 & CMR & Cameroon & 109 & 2013 & Lower & Assemblée Nationale & 29.4 \\
13 & COL & Colombia & 169 & 2014, 2018 & Lower & Cámara 

## Merge to HIBP

### Scraped data

In [33]:
df_scraped_email_breach_expanded = (
    df_scraped_emails.filter(["email", "cc3", "country", "year", "nemail_cc3"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    # ============================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/scraped_pol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    )
    .dropna(subset=["breach"])
    # ============================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ============================================================================
    .query("present==True")
    .reset_index(drop=True)
)
df_scraped_email_breach_expanded.head()

Unnamed: 0,email,cc3,country,year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Places of birth,Car ownership statuses,Drinking habits,Security questions and answers,Loyalty program details,Home ownership statuses,Mothers maiden names,Instant messenger identities,Mnemonic phrases,Partial credit card data,Eating habits,Relationship statuses,Customer interactions,Clothing sizes,Vehicle identification numbers (VINs),Deceased statuses,Address book contacts,Drug habits,Credit card CVV,Age groups,Loan information,Physical attributes,IMSI numbers,Usernames,Social connections,HIV statuses,Time zones,Job titles,Email messages,Parenting plans,Fitness levels,Net worths,Purchases,Astrological signs,Customer feedback,Family members' names,Names,Social security numbers,Historical passwords,Job applications,Religions,Races,Personal interests,Passport numbers,Employers,Marital statuses,Work habits,MAC addresses,Occupations,Social media profiles,Delivery instructions,Utility bills,Ages,IP addresses,Payment histories,Vehicle details,Travel habits,Homepage URLs,...,User statuses,Flights taken,Financial transactions,Partial phone numbers,Taxation records,PINs,Nicknames,Salutations,Survey results,Personal health data,Income levels,Telecommunications carrier,Comments,Device usage tracking data,Audio recordings,Login histories,Government issued IDs,Device serial numbers,Genders,Tattoo status,School grades (class levels),Political views,Employment statuses,Cellular network names,SMS messages,Family structure,Account balances,Credit cards,Spoken languages,Career levels,Avatars,Encrypted keys,Apps installed on devices,Device information,Political donations,Phone numbers,Nationalities,Driver's licenses,Bios,Ethnicities,Credit status information,Geographic locations,Photos,Email addresses,Website activity,Dates of birth,Charitable donations,Living costs,Payment methods,Beauty ratings,Sexual orientations,Private messages,Password hints,Sexual fetishes,Smoking habits,Personal descriptions,Browser user agent details,Bank account numbers,Reward program balances,Citizenship statuses,Profile photos,Cryptocurrency wallet addresses,Passwords,Purchasing habits,Browsing histories,Deceased date,Years of professional experience,Buying preferences,Licence plates,Chat logs,User website URLs,Financial investments,Password strengths,Spouses names,Support tickets
0,per-willy.amundsen@stortinget.no,NOR,Norway,2025,174,Cit0day,True,226883414.0,False,False,False,False,False,False,False,0.041096,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,henrik.asheim@stortinget.no,NOR,Norway,2025,174,Foodora,True,582578.0,True,False,False,False,False,False,False,4.153425,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,jorodd.asphjell@stortinget.no,NOR,Norway,2025,174,Cit0day,True,226883414.0,False,False,False,False,False,False,False,0.041096,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,jorodd.asphjell@stortinget.no,NOR,Norway,2025,174,DemandScience,True,121796165.0,True,False,False,False,False,False,False,0.709589,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,jorodd.asphjell@stortinget.no,NOR,Norway,2025,174,LinkedInScrape,True,125698496.0,True,False,False,False,False,False,False,0.484932,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### EP data

In [28]:
df_ep_email_breach_expanded = (
    # All EP emails
    df_ep_emails.filter(
        [
            "email",
            "gender",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
        ]
    )
    # ============================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/everypol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    )
    .dropna(subset=["breach"])
    # ============================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ============================================================================
    .query("present==True")
    .reset_index(drop=True)
)
df_ep_email_breach_expanded.head()

Unnamed: 0,email,gender,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Places of birth,Car ownership statuses,Drinking habits,Security questions and answers,Loyalty program details,Home ownership statuses,Mothers maiden names,Instant messenger identities,Mnemonic phrases,Partial credit card data,Eating habits,Relationship statuses,Customer interactions,Clothing sizes,Vehicle identification numbers (VINs),Deceased statuses,Address book contacts,Drug habits,Credit card CVV,Age groups,Loan information,Physical attributes,IMSI numbers,Usernames,Social connections,HIV statuses,Time zones,Job titles,Email messages,Parenting plans,Fitness levels,Net worths,Purchases,Astrological signs,Customer feedback,Family members' names,Names,Social security numbers,Historical passwords,Job applications,Religions,Races,Personal interests,Passport numbers,Employers,Marital statuses,Work habits,MAC addresses,Occupations,Social media profiles,Delivery instructions,Utility bills,Ages,IP addresses,...,User statuses,Flights taken,Financial transactions,Partial phone numbers,Taxation records,PINs,Nicknames,Salutations,Survey results,Personal health data,Income levels,Telecommunications carrier,Comments,Device usage tracking data,Audio recordings,Login histories,Government issued IDs,Device serial numbers,Genders,Tattoo status,School grades (class levels),Political views,Employment statuses,Cellular network names,SMS messages,Family structure,Account balances,Credit cards,Spoken languages,Career levels,Avatars,Encrypted keys,Apps installed on devices,Device information,Political donations,Phone numbers,Nationalities,Driver's licenses,Bios,Ethnicities,Credit status information,Geographic locations,Photos,Email addresses,Website activity,Dates of birth,Charitable donations,Living costs,Payment methods,Beauty ratings,Sexual orientations,Private messages,Password hints,Sexual fetishes,Smoking habits,Personal descriptions,Browser user agent details,Bank account numbers,Reward program balances,Citizenship statuses,Profile photos,Cryptocurrency wallet addresses,Passwords,Purchasing habits,Browsing histories,Deceased date,Years of professional experience,Buying preferences,Licence plates,Chat logs,User website URLs,Financial investments,Password strengths,Spouses names,Support tickets
0,edmond.spaho@parlament.al,male,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,Nitro,True,77159696.0,True,False,False,False,False,False,False,0.309589,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,pepbardina@gmail.com,male,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,MyHeritage,True,91991358.0,True,False,False,False,False,False,False,1.320548,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,rgilicasals@gmail.com,female,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,Dailymotion,True,85176234.0,True,False,False,False,False,False,False,0.79726,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,rgilicasals@gmail.com,female,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,Dropbox,True,68648009.0,True,False,False,False,False,False,False,4.169863,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,rgilicasals@gmail.com,female,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,Evite,True,100985047.0,True,False,False,False,False,False,False,5.926027,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Manually check

In [None]:
# non_breached_emails = []
# for pol in df["email"].unique():
#     if pol not in df_ep_email_breach_expanded["email"].unique().tolist():
#         non_breached_emails.append(pol)

In [None]:
# import random
# random.seed(42)

# # Sample 10 unique emails
# sampled_emails = random.sample(non_breached_emails, 20)
# sampled_emails

Paste (no breach) 

* g.akriotis@parliament.gr

### Combine

In [137]:
LIST_SERIOUS_DATACLASSES = [
    "Audio recordings",
    "Auth tokens",
    "Bank account numbers",
    "Biometric data",
    "Browsing histories",
    "Chat logs",
    "Credit card CVV",
    "Credit cards",
    "Credit status information",
    "Drinking habits",
    "Driver's licenses",
    "Drug habits",
    "Email messages",
    "Encrypted keys",
    "Government issued IDs",
    "Health insurance information",
    "Historical passwords",
    "HIV statuses",
    "Login histories",
    "MAC addresses",
    "Mothers maiden names",
    "Nationalities",
    "Partial credit card data",
    "Partial dates of birth",
    "Passport numbers",
    "Password hints",
    "Passwords",
    "Personal health data",
    "Photos",
    "PINs",
    "Places of birth",
    "Private messages",
    "Security questions and answers",
    "Sexual fetishes",
    "Sexual orientations",
    "SMS messages",
    "Social security numbers",
    "Taxation records",
]

In [176]:
df_email_breach_expanded = (
    pd.concat(
        [
            df_ep_email_breach_expanded.remove_columns(
                ["gender", "country", "ltype", "chamber", "legislature"]
            ).rename_column("leg_start_year", "year"),
            df_scraped_email_breach_expanded.remove_columns(["country"]),
        ],
        ignore_index=True,
    )
    .fillna(0)
    .assign(
        seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES].max(axis=1).astype(int)
    )
)
df_email_breach_expanded

Unnamed: 0,email,cc3,year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Places of birth,Car ownership statuses,Drinking habits,Security questions and answers,Loyalty program details,Home ownership statuses,Mothers maiden names,Instant messenger identities,Mnemonic phrases,Partial credit card data,Eating habits,Relationship statuses,Customer interactions,Clothing sizes,Vehicle identification numbers (VINs),Deceased statuses,Address book contacts,Drug habits,Credit card CVV,Age groups,Loan information,Physical attributes,IMSI numbers,Usernames,Social connections,HIV statuses,Time zones,Job titles,Email messages,Parenting plans,Fitness levels,Net worths,Purchases,Astrological signs,Customer feedback,Family members' names,Names,Social security numbers,Historical passwords,Job applications,Religions,Races,Personal interests,Passport numbers,Employers,Marital statuses,Work habits,MAC addresses,Occupations,Social media profiles,Delivery instructions,Utility bills,Ages,IP addresses,Payment histories,Vehicle details,Travel habits,Homepage URLs,Recovery email addresses,...,Flights taken,Financial transactions,Partial phone numbers,Taxation records,PINs,Nicknames,Salutations,Survey results,Personal health data,Income levels,Telecommunications carrier,Comments,Device usage tracking data,Audio recordings,Login histories,Government issued IDs,Device serial numbers,Genders,Tattoo status,School grades (class levels),Political views,Employment statuses,Cellular network names,SMS messages,Family structure,Account balances,Credit cards,Spoken languages,Career levels,Avatars,Encrypted keys,Apps installed on devices,Device information,Political donations,Phone numbers,Nationalities,Driver's licenses,Bios,Ethnicities,Credit status information,Geographic locations,Photos,Email addresses,Website activity,Dates of birth,Charitable donations,Living costs,Payment methods,Beauty ratings,Sexual orientations,Private messages,Password hints,Sexual fetishes,Smoking habits,Personal descriptions,Browser user agent details,Bank account numbers,Reward program balances,Citizenship statuses,Profile photos,Cryptocurrency wallet addresses,Passwords,Purchasing habits,Browsing histories,Deceased date,Years of professional experience,Buying preferences,Licence plates,Chat logs,User website URLs,Financial investments,Password strengths,Spouses names,Support tickets,seriousbreach
0,edmond.spaho@parlament.al,ALB,2009,140,Nitro,True,77159696.0,True,False,False,False,False,False,False,0.309589,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,pepbardina@gmail.com,AND,2015,31,MyHeritage,True,91991358.0,True,False,False,False,False,False,False,1.320548,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,rgilicasals@gmail.com,AND,2015,31,Dailymotion,True,85176234.0,True,False,False,False,False,False,False,0.797260,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,rgilicasals@gmail.com,AND,2015,31,Dropbox,True,68648009.0,True,False,False,False,False,False,False,4.169863,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,rgilicasals@gmail.com,AND,2015,31,Evite,True,100985047.0,True,False,False,False,False,False,False,5.926027,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12116,ajaymahawar20@gmail.com,IND,2025,3251,ExploitIn,True,593427119.0,False,False,False,False,False,False,False,0.561644,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
12117,ajaymahawar20@gmail.com,IND,2025,3251,RailYatri,True,23209732.0,True,False,False,False,False,False,False,0.942466,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
12118,ajaymahawar20@gmail.com,IND,2025,3251,Twitter200M,True,211524284.0,True,False,False,False,False,False,False,2.010959,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
12119,gopalrai1975@gmail.com,IND,2025,3251,db8151dd,True,22802117.0,True,False,False,False,False,False,False,0.232877,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### Basic numbers

In [56]:
# pols involved in 559 data breaches
df_email_breach_expanded["breach"].nunique()

559

In [57]:
# 3501 pols have been breached
n_pwned_pols = df_email_breach_expanded["email"].nunique()
n_pwned_pols

3501

In [61]:
100 * n_pwned_pols / (
    df_ep_emails["email"].nunique() + df_scraped_emails["email"].nunique()
)

28.675567204521254

In [70]:
# 2244 pols have been breached multiple times
n_multi_pwned_pols = (
    df_email_breach_expanded.groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols

2244

In [71]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols / n_pwned_pols)

64.09597257926308

In [74]:
# Proportion of total
100 * n_multi_pwned_pols / (
    df_ep_emails["email"].nunique() + df_scraped_emails["email"].nunique()
)

18.379883692358096

In [84]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded.query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [78]:
100 * _n / n_pwned_pols

65.89545844044558

In [82]:
# How many had serious breaches
df_email_breach_expanded.query("seriousbreach==1")["email"].nunique()

2373

In [85]:
# How many had multiple serious breaches
(
    df_email_breach_expanded.query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)

1247

### Tabulate: Compromised data types

In [86]:
df_pwnpol_datatype = (
    df_email_breach_expanded.query("present==True")
    # ===================================================
    # Filter data types and remove duplicate breach info
    .set_index("email")
    .iloc[:, 20:]
    .reset_index()
    .drop_duplicates(ignore_index=True)
    # ===================================================
    # Long by email-datatype
    .melt(id_vars=["email"], var_name="datatype", value_name="present")
    .groupby(["email", "datatype"])["present"]
    .sum()
    .reset_index()
    .assign(present=lambda df_: np.where(df_["present"] > 0, 1, 0))
    # ===================================================
    # Back to wide by email
    .pivot(index="email", columns="datatype", values="present")
    # ===================================================
    .T.sum(axis=1)
    .reset_index(name="count")
    .sort_values(["count", "datatype"], ascending=[False, True], ignore_index=True)
    .assign(percent=lambda df_: 100 * df_["count"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}%"))
)
df_pwnpol_datatype

Unnamed: 0,datatype,count,percent
0,Email addresses,3494,99.8%
1,Names,2962,84.6%
2,Phone numbers,2710,77.4%
3,Job titles,2460,70.3%
4,seriousbreach,2373,67.8%
5,Passwords,2308,65.9%
6,Social media profiles,2289,65.4%
7,Physical addresses,2267,64.8%
8,Geographic locations,1776,50.7%
9,Employers,1582,45.2%


In [87]:
df_datatype1 = (
    df_pwnpol_datatype.iloc[:40]
    .reset_index(drop=True)
    .assign(ix=range(1, 41))
    .astype(str)
    .reorder_columns(["ix"])
)
df_datatype2 = (
    df_pwnpol_datatype.iloc[40:74]
    .reset_index(drop=True)
    .assign(ix=range(41, 75))
    .astype(str)
    .reorder_columns(["ix"])
)
df2tex = pd.concat([df_datatype1, df_datatype2], axis=1)
# print(df2tex.to_latex(na_rep="", index=False))
df2tex

Unnamed: 0,ix,datatype,count,percent,ix.1,datatype.1,count.1,percent.1
0,1,Email addresses,3494,99.8%,41.0,Survey results,30.0,0.9%
1,2,Names,2962,84.6%,42.0,Bank account numbers,29.0,0.8%
2,3,Phone numbers,2710,77.4%,43.0,Payment histories,28.0,0.8%
3,4,Job titles,2460,70.3%,44.0,Nationalities,27.0,0.8%
4,5,seriousbreach,2373,67.8%,45.0,Telecommunications carrier,25.0,0.7%
5,6,Passwords,2308,65.9%,46.0,Deceased statuses,18.0,0.5%
6,7,Social media profiles,2289,65.4%,47.0,Private messages,18.0,0.5%
7,8,Physical addresses,2267,64.8%,48.0,Relationship statuses,17.0,0.5%
8,9,Geographic locations,1776,50.7%,49.0,Professional skills,14.0,0.4%
9,10,Employers,1582,45.2%,50.0,Website activity,14.0,0.4%


In [88]:
pandas_to_tex(
    df2tex, "../tables/hipb_pwnpols_datatypes.tex", na_rep="", index=False, escape=True
)

!cat "../tables/hipb_pwnpols_datatypes.tex"

\midrule
1 & Email addresses & 3494 & 99.8\% & 41 & Survey results & 30 & 0.9\% \\
2 & Names & 2962 & 84.6\% & 42 & Bank account numbers & 29 & 0.8\% \\
3 & Phone numbers & 2710 & 77.4\% & 43 & Payment histories & 28 & 0.8\% \\
4 & Job titles & 2460 & 70.3\% & 44 & Nationalities & 27 & 0.8\% \\
5 & seriousbreach & 2373 & 67.8\% & 45 & Telecommunications carrier & 25 & 0.7\% \\
6 & Passwords & 2308 & 65.9\% & 46 & Deceased statuses & 18 & 0.5\% \\
7 & Social media profiles & 2289 & 65.4\% & 47 & Private messages & 18 & 0.5\% \\
8 & Physical addresses & 2267 & 64.8\% & 48 & Relationship statuses & 17 & 0.5\% \\
9 & Geographic locations & 1776 & 50.7\% & 49 & Professional skills & 14 & 0.4\% \\
10 & Employers & 1582 & 45.2\% & 50 & Website activity & 14 & 0.4\% \\
11 & Genders & 1478 & 42.2\% & 51 & Credit cards & 13 & 0.4\% \\
12 & Dates of birth & 1398 & 39.9\% & 52 & Support tickets & 10 & 0.3\% \\
13 & IP addresses & 1379 & 39.4\% & 53 & Passport numbers & 9 & 0.3\% \\
1

### Tabulate: Breaches

In [177]:
LIST_ALL_DATACLASSES = df_email_breach_expanded.iloc[:, 16:-1].columns.tolist()

In [239]:
df_pwnpol_breach_incident = (
    df_email_breach_expanded
    # ===================================================
    .groupby(["breach"])["email"]
    .size()
    .reset_index()
    .rename_column("email", "emails")
    .sort_values("emails", ascending=False, ignore_index=True)
    .head(50)
    .assign(percent=lambda df_: 100 * df_["emails"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}\%"))
    # ===================================================
    # Merge back to get breach characteristics
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
            .assign(
                seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES]
                .max(axis=1)
                .astype(int)
            )
            .assign(
                seriousbreach=lambda df_: np.where(
                    df_["seriousbreach"] == 1, r"\checkmark", ""
                )
            )
            #             .filter(["breach", "breachdate", "addeddate", "yearstopublic", "n_dataclasses", "seriousbreach", *LIST_ALL_DATACLASSES])
            .filter(
                [
                    "breach",
                    "breachdate",
                    "addeddate",
                    "yearstopublic",
                    "pwncount",
                    "n_dataclasses",
                    "seriousbreach",
                ]
            )
        ),
        how="left",
        on="breach",
        validate="1:1",
    )
    # ===================================================
    # Remove timestamps
    .assign(
        breachdate=lambda df_: pd.to_datetime(df_["breachdate"]).dt.date,
        addeddate=lambda df_: pd.to_datetime(df_["addeddate"]).dt.date,
    )
    .assign(pwncount=lambda df_: df_["pwncount"] / 1_000_000)
    .round(1)
    .astype(str)
    .assign(pwncount=lambda df_: df_["pwncount"] + "M")
    .assign(yearstopublic=lambda df_: df_["yearstopublic"] + " years")
    # ===================================================
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_pwnpol_breach_incident

Unnamed: 0,ix,breach,emails,percent,breachdate,addeddate,yearstopublic,pwncount,n_dataclasses,seriousbreach
0,1,db8151dd,1471,42.0\%,2020-02-20,2020-05-15,0.2 years,22.8M,6,
1,2,OnlinerSpambot,1157,33.0\%,2017-08-28,2017-08-29,0.0 years,711.5M,2,\checkmark
2,3,PDL,954,27.2\%,2019-10-16,2019-11-22,0.1 years,622.2M,7,
3,4,VerificationsIO,927,26.5\%,2019-02-25,2019-03-09,0.0 years,763.1M,10,
4,5,LinkedIn,447,12.8\%,2012-05-05,2016-05-21,4.0 years,164.6M,2,\checkmark
5,6,LinkedInScrape,328,9.4\%,2021-04-08,2021-10-02,0.5 years,125.7M,7,
6,7,Apollo,298,8.5\%,2018-07-23,2018-10-05,0.2 years,125.9M,8,
7,8,Intelimost,279,8.0\%,2019-03-10,2019-04-02,0.1 years,3.1M,2,\checkmark
8,9,Twitter200M,262,7.5\%,2021-01-01,2023-01-05,2.0 years,211.5M,4,
9,10,Cit0day,247,7.1\%,2020-11-04,2020-11-19,0.0 years,226.9M,2,\checkmark


In [240]:
pandas_to_tex(
    df_pwnpol_breach_incident.head(25),
    "../tables/hipb_pwnpols_breach_incidents.tex",
    escape=False,
)

In [241]:
!cat ../tables/hipb_pwnpols_breach_incidents.tex

\midrule
1 & db8151dd & 1471 & 42.0\% & 2020-02-20 & 2020-05-15 & 0.2 years & 22.8M & 6 &  \\
2 & OnlinerSpambot & 1157 & 33.0\% & 2017-08-28 & 2017-08-29 & 0.0 years & 711.5M & 2 & \checkmark \\
3 & PDL & 954 & 27.2\% & 2019-10-16 & 2019-11-22 & 0.1 years & 622.2M & 7 &  \\
4 & VerificationsIO & 927 & 26.5\% & 2019-02-25 & 2019-03-09 & 0.0 years & 763.1M & 10 &  \\
5 & LinkedIn & 447 & 12.8\% & 2012-05-05 & 2016-05-21 & 4.0 years & 164.6M & 2 & \checkmark \\
6 & LinkedInScrape & 328 & 9.4\% & 2021-04-08 & 2021-10-02 & 0.5 years & 125.7M & 7 &  \\
7 & Apollo & 298 & 8.5\% & 2018-07-23 & 2018-10-05 & 0.2 years & 125.9M & 8 &  \\
8 & Intelimost & 279 & 8.0\% & 2019-03-10 & 2019-04-02 & 0.1 years & 3.1M & 2 & \checkmark \\
9 & Twitter200M & 262 & 7.5\% & 2021-01-01 & 2023-01-05 & 2.0 years & 211.5M & 4 &  \\
10 & Cit0day & 247 & 7.1\% & 2020-11-04 & 2020-11-19 & 0.0 years & 226.9M & 2 & \checkmark \\
11 & Collection1 & 241 & 6.9\% & 2019-01-07 & 2019-01-16 & 0.0 years & 772.9M 